From 3c47c2ccd5a29c78780ccfd0227a805f3873ab1c Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 14 Jan 2025 07:35:08 -0800 Subject: [PATCH 001/989] nvmet: fix rw control endian access Fixes: 3ec5c62cfcf060e ("nvmet: handle rw's limited retry flag") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501142128.WexgyMTv-lkp@intel.com/ Cc: Guixin Liu Signed-off-by: Keith Busch --- drivers/nvme/target/io-cmd-bdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 6380b60fd4900..2b09b2c69857a 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -272,7 +272,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) iter_flags = SG_MITER_FROM_SG; } - if (req->cmd->rw.control & NVME_RW_LR) + if (req->cmd->rw.control & cpu_to_le16(NVME_RW_LR)) opf |= REQ_FAILFAST_DEV; if (is_pci_p2pdma_page(sg_page(req->sg))) -- GitLab From d68fc95a771e0a7edd876ede7913d61276be77fd Mon Sep 17 00:00:00 2001 From: Francis Pravin Date: Fri, 17 Jan 2025 05:12:09 +0530 Subject: [PATCH 002/989] nvme-pci: remove redundant dma frees in hmb The value of size is 0 when there is no dma buffer allocated. The value of i also remains 0. So, no need to free the dma buffer in out_free_bufs. Hence, remove the redundant dma frees. Signed-off-by: Francis Pravin Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index fe0795e16e250..a14f3c74b7171 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2153,14 +2153,6 @@ static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred, return 0; out_free_bufs: - while (--i >= 0) { - size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE; - - dma_free_attrs(dev->dev, size, bufs[i], - le64_to_cpu(descs[i].addr), - DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN); - } - kfree(bufs); out_free_descs: dma_free_coherent(dev->dev, descs_size, descs, descs_dma); -- GitLab From dbf2bb1a1319b7c7d8828905378a6696cca6b0f2 Mon Sep 17 00:00:00 2001 From: Georg Gottleuber Date: Mon, 16 Dec 2024 23:28:03 +0100 Subject: [PATCH 003/989] nvme-pci: Add TUXEDO InfinityFlex to Samsung sleep quirk On the TUXEDO InfinityFlex, a Samsung 990 Evo NVMe leads to a high power consumption in s2idle sleep (4 watts). This patch applies 'Force No Simple Suspend' quirk to achieve a sleep with a lower power consumption, typically around 1.4 watts. Signed-off-by: Georg Gottleuber Cc: stable@vger.kernel.org Signed-off-by: Werner Sembach Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index a14f3c74b7171..60afffc917b77 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3134,7 +3134,8 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) * because of high power consumption (> 2 Watt) in s2idle * sleep. Only some boards with Intel CPU are affected. 
*/ - if (dmi_match(DMI_BOARD_NAME, "GMxPXxx") || + if (dmi_match(DMI_BOARD_NAME, "DN50Z-140HC-YD") || + dmi_match(DMI_BOARD_NAME, "GMxPXxx") || dmi_match(DMI_BOARD_NAME, "PH4PG31") || dmi_match(DMI_BOARD_NAME, "PH4PRX1_PH6PRX1") || dmi_match(DMI_BOARD_NAME, "PH6PG01_PH6PG71")) -- GitLab From 11cb3529d18514f7d28ad2190533192aedefd761 Mon Sep 17 00:00:00 2001 From: Georg Gottleuber Date: Mon, 16 Dec 2024 23:28:04 +0100 Subject: [PATCH 004/989] nvme-pci: Add TUXEDO IBP Gen9 to Samsung sleep quirk On the TUXEDO InfinityBook Pro Gen9 Intel, a Samsung 990 Evo NVMe leads to a high power consumption in s2idle sleep (4 watts). This patch applies 'Force No Simple Suspend' quirk to achieve a sleep with a lower power consumption, typically around 1.2 watts. Signed-off-by: Georg Gottleuber Cc: stable@vger.kernel.org Signed-off-by: Werner Sembach Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 60afffc917b77..ac708169efed9 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3136,6 +3136,7 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) */ if (dmi_match(DMI_BOARD_NAME, "DN50Z-140HC-YD") || dmi_match(DMI_BOARD_NAME, "GMxPXxx") || + dmi_match(DMI_BOARD_NAME, "GXxMRXx") || dmi_match(DMI_BOARD_NAME, "PH4PG31") || dmi_match(DMI_BOARD_NAME, "PH4PRX1_PH6PRX1") || dmi_match(DMI_BOARD_NAME, "PH6PG01_PH6PG71")) -- GitLab From ccb7276a6d26d6f8416e315b43b45e15ee7f29e2 Mon Sep 17 00:00:00 2001 From: Andy Strohman Date: Thu, 9 Jan 2025 02:27:56 +0000 Subject: [PATCH 005/989] batman-adv: fix panic during interface removal Reference counting is used to ensure that batadv_hardif_neigh_node and batadv_hard_iface are not freed before/during batadv_v_elp_throughput_metric_update work is finished. But there isn't a guarantee that the hard if will remain associated with a soft interface up until the work is finished. This fixes a crash triggered by reboot that looks like this: Call trace: batadv_v_mesh_free+0xd0/0x4dc [batman_adv] batadv_v_elp_throughput_metric_update+0x1c/0xa4 process_one_work+0x178/0x398 worker_thread+0x2e8/0x4d0 kthread+0xd8/0xdc ret_from_fork+0x10/0x20 (the batadv_v_mesh_free call is misleading, and does not actually happen) I was able to make the issue happen more reliably by changing hardif_neigh->bat_v.metric_work work to be delayed work. This allowed me to track down and confirm the fix. 
Cc: stable@vger.kernel.org Fixes: c833484e5f38 ("batman-adv: ELP - compute the metric based on the estimated throughput") Signed-off-by: Andy Strohman [sven@narfation.org: prevent entering batadv_v_elp_get_throughput without soft_iface] Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v_elp.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 1d704574e6bf5..fbf499bcc6718 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -66,12 +66,19 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface) static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) { struct batadv_hard_iface *hard_iface = neigh->if_incoming; + struct net_device *soft_iface = hard_iface->soft_iface; struct ethtool_link_ksettings link_settings; struct net_device *real_netdev; struct station_info sinfo; u32 throughput; int ret; + /* don't query throughput when no longer associated with any + * batman-adv interface + */ + if (!soft_iface) + return BATADV_THROUGHPUT_DEFAULT_VALUE; + /* if the user specified a customised value for this interface, then * return it directly */ @@ -141,7 +148,7 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) default_throughput: if (!(hard_iface->bat_v.flags & BATADV_WARNING_DEFAULT)) { - batadv_info(hard_iface->soft_iface, + batadv_info(soft_iface, "WiFi driver or ethtool info does not provide information about link speeds on interface %s, therefore defaulting to hardcoded throughput values of %u.%1u Mbps. Consider overriding the throughput manually or checking your driver.\n", hard_iface->net_dev->name, BATADV_THROUGHPUT_DEFAULT_VALUE / 10, -- GitLab From 9bcbb6104a344d3526e185ee1e7b985509914e90 Mon Sep 17 00:00:00 2001 From: Lokesh Vutla Date: Tue, 21 Jan 2025 04:40:16 +0000 Subject: [PATCH 006/989] KVM: arm64: Flush hyp bss section after initialization of variables in bss To determine CPU features during initialization, the nVHE hypervisor utilizes sanitized values of the host's CPU features registers. These values, stored in u64 idaa64*_el1_sys_val variables are updated by the kvm_hyp_init_symbols() function at EL1. To ensure EL2 visibility with the MMU off, the data cache needs to be flushed after these updates. However, individually flushing each variable using kvm_flush_dcache_to_poc() is inefficient. These cpu feature variables would be part of the bss section of the hypervisor. Hence, flush the entire bss section of hypervisor once the initialization is complete. Fixes: 6c30bfb18d0b ("KVM: arm64: Add handlers for protected VM System Registers") Suggested-by: Fuad Tabba Signed-off-by: Lokesh Vutla Link: https://lore.kernel.org/r/20250121044016.2219256-1-lokeshvutla@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index bcc4f7e926349..0725a0b50a3e9 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2400,6 +2400,13 @@ static void kvm_hyp_init_symbols(void) kvm_nvhe_sym(id_aa64smfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64SMFR0_EL1); kvm_nvhe_sym(__icache_flags) = __icache_flags; kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits; + + /* + * Flush entire BSS since part of its data containing init symbols is read + * while the MMU is off. 
+ */ + kvm_flush_dcache_to_poc(kvm_ksym_ref(__hyp_bss_start), + kvm_ksym_ref(__hyp_bss_end) - kvm_ksym_ref(__hyp_bss_start)); } static int __init kvm_hyp_init_protection(u32 hyp_va_bits) -- GitLab From 3429dd57f0deb1a602c2624a1dd7c4c11b6c4734 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Fri, 17 Jan 2025 10:58:52 +0000 Subject: [PATCH 007/989] sched/fair: Fix inaccurate h_nr_runnable accounting with delayed dequeue set_delayed() adjusts cfs_rq->h_nr_runnable for the hierarchy when an entity is delayed irrespective of whether the entity corresponds to a task or a cfs_rq. Consider the following scenario: root / \ A B (*) delayed since B is no longer eligible on root | | Task0 Task1 <--- dequeue_task_fair() - task blocks When Task1 blocks (dequeue_entity() for task's se returns true), dequeue_entities() will continue adjusting cfs_rq->h_nr_* for the hierarchy of Task1. However, when the sched_entity corresponding to cfs_rq B is delayed, set_delayed() will adjust the h_nr_runnable for the hierarchy too leading to both dequeue_entity() and set_delayed() decrementing h_nr_runnable for the dequeue of the same task. A SCHED_WARN_ON() to inspect h_nr_runnable post its update in dequeue_entities() like below: cfs_rq->h_nr_runnable -= h_nr_runnable; SCHED_WARN_ON(((int) cfs_rq->h_nr_runnable) < 0); is consistently tripped when running wakeup intensive workloads like hackbench in a cgroup. This error is self correcting since cfs_rq are per-cpu and cannot migrate. The entitiy is either picked for full dequeue or is requeued when a task wakes up below it. Both those paths call clear_delayed() which again increments h_nr_runnable of the hierarchy without considering if the entity corresponds to a task or not. h_nr_runnable will eventually reflect the correct value however in the interim, the incorrect values can still influence PELT calculation which uses se->runnable_weight or cfs_rq->h_nr_runnable. Since only delayed tasks take the early return path in dequeue_entities() and enqueue_task_fair(), adjust the h_nr_runnable in {set,clear}_delayed() only when a task is delayed as this path skips the h_nr_* update loops and returns early. For entities corresponding to cfs_rq, the h_nr_* update loop in the caller will do the right thing. Fixes: 76f2f783294d ("sched/eevdf: More PELT vs DELAYED_DEQUEUE") Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Gautham R. Shenoy Tested-by: Swapnil Sapkal Link: https://lkml.kernel.org/r/20250117105852.23908-1-kprateek.nayak@amd.com --- kernel/sched/fair.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 26958431deb7a..f4e4d3ed943c7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5372,6 +5372,15 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void set_delayed(struct sched_entity *se) { se->sched_delayed = 1; + + /* + * Delayed se of cfs_rq have no tasks queued on them. + * Do not adjust h_nr_runnable since dequeue_entities() + * will account it for blocked tasks. + */ + if (!entity_is_task(se)) + return; + for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -5384,6 +5393,16 @@ static void set_delayed(struct sched_entity *se) static void clear_delayed(struct sched_entity *se) { se->sched_delayed = 0; + + /* + * Delayed se of cfs_rq have no tasks queued on them. 
+ * Do not adjust h_nr_runnable since a dequeue has + * already accounted for it or an enqueue of a task + * below it will account for it in enqueue_task_fair(). + */ + if (!entity_is_task(se)) + return; + for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); -- GitLab From b893d7ff853e27aa6000fc4ca12e0ffda3318bfc Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Mon, 13 Jan 2025 12:07:57 -0600 Subject: [PATCH 008/989] scsi: core: Add passthrough tests for success and no failure definitions This patch adds scsi_check_passthrough() tests for the cases where a command completes successfully and when the command failed but the caller did not pass in a list of failures. Signed-off-by: Mike Christie Link: https://lore.kernel.org/r/20250113180757.16691-1-michael.christie@oracle.com Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib_test.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/scsi/scsi_lib_test.c b/drivers/scsi/scsi_lib_test.c index 99834426a100a..ae8af0e0047a8 100644 --- a/drivers/scsi/scsi_lib_test.c +++ b/drivers/scsi/scsi_lib_test.c @@ -67,6 +67,13 @@ static void scsi_lib_test_multiple_sense(struct kunit *test) }; int i; + /* Success */ + sc.result = 0; + KUNIT_EXPECT_EQ(test, 0, scsi_check_passthrough(&sc, &failures)); + KUNIT_EXPECT_EQ(test, 0, scsi_check_passthrough(&sc, NULL)); + /* Command failed but caller did not pass in a failures array */ + scsi_build_sense(&sc, 0, ILLEGAL_REQUEST, 0x91, 0x36); + KUNIT_EXPECT_EQ(test, 0, scsi_check_passthrough(&sc, NULL)); /* Match end of array */ scsi_build_sense(&sc, 0, ILLEGAL_REQUEST, 0x91, 0x36); KUNIT_EXPECT_EQ(test, -EAGAIN, scsi_check_passthrough(&sc, &failures)); -- GitLab From 1b3e2d4ec0c5848776cc56d2624998aa5b2f0d27 Mon Sep 17 00:00:00 2001 From: "Bao D. Nguyen" Date: Mon, 13 Jan 2025 10:32:07 -0800 Subject: [PATCH 009/989] scsi: ufs: core: Fix the HIGH/LOW_TEMP Bit Definitions According to the UFS Device Specification, the dExtendedUFSFeaturesSupport defines the support for TOO_HIGH_TEMPERATURE as bit[4] and the TOO_LOW_TEMPERATURE as bit[5]. Correct the code to match with the UFS device specification definition. Cc: stable@vger.kernel.org Fixes: e88e2d32200a ("scsi: ufs: core: Probe for temperature notification support") Signed-off-by: Bao D. Nguyen Link: https://lore.kernel.org/r/69992b3e3e3434a5c7643be5a64de48be892ca46.1736793068.git.quic_nguyenb@quicinc.com Reviewed-by: Avri Altman Reviewed-by: Peter Wang Signed-off-by: Martin K. Petersen --- include/ufs/ufs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ufs/ufs.h b/include/ufs/ufs.h index 89672ad8c3bb0..f151feb0ca8c7 100644 --- a/include/ufs/ufs.h +++ b/include/ufs/ufs.h @@ -385,8 +385,8 @@ enum { /* Possible values for dExtendedUFSFeaturesSupport */ enum { - UFS_DEV_LOW_TEMP_NOTIF = BIT(4), - UFS_DEV_HIGH_TEMP_NOTIF = BIT(5), + UFS_DEV_HIGH_TEMP_NOTIF = BIT(4), + UFS_DEV_LOW_TEMP_NOTIF = BIT(5), UFS_DEV_EXT_TEMP_NOTIF = BIT(6), UFS_DEV_HPB_SUPPORT = BIT(7), UFS_DEV_WRITE_BOOSTER_SUP = BIT(8), -- GitLab From c9d2782988df354b5a2db00be93920b4ecdde7a2 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Tue, 14 Jan 2025 10:50:41 +0800 Subject: [PATCH 010/989] scsi: target: core: Add line break to status show To ensure the output is not tangled with the shell prompt, add a line break to clearly display the status. Signed-off-by: Guixin Liu Link: https://lore.kernel.org/r/20250114025041.97301-1-kanie@linux.alibaba.com Reviewed-by: Mike Christie Signed-off-by: Martin K. 
Petersen --- drivers/target/target_core_stat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/target/target_core_stat.c b/drivers/target/target_core_stat.c index c42cbde8a31b3..210648a0092e2 100644 --- a/drivers/target/target_core_stat.c +++ b/drivers/target/target_core_stat.c @@ -117,9 +117,9 @@ static ssize_t target_stat_tgt_status_show(struct config_item *item, char *page) { if (to_stat_tgt_dev(item)->export_count) - return snprintf(page, PAGE_SIZE, "activated"); + return snprintf(page, PAGE_SIZE, "activated\n"); else - return snprintf(page, PAGE_SIZE, "deactivated"); + return snprintf(page, PAGE_SIZE, "deactivated\n"); } static ssize_t target_stat_tgt_non_access_lus_show(struct config_item *item, -- GitLab From 8c09f612b2937da109ed0df583ace3a29fc95a93 Mon Sep 17 00:00:00 2001 From: Avri Altman Date: Tue, 14 Jan 2025 20:12:05 +0200 Subject: [PATCH 011/989] scsi: ufs: core: Simplify temperature exception event handling This commit simplifies the temperature exception event handling by removing the ufshcd_temp_exception_event_handler() function and directly calling ufs_hwmon_notify_event() in ufshcd_exception_event_handler(). The ufshcd_temp_exception_event_handler() function contained a placeholder comment for platform vendors to add additional steps if required. However, since its introduction a few years ago, no vendor has added any additional steps. Therefore, the placeholder function is removed to streamline the code. Signed-off-by: Avri Altman Link: https://lore.kernel.org/r/20250114181205.153760-1-avri.altman@wdc.com Reviewed-by: Bean Huo Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 0920a443588c2..f6c38cf103820 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -5976,24 +5976,6 @@ static void ufshcd_bkops_exception_event_handler(struct ufs_hba *hba) __func__, err); } -static void ufshcd_temp_exception_event_handler(struct ufs_hba *hba, u16 status) -{ - u32 value; - - if (ufshcd_query_attr_retry(hba, UPIU_QUERY_OPCODE_READ_ATTR, - QUERY_ATTR_IDN_CASE_ROUGH_TEMP, 0, 0, &value)) - return; - - dev_info(hba->dev, "exception Tcase %d\n", value - 80); - - ufs_hwmon_notify_event(hba, status & MASK_EE_URGENT_TEMP); - - /* - * A placeholder for the platform vendors to add whatever additional - * steps required - */ -} - static int __ufshcd_wb_toggle(struct ufs_hba *hba, bool set, enum flag_idn idn) { u8 index; @@ -6214,7 +6196,7 @@ static void ufshcd_exception_event_handler(struct work_struct *work) ufshcd_bkops_exception_event_handler(hba); if (status & hba->ee_drv_mask & MASK_EE_URGENT_TEMP) - ufshcd_temp_exception_event_handler(hba, status); + ufs_hwmon_notify_event(hba, status & MASK_EE_URGENT_TEMP); ufs_debugfs_exception_event(hba, status); } -- GitLab From e7e34ffc976aaae4f465b7898303241b81ceefc3 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 20 Jan 2025 20:35:28 +0100 Subject: [PATCH 012/989] batman-adv: Ignore neighbor throughput metrics in error case If a temporary error happened in the evaluation of the neighbor throughput information, then the invalid throughput result should not be stored in the throughtput EWMA. 
Cc: stable@vger.kernel.org Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v_elp.c | 50 ++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index fbf499bcc6718..65e52de52bcd2 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -59,11 +59,13 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface) /** * batadv_v_elp_get_throughput() - get the throughput towards a neighbour * @neigh: the neighbour for which the throughput has to be obtained + * @pthroughput: calculated throughput towards the given neighbour in multiples + * of 100kpbs (a value of '1' equals 0.1Mbps, '10' equals 1Mbps, etc). * - * Return: The throughput towards the given neighbour in multiples of 100kpbs - * (a value of '1' equals 0.1Mbps, '10' equals 1Mbps, etc). + * Return: true when value behind @pthroughput was set */ -static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) +static bool batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh, + u32 *pthroughput) { struct batadv_hard_iface *hard_iface = neigh->if_incoming; struct net_device *soft_iface = hard_iface->soft_iface; @@ -77,14 +79,16 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) * batman-adv interface */ if (!soft_iface) - return BATADV_THROUGHPUT_DEFAULT_VALUE; + return false; /* if the user specified a customised value for this interface, then * return it directly */ throughput = atomic_read(&hard_iface->bat_v.throughput_override); - if (throughput != 0) - return throughput; + if (throughput != 0) { + *pthroughput = throughput; + return true; + } /* if this is a wireless device, then ask its throughput through * cfg80211 API @@ -111,19 +115,24 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) * possible to delete this neighbor. For now set * the throughput metric to 0. 
*/ - return 0; + *pthroughput = 0; + return true; } if (ret) goto default_throughput; - if (sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT)) - return sinfo.expected_throughput / 100; + if (sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT)) { + *pthroughput = sinfo.expected_throughput / 100; + return true; + } /* try to estimate the expected throughput based on reported tx * rates */ - if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) - return cfg80211_calculate_bitrate(&sinfo.txrate) / 3; + if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) { + *pthroughput = cfg80211_calculate_bitrate(&sinfo.txrate) / 3; + return true; + } goto default_throughput; } @@ -142,8 +151,10 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX; throughput = link_settings.base.speed; - if (throughput && throughput != SPEED_UNKNOWN) - return throughput * 10; + if (throughput && throughput != SPEED_UNKNOWN) { + *pthroughput = throughput * 10; + return true; + } } default_throughput: @@ -157,7 +168,8 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) } /* if none of the above cases apply, return the base_throughput */ - return BATADV_THROUGHPUT_DEFAULT_VALUE; + *pthroughput = BATADV_THROUGHPUT_DEFAULT_VALUE; + return true; } /** @@ -169,15 +181,21 @@ void batadv_v_elp_throughput_metric_update(struct work_struct *work) { struct batadv_hardif_neigh_node_bat_v *neigh_bat_v; struct batadv_hardif_neigh_node *neigh; + u32 throughput; + bool valid; neigh_bat_v = container_of(work, struct batadv_hardif_neigh_node_bat_v, metric_work); neigh = container_of(neigh_bat_v, struct batadv_hardif_neigh_node, bat_v); - ewma_throughput_add(&neigh->bat_v.throughput, - batadv_v_elp_get_throughput(neigh)); + valid = batadv_v_elp_get_throughput(neigh, &throughput); + if (!valid) + goto put_neigh; + + ewma_throughput_add(&neigh->bat_v.throughput, throughput); +put_neigh: /* decrement refcounter to balance increment performed before scheduling * this task */ -- GitLab From 8c8ecc98f5c65947b0070a24bac11e12e47cc65d Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 20 Jan 2025 00:06:11 +0100 Subject: [PATCH 013/989] batman-adv: Drop unmanaged ELP metric worker The ELP worker needs to calculate new metric values for all neighbors "reachable" over an interface. Some of the used metric sources require locks which might need to sleep. This sleep is incompatible with the RCU list iterator used for the recorded neighbors. The initial approach to work around of this problem was to queue another work item per neighbor and then run this in a new context. Even when this solved the RCU vs might_sleep() conflict, it has a major problems: Nothing was stopping the work item in case it is not needed anymore - for example because one of the related interfaces was removed or the batman-adv module was unloaded - resulting in potential invalid memory accesses. Directly canceling the metric worker also has various problems: * cancel_work_sync for a to-be-deactivated interface is called with rtnl_lock held. But the code in the ELP metric worker also tries to use rtnl_lock() - which will never return in this case. This also means that cancel_work_sync would never return because it is waiting for the worker to finish. * iterating over the neighbor list for the to-be-deactivated interface is currently done using the RCU specific methods. 
Which means that it is possible to miss items when iterating over it without the associated spinlock - a behaviour which is acceptable for a periodic metric check but not for a cleanup routine (which must "stop" all still running workers) The better approch is to get rid of the per interface neighbor metric worker and handle everything in the interface worker. The original problems are solved by: * creating a list of neighbors which require new metric information inside the RCU protected context, gathering the metric according to the new list outside the RCU protected context * only use rcu_trylock inside metric gathering code to avoid a deadlock when the cancel_delayed_work_sync is called in the interface removal code (which is called with the rtnl_lock held) Cc: stable@vger.kernel.org Fixes: c833484e5f38 ("batman-adv: ELP - compute the metric based on the estimated throughput") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v.c | 2 -- net/batman-adv/bat_v_elp.c | 71 ++++++++++++++++++++++++++------------ net/batman-adv/bat_v_elp.h | 2 -- net/batman-adv/types.h | 3 -- 4 files changed, 48 insertions(+), 30 deletions(-) diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index ac11f1f08db0f..d35479c465e2c 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -113,8 +113,6 @@ static void batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh) { ewma_throughput_init(&hardif_neigh->bat_v.throughput); - INIT_WORK(&hardif_neigh->bat_v.metric_work, - batadv_v_elp_throughput_metric_update); } /** diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 65e52de52bcd2..b065578b4436e 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +43,18 @@ #include "routing.h" #include "send.h" +/** + * struct batadv_v_metric_queue_entry - list of hardif neighbors which require + * and metric update + */ +struct batadv_v_metric_queue_entry { + /** @hardif_neigh: hardif neighbor scheduled for metric update */ + struct batadv_hardif_neigh_node *hardif_neigh; + + /** @list: list node for metric_queue */ + struct list_head list; +}; + /** * batadv_v_elp_start_timer() - restart timer for ELP periodic work * @hard_iface: the interface for which the timer has to be reset @@ -137,10 +151,17 @@ static bool batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh, goto default_throughput; } + /* only use rtnl_trylock because the elp worker will be cancelled while + * the rntl_lock is held. the cancel_delayed_work_sync() would otherwise + * wait forever when the elp work_item was started and it is then also + * trying to rtnl_lock + */ + if (!rtnl_trylock()) + return false; + /* if not a wifi interface, check if this device provides data via * ethtool (e.g. 
an Ethernet adapter) */ - rtnl_lock(); ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings); rtnl_unlock(); if (ret == 0) { @@ -175,31 +196,19 @@ static bool batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh, /** * batadv_v_elp_throughput_metric_update() - worker updating the throughput * metric of a single hop neighbour - * @work: the work queue item + * @neigh: the neighbour to probe */ -void batadv_v_elp_throughput_metric_update(struct work_struct *work) +static void +batadv_v_elp_throughput_metric_update(struct batadv_hardif_neigh_node *neigh) { - struct batadv_hardif_neigh_node_bat_v *neigh_bat_v; - struct batadv_hardif_neigh_node *neigh; u32 throughput; bool valid; - neigh_bat_v = container_of(work, struct batadv_hardif_neigh_node_bat_v, - metric_work); - neigh = container_of(neigh_bat_v, struct batadv_hardif_neigh_node, - bat_v); - valid = batadv_v_elp_get_throughput(neigh, &throughput); if (!valid) - goto put_neigh; + return; ewma_throughput_add(&neigh->bat_v.throughput, throughput); - -put_neigh: - /* decrement refcounter to balance increment performed before scheduling - * this task - */ - batadv_hardif_neigh_put(neigh); } /** @@ -273,14 +282,16 @@ batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh) */ static void batadv_v_elp_periodic_work(struct work_struct *work) { + struct batadv_v_metric_queue_entry *metric_entry; + struct batadv_v_metric_queue_entry *metric_safe; struct batadv_hardif_neigh_node *hardif_neigh; struct batadv_hard_iface *hard_iface; struct batadv_hard_iface_bat_v *bat_v; struct batadv_elp_packet *elp_packet; + struct list_head metric_queue; struct batadv_priv *bat_priv; struct sk_buff *skb; u32 elp_interval; - bool ret; bat_v = container_of(work, struct batadv_hard_iface_bat_v, elp_wq.work); hard_iface = container_of(bat_v, struct batadv_hard_iface, bat_v); @@ -316,6 +327,8 @@ static void batadv_v_elp_periodic_work(struct work_struct *work) atomic_inc(&hard_iface->bat_v.elp_seqno); + INIT_LIST_HEAD(&metric_queue); + /* The throughput metric is updated on each sent packet. This way, if a * node is dead and no longer sends packets, batman-adv is still able to * react timely to its death. @@ -340,16 +353,28 @@ static void batadv_v_elp_periodic_work(struct work_struct *work) /* Reading the estimated throughput from cfg80211 is a task that * may sleep and that is not allowed in an rcu protected - * context. Therefore schedule a task for that. + * context. Therefore add it to metric_queue and process it + * outside rcu protected context. 
*/ - ret = queue_work(batadv_event_workqueue, - &hardif_neigh->bat_v.metric_work); - - if (!ret) + metric_entry = kzalloc(sizeof(*metric_entry), GFP_ATOMIC); + if (!metric_entry) { batadv_hardif_neigh_put(hardif_neigh); + continue; + } + + metric_entry->hardif_neigh = hardif_neigh; + list_add(&metric_entry->list, &metric_queue); } rcu_read_unlock(); + list_for_each_entry_safe(metric_entry, metric_safe, &metric_queue, list) { + batadv_v_elp_throughput_metric_update(metric_entry->hardif_neigh); + + batadv_hardif_neigh_put(metric_entry->hardif_neigh); + list_del(&metric_entry->list); + kfree(metric_entry); + } + restart_timer: batadv_v_elp_start_timer(hard_iface); out: diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index 9e2740195fa2d..c9cb0a3071004 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -10,7 +10,6 @@ #include "main.h" #include -#include int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface); void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface); @@ -19,6 +18,5 @@ void batadv_v_elp_iface_activate(struct batadv_hard_iface *primary_iface, void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface); int batadv_v_elp_packet_recv(struct sk_buff *skb, struct batadv_hard_iface *if_incoming); -void batadv_v_elp_throughput_metric_update(struct work_struct *work); #endif /* _NET_BATMAN_ADV_BAT_V_ELP_H_ */ diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 04f6398b3a40e..85a50096f5b24 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -596,9 +596,6 @@ struct batadv_hardif_neigh_node_bat_v { * neighbor */ unsigned long last_unicast_tx; - - /** @metric_work: work queue callback item for metric update */ - struct work_struct metric_work; }; /** -- GitLab From 3fafa6a02be219ddd05d6201911534a34135cb82 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 20 Jan 2025 15:35:01 +0100 Subject: [PATCH 014/989] dt-bindings: interrupt-controller: microchip,lan966x-oic: Clarify endpoint use Reword the description, to make it clear that the LAN966x Outbound Interrupt Controller is used only in PCI endpoint mode. Signed-off-by: Geert Uytterhoeven Signed-off-by: Thomas Gleixner Acked-by: Krzysztof Kozlowski Acked-by: Herve Codina Link: https://lore.kernel.org/all/247b1185c93610100f3f8c9e0ab2c1506e53e1f4.1737383314.git.geert+renesas@glider.be --- .../bindings/interrupt-controller/microchip,lan966x-oic.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/interrupt-controller/microchip,lan966x-oic.yaml b/Documentation/devicetree/bindings/interrupt-controller/microchip,lan966x-oic.yaml index b2adc71741770..dca16e202da99 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/microchip,lan966x-oic.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/microchip,lan966x-oic.yaml @@ -14,9 +14,8 @@ allOf: description: | The Microchip LAN966x outband interrupt controller (OIC) maps the internal - interrupt sources of the LAN966x device to an external interrupt. - When the LAN966x device is used as a PCI device, the external interrupt is - routed to the PCI interrupt. + interrupt sources of the LAN966x device to a PCI interrupt when the LAN966x + device is used as a PCI device. 
properties: compatible: -- GitLab From e06c9e3682f58fbeb632b7b866bb4fe66a4a4b42 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 20 Jan 2025 15:35:02 +0100 Subject: [PATCH 015/989] irqchip/lan966x-oic: Make CONFIG_LAN966X_OIC depend on CONFIG_MCHP_LAN966X_PCI The Microchip LAN966x outband interrupt controller is only present on Microchip LAN966x SoCs, and only used in PCI endpoint mode. Hence add a dependency on MCHP_LAN966X_PCI, to prevent asking the user about this driver when configuring a kernel without Microchip LAN966x PCIe support. Fixes: 3e3a7b35332924c8 ("irqchip: Add support for LAN966x OIC") Signed-off-by: Geert Uytterhoeven Signed-off-by: Thomas Gleixner Acked-by: Herve Codina Link: https://lore.kernel.org/all/28e8a605e72ee45e27f0d06b2b71366159a9c782.1737383314.git.geert+renesas@glider.be --- drivers/irqchip/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index be063bfb50c4b..c11b9965c4ad9 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -169,6 +169,7 @@ config IXP4XX_IRQ config LAN966X_OIC tristate "Microchip LAN966x OIC Support" + depends on MCHP_LAN966X_PCI || COMPILE_TEST select GENERIC_IRQ_CHIP select IRQ_DOMAIN help -- GitLab From d3d380eded7ee5fc2fc53b3b0e72365ded025c4a Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Thu, 9 Jan 2025 14:30:47 +0100 Subject: [PATCH 016/989] nvme-fc: go straight to connecting state when initializing The initial controller initialization mimiks the reconnect loop behavior by switching from NEW to RESETTING and then to CONNECTING. The transition from NEW to CONNECTING is a valid transition, so there is no point entering the RESETTING state. TCP and RDMA also transition directly to CONNECTING state. Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Daniel Wagner Signed-off-by: Keith Busch --- drivers/nvme/host/fc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 094be164ffdc0..7409da42b9ee5 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3578,8 +3578,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, list_add_tail(&ctrl->ctrl_list, &rport->ctrl_list); spin_unlock_irqrestore(&rport->lock, flags); - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING) || - !nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { dev_err(ctrl->ctrl.device, "NVME-FC{%d}: failed to init ctrl state\n", ctrl->cnum); goto fail_ctrl; -- GitLab From 294b2b7516fd06a8dd82e4a6118f318ec521e706 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Thu, 9 Jan 2025 14:30:48 +0100 Subject: [PATCH 017/989] nvme: handle connectivity loss in nvme_set_queue_count When the set feature attempts fails with any NVME status code set in nvme_set_queue_count, the function still report success. Though the numbers of queues set to 0. This is done to support controllers in degraded state (the admin queue is still up and running but no IO queues). Though there is an exception. When nvme_set_features reports an host path error, nvme_set_queue_count should propagate this error as the connectivity is lost, which means also the admin queue is not working anymore. 
Fixes: 9a0be7abb62f ("nvme: refactor set_queue_count") Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Daniel Wagner Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0d21258e22833..2bcd9f710cb65 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1695,7 +1695,13 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0, &result); - if (status < 0) + + /* + * It's either a kernel error or the host observed a connection + * lost. In either case it's not possible communicate with the + * controller and thus enter the error code path. + */ + if (status < 0 || status == NVME_SC_HOST_PATH_ERROR) return status; /* -- GitLab From ee59e3820ca92a9f4307ae23dfc7229dc8b8d400 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Thu, 9 Jan 2025 14:30:49 +0100 Subject: [PATCH 018/989] nvme-fc: do not ignore connectivity loss during connecting When a connectivity loss occurs while nvme_fc_create_assocation is being executed, it's possible that the ctrl ends up stuck in the LIVE state: 1) nvme nvme10: NVME-FC{10}: create association : ... 2) nvme nvme10: NVME-FC{10}: controller connectivity lost. Awaiting Reconnect nvme nvme10: queue_size 128 > ctrl maxcmd 32, reducing to maxcmd 3) nvme nvme10: Could not set queue count (880) nvme nvme10: Failed to configure AEN (cfg 900) 4) nvme nvme10: NVME-FC{10}: controller connect complete 5) nvme nvme10: failed nvme_keep_alive_end_io error=4 A connection attempt starts 1) and the ctrl is in state CONNECTING. Shortly after the LLDD driver detects a connection lost event and calls nvme_fc_ctrl_connectivity_loss 2). Because we are still in CONNECTING state, this event is ignored. nvme_fc_create_association continues to run in parallel and tries to communicate with the controller and these commands will fail. Though these errors are filtered out, e.g in 3) setting the I/O queues numbers fails which leads to an early exit in nvme_fc_create_io_queues. Because the number of IO queues is 0 at this point, there is nothing left in nvme_fc_create_association which could detected the connection drop. Thus the ctrl enters LIVE state 4). Eventually the keep alive handler times out 5) but because nothing is being done, the ctrl stays in LIVE state. There is already the ASSOC_FAILED flag to track connectivity loss event but this bit is set too late in the recovery code path. Move this into the connectivity loss event handler and synchronize it with the state change. This ensures that the ASSOC_FAILED flag is seen by nvme_fc_create_io_queues and it does not enter the LIVE state after a connectivity loss event. If the connectivity loss event happens after we entered the LIVE state the normal error recovery path is executed. 
Signed-off-by: Daniel Wagner Reviewed-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/fc.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 7409da42b9ee5..55884d3df6f29 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -781,11 +781,19 @@ nvme_fc_abort_lsops(struct nvme_fc_rport *rport) static void nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl) { + enum nvme_ctrl_state state; + unsigned long flags; + dev_info(ctrl->ctrl.device, "NVME-FC{%d}: controller connectivity lost. Awaiting " "Reconnect", ctrl->cnum); - switch (nvme_ctrl_state(&ctrl->ctrl)) { + spin_lock_irqsave(&ctrl->lock, flags); + set_bit(ASSOC_FAILED, &ctrl->flags); + state = nvme_ctrl_state(&ctrl->ctrl); + spin_unlock_irqrestore(&ctrl->lock, flags); + + switch (state) { case NVME_CTRL_NEW: case NVME_CTRL_LIVE: /* @@ -2542,7 +2550,6 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) */ if (ctrl->ctrl.state == NVME_CTRL_CONNECTING) { __nvme_fc_abort_outstanding_ios(ctrl, true); - set_bit(ASSOC_FAILED, &ctrl->flags); dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: transport error during (re)connect\n", ctrl->cnum); @@ -3167,12 +3174,18 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) else ret = nvme_fc_recreate_io_queues(ctrl); } - if (!ret && test_bit(ASSOC_FAILED, &ctrl->flags)) - ret = -EIO; if (ret) goto out_term_aen_ops; - changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + spin_lock_irqsave(&ctrl->lock, flags); + if (!test_bit(ASSOC_FAILED, &ctrl->flags)) + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + else + ret = -EIO; + spin_unlock_irqrestore(&ctrl->lock, flags); + + if (ret) + goto out_term_aen_ops; ctrl->ctrl.nr_reconnects = 0; -- GitLab From 27af31e44949fa85550176520ef7086a0d00fd7b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 16 Jan 2025 18:07:45 +0200 Subject: [PATCH 019/989] hrtimers: Mark is_migration_base() with __always_inline When is_migration_base() is unused, it prevents kernel builds with clang, `make W=1` and CONFIG_WERROR=y: kernel/time/hrtimer.c:156:20: error: unused function 'is_migration_base' [-Werror,-Wunused-function] 156 | static inline bool is_migration_base(struct hrtimer_clock_base *base) | ^~~~~~~~~~~~~~~~~ Fix this by marking it with __always_inline. 
[ tglx: Use __always_inline instead of __maybe_unused and move it into the usage sites conditional ] Signed-off-by: Andy Shevchenko Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250116160745.243358-1-andriy.shevchenko@linux.intel.com --- kernel/time/hrtimer.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f6d8df94045c9..4fb81f8c6f1c7 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -145,11 +145,6 @@ static struct hrtimer_cpu_base migration_cpu_base = { #define migration_base migration_cpu_base.clock_base[0] -static inline bool is_migration_base(struct hrtimer_clock_base *base) -{ - return base == &migration_base; -} - /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are @@ -275,11 +270,6 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, #else /* CONFIG_SMP */ -static inline bool is_migration_base(struct hrtimer_clock_base *base) -{ - return false; -} - static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __acquires(&timer->base->cpu_base->lock) @@ -1370,6 +1360,18 @@ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, } } +#ifdef CONFIG_SMP +static __always_inline bool is_migration_base(struct hrtimer_clock_base *base) +{ + return base == &migration_base; +} +#else +static __always_inline bool is_migration_base(struct hrtimer_clock_base *base) +{ + return false; +} +#endif + /* * This function is called on PREEMPT_RT kernels when the fast path * deletion of a timer failed because the timer callback function was -- GitLab From 53dac345395c0d2493cbc2f4c85fe38aef5b63f5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 18 Jan 2025 00:24:33 +0100 Subject: [PATCH 020/989] hrtimers: Force migrate away hrtimers queued after CPUHP_AP_HRTIMERS_DYING hrtimers are migrated away from the dying CPU to any online target at the CPUHP_AP_HRTIMERS_DYING stage in order not to delay bandwidth timers handling tasks involved in the CPU hotplug forward progress. However wakeups can still be performed by the outgoing CPU after CPUHP_AP_HRTIMERS_DYING. Those can result again in bandwidth timers being armed. Depending on several considerations (crystal ball power management based election, earliest timer already enqueued, timer migration enabled or not), the target may eventually be the current CPU even if offline. If that happens, the timer is eventually ignored. 
The most notable example is RCU which had to deal with each and every of those wake-ups by deferring them to an online CPU, along with related workarounds: _ e787644caf76 (rcu: Defer RCU kthreads wakeup when CPU is dying) _ 9139f93209d1 (rcu/nocb: Fix RT throttling hrtimer armed from offline CPU) _ f7345ccc62a4 (rcu/nocb: Fix rcuog wake-up from offline softirq) The problem isn't confined to RCU though as the stop machine kthread (which runs CPUHP_AP_HRTIMERS_DYING) reports its completion at the end of its work through cpu_stop_signal_done() and performs a wake up that eventually arms the deadline server timer: WARNING: CPU: 94 PID: 588 at kernel/time/hrtimer.c:1086 hrtimer_start_range_ns+0x289/0x2d0 CPU: 94 UID: 0 PID: 588 Comm: migration/94 Not tainted Stopper: multi_cpu_stop+0x0/0x120 <- stop_machine_cpuslocked+0x66/0xc0 RIP: 0010:hrtimer_start_range_ns+0x289/0x2d0 Call Trace: start_dl_timer enqueue_dl_entity dl_server_start enqueue_task_fair enqueue_task ttwu_do_activate try_to_wake_up complete cpu_stopper_thread Instead of providing yet another bandaid to work around the situation, fix it in the hrtimers infrastructure instead: always migrate away a timer to an online target whenever it is enqueued from an offline CPU. This will also allow to revert all the above RCU disgraceful hacks. Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier") Reported-by: Vlad Poenaru Reported-by: Usama Arif Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Tested-by: Paul E. McKenney Link: https://lore.kernel.org/all/20250117232433.24027-1-frederic@kernel.org Closes: 20241213203739.1519801-1-usamaarif642@gmail.com --- include/linux/hrtimer_defs.h | 1 + kernel/time/hrtimer.c | 103 ++++++++++++++++++++++++++++------- 2 files changed, 83 insertions(+), 21 deletions(-) diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index c3b4b7ed7c163..84a5045f80f36 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -125,6 +125,7 @@ struct hrtimer_cpu_base { ktime_t softirq_expires_next; struct hrtimer *softirq_next_timer; struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; + call_single_data_t csd; } ____cacheline_aligned; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 4fb81f8c6f1c7..deb1aa32814e3 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -58,6 +58,8 @@ #define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) +static void retrigger_next_event(void *arg); + /* * The timer bases: * @@ -111,7 +113,8 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, - } + }, + .csd = CSD_INIT(retrigger_next_event, NULL) }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { @@ -124,6 +127,14 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { [CLOCK_TAI] = HRTIMER_BASE_TAI, }; +static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) +{ + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return true; + else + return likely(base->online); +} + /* * Functions and macros which are different for UP/SMP systems are kept in a * single place @@ -178,27 +189,54 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, } /* - * We do not migrate the timer when it is expiring before the next - * event on the target cpu. 
When high resolution is enabled, we cannot - * reprogram the target cpu hardware and we would cause it to fire - * late. To keep it simple, we handle the high resolution enabled and - * disabled case similar. + * Check if the elected target is suitable considering its next + * event and the hotplug state of the current CPU. + * + * If the elected target is remote and its next event is after the timer + * to queue, then a remote reprogram is necessary. However there is no + * guarantee the IPI handling the operation would arrive in time to meet + * the high resolution deadline. In this case the local CPU becomes a + * preferred target, unless it is offline. + * + * High and low resolution modes are handled the same way for simplicity. * * Called with cpu_base->lock of target cpu held. */ -static int -hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) +static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base, + struct hrtimer_cpu_base *new_cpu_base, + struct hrtimer_cpu_base *this_cpu_base) { ktime_t expires; + /* + * The local CPU clockevent can be reprogrammed. Also get_target_base() + * guarantees it is online. + */ + if (new_cpu_base == this_cpu_base) + return true; + + /* + * The offline local CPU can't be the default target if the + * next remote target event is after this timer. Keep the + * elected new base. An IPI will we issued to reprogram + * it as a last resort. + */ + if (!hrtimer_base_is_online(this_cpu_base)) + return true; + expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); - return expires < new_base->cpu_base->expires_next; + + return expires >= new_base->cpu_base->expires_next; } -static inline -struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, - int pinned) +static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { + if (!hrtimer_base_is_online(base)) { + int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER)); + + return &per_cpu(hrtimer_bases, cpu); + } + #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) if (static_branch_likely(&timers_migration_enabled) && !pinned) return &per_cpu(hrtimer_bases, get_nohz_timer_target()); @@ -249,8 +287,8 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); - if (new_cpu_base != this_cpu_base && - hrtimer_check_target(timer, new_base)) { + if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, + this_cpu_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; @@ -259,8 +297,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, } WRITE_ONCE(timer->base, new_base); } else { - if (new_cpu_base != this_cpu_base && - hrtimer_check_target(timer, new_base)) { + if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) { new_cpu_base = this_cpu_base; goto again; } @@ -706,8 +743,6 @@ static inline int hrtimer_is_hres_enabled(void) return hrtimer_hres_enabled; } -static void retrigger_next_event(void *arg); - /* * Switch to high resolution mode */ @@ -1195,6 +1230,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { + struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *new_base; bool force_local, first; 
@@ -1206,9 +1242,15 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * and enforce reprogramming after it is queued no matter whether * it is the new first expiring timer again or not. */ - force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + force_local = base->cpu_base == this_cpu_base; force_local &= base->cpu_base->next_timer == timer; + /* + * Don't force local queuing if this enqueue happens on a unplugged + * CPU after hrtimer_cpu_dying() has been invoked. + */ + force_local &= this_cpu_base->online; + /* * Remove an active timer from the queue. In case it is not queued * on the current CPU, make sure that remove_hrtimer() updates the @@ -1238,8 +1280,27 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, } first = enqueue_hrtimer(timer, new_base, mode); - if (!force_local) - return first; + if (!force_local) { + /* + * If the current CPU base is online, then the timer is + * never queued on a remote CPU if it would be the first + * expiring timer there. + */ + if (hrtimer_base_is_online(this_cpu_base)) + return first; + + /* + * Timer was enqueued remote because the current base is + * already offline. If the timer is the first to expire, + * kick the remote CPU to reprogram the clock event. + */ + if (first) { + struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base; + + smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd); + } + return 0; + } /* * Timer was forced to stay on the current CPU to avoid -- GitLab From 93c66fbc280747ea700bd6199633d661e3c819b3 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Fri, 10 Jan 2025 10:05:54 +0900 Subject: [PATCH 021/989] powercap: call put_device() on an error path in powercap_register_control_type() powercap_register_control_type() calls device_register(), but does not release the refcount of the device when it fails. Call put_device() before returning an error to balance the refcount. Since the kfree(control_type) will be done by powercap_release(), remove the lines in powercap_register_control_type() before returning the error. This bug was found by an experimental verifier that I am developing. Signed-off-by: Joe Hattori Link: https://patch.msgid.link/20250110010554.1583411-1-joe@pf.is.s.u-tokyo.ac.jp [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/powercap/powercap_sys.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/powercap/powercap_sys.c b/drivers/powercap/powercap_sys.c index 52c32dcbf7d84..4112a00973382 100644 --- a/drivers/powercap/powercap_sys.c +++ b/drivers/powercap/powercap_sys.c @@ -627,8 +627,7 @@ struct powercap_control_type *powercap_register_control_type( dev_set_name(&control_type->dev, "%s", name); result = device_register(&control_type->dev); if (result) { - if (control_type->allocated) - kfree(control_type); + put_device(&control_type->dev); return ERR_PTR(result); } idr_init(&control_type->idr); -- GitLab From a216542027b892e6651c1b4e076012140d04afaf Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 10 Jan 2025 15:22:24 +0000 Subject: [PATCH 022/989] btrfs: fix lockdep splat while merging a relocation root When COWing a relocation tree path, at relocation.c:replace_path(), we can trigger a lockdep splat while we are in the btrfs_search_slot() call against the relocation root. This happens in that callchain at ctree.c:read_block_for_search() when we happen to find a child extent buffer already loaded through the fs tree with a lockdep class set to the fs tree. 
So when we attempt to lock that extent buffer through a relocation tree we have to reset the lockdep class to the class for a relocation tree, since a relocation tree has extent buffers that used to belong to a fs tree and may currently be already loaded (we swap extent buffers between the two trees at the end of replace_path()). However we are missing calls to btrfs_maybe_reset_lockdep_class() to reset the lockdep class at ctree.c:read_block_for_search() before we read lock an extent buffer, just like we did for btrfs_search_slot() in commit b40130b23ca4 ("btrfs: fix lockdep splat with reloc root extent buffers"). So add the missing btrfs_maybe_reset_lockdep_class() calls before the attempts to read lock an extent buffer at ctree.c:read_block_for_search(). The lockdep splat was reported by syzbot and it looks like this: ====================================================== WARNING: possible circular locking dependency detected 6.13.0-rc5-syzkaller-00163-gab75170520d4 #0 Not tainted ------------------------------------------------------ syz.0.0/5335 is trying to acquire lock: ffff8880545dbc38 (btrfs-tree-01){++++}-{4:4}, at: btrfs_tree_read_lock_nested+0x2f/0x250 fs/btrfs/locking.c:146 but task is already holding lock: ffff8880545dba58 (btrfs-treloc-02/1){+.+.}-{4:4}, at: btrfs_tree_lock_nested+0x2f/0x250 fs/btrfs/locking.c:189 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (btrfs-treloc-02/1){+.+.}-{4:4}: reacquire_held_locks+0x3eb/0x690 kernel/locking/lockdep.c:5374 __lock_release kernel/locking/lockdep.c:5563 [inline] lock_release+0x396/0xa30 kernel/locking/lockdep.c:5870 up_write+0x79/0x590 kernel/locking/rwsem.c:1629 btrfs_force_cow_block+0x14b3/0x1fd0 fs/btrfs/ctree.c:660 btrfs_cow_block+0x371/0x830 fs/btrfs/ctree.c:755 btrfs_search_slot+0xc01/0x3180 fs/btrfs/ctree.c:2153 replace_path+0x1243/0x2740 fs/btrfs/relocation.c:1224 merge_reloc_root+0xc46/0x1ad0 fs/btrfs/relocation.c:1692 merge_reloc_roots+0x3b3/0x980 fs/btrfs/relocation.c:1942 relocate_block_group+0xb0a/0xd40 fs/btrfs/relocation.c:3754 btrfs_relocate_block_group+0x77d/0xd90 fs/btrfs/relocation.c:4087 btrfs_relocate_chunk+0x12c/0x3b0 fs/btrfs/volumes.c:3494 __btrfs_balance+0x1b0f/0x26b0 fs/btrfs/volumes.c:4278 btrfs_balance+0xbdc/0x10c0 fs/btrfs/volumes.c:4655 btrfs_ioctl_balance+0x493/0x7c0 fs/btrfs/ioctl.c:3670 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:906 [inline] __se_sys_ioctl+0xf5/0x170 fs/ioctl.c:892 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f -> #1 (btrfs-tree-01/1){+.+.}-{4:4}: lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849 down_write_nested+0xa2/0x220 kernel/locking/rwsem.c:1693 btrfs_tree_lock_nested+0x2f/0x250 fs/btrfs/locking.c:189 btrfs_init_new_buffer fs/btrfs/extent-tree.c:5052 [inline] btrfs_alloc_tree_block+0x41c/0x1440 fs/btrfs/extent-tree.c:5132 btrfs_force_cow_block+0x526/0x1fd0 fs/btrfs/ctree.c:573 btrfs_cow_block+0x371/0x830 fs/btrfs/ctree.c:755 btrfs_search_slot+0xc01/0x3180 fs/btrfs/ctree.c:2153 btrfs_insert_empty_items+0x9c/0x1a0 fs/btrfs/ctree.c:4351 btrfs_insert_empty_item fs/btrfs/ctree.h:688 [inline] btrfs_insert_inode_ref+0x2bb/0xf80 fs/btrfs/inode-item.c:330 btrfs_rename_exchange fs/btrfs/inode.c:7990 [inline] btrfs_rename2+0xcb7/0x2b90 fs/btrfs/inode.c:8374 vfs_rename+0xbdb/0xf00 fs/namei.c:5067 do_renameat2+0xd94/0x13f0 fs/namei.c:5224 __do_sys_renameat2 fs/namei.c:5258 [inline] __se_sys_renameat2 fs/namei.c:5255 
[inline] __x64_sys_renameat2+0xce/0xe0 fs/namei.c:5255 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f -> #0 (btrfs-tree-01){++++}-{4:4}: check_prev_add kernel/locking/lockdep.c:3161 [inline] check_prevs_add kernel/locking/lockdep.c:3280 [inline] validate_chain+0x18ef/0x5920 kernel/locking/lockdep.c:3904 __lock_acquire+0x1397/0x2100 kernel/locking/lockdep.c:5226 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849 down_read_nested+0xb5/0xa50 kernel/locking/rwsem.c:1649 btrfs_tree_read_lock_nested+0x2f/0x250 fs/btrfs/locking.c:146 btrfs_tree_read_lock fs/btrfs/locking.h:188 [inline] read_block_for_search+0x718/0xbb0 fs/btrfs/ctree.c:1610 btrfs_search_slot+0x1274/0x3180 fs/btrfs/ctree.c:2237 replace_path+0x1243/0x2740 fs/btrfs/relocation.c:1224 merge_reloc_root+0xc46/0x1ad0 fs/btrfs/relocation.c:1692 merge_reloc_roots+0x3b3/0x980 fs/btrfs/relocation.c:1942 relocate_block_group+0xb0a/0xd40 fs/btrfs/relocation.c:3754 btrfs_relocate_block_group+0x77d/0xd90 fs/btrfs/relocation.c:4087 btrfs_relocate_chunk+0x12c/0x3b0 fs/btrfs/volumes.c:3494 __btrfs_balance+0x1b0f/0x26b0 fs/btrfs/volumes.c:4278 btrfs_balance+0xbdc/0x10c0 fs/btrfs/volumes.c:4655 btrfs_ioctl_balance+0x493/0x7c0 fs/btrfs/ioctl.c:3670 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:906 [inline] __se_sys_ioctl+0xf5/0x170 fs/ioctl.c:892 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f other info that might help us debug this: Chain exists of: btrfs-tree-01 --> btrfs-tree-01/1 --> btrfs-treloc-02/1 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(btrfs-treloc-02/1); lock(btrfs-tree-01/1); lock(btrfs-treloc-02/1); rlock(btrfs-tree-01); *** DEADLOCK *** 8 locks held by syz.0.0/5335: #0: ffff88801e3ae420 (sb_writers#13){.+.+}-{0:0}, at: mnt_want_write_file+0x5e/0x200 fs/namespace.c:559 #1: ffff888052c760d0 (&fs_info->reclaim_bgs_lock){+.+.}-{4:4}, at: __btrfs_balance+0x4c2/0x26b0 fs/btrfs/volumes.c:4183 #2: ffff888052c74850 (&fs_info->cleaner_mutex){+.+.}-{4:4}, at: btrfs_relocate_block_group+0x775/0xd90 fs/btrfs/relocation.c:4086 #3: ffff88801e3ae610 (sb_internal#2){.+.+}-{0:0}, at: merge_reloc_root+0xf11/0x1ad0 fs/btrfs/relocation.c:1659 #4: ffff888052c76470 (btrfs_trans_num_writers){++++}-{0:0}, at: join_transaction+0x405/0xda0 fs/btrfs/transaction.c:288 #5: ffff888052c76498 (btrfs_trans_num_extwriters){++++}-{0:0}, at: join_transaction+0x405/0xda0 fs/btrfs/transaction.c:288 #6: ffff8880545db878 (btrfs-tree-01/1){+.+.}-{4:4}, at: btrfs_tree_lock_nested+0x2f/0x250 fs/btrfs/locking.c:189 #7: ffff8880545dba58 (btrfs-treloc-02/1){+.+.}-{4:4}, at: btrfs_tree_lock_nested+0x2f/0x250 fs/btrfs/locking.c:189 stack backtrace: CPU: 0 UID: 0 PID: 5335 Comm: syz.0.0 Not tainted 6.13.0-rc5-syzkaller-00163-gab75170520d4 #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 print_circular_bug+0x13a/0x1b0 kernel/locking/lockdep.c:2074 check_noncircular+0x36a/0x4a0 kernel/locking/lockdep.c:2206 check_prev_add kernel/locking/lockdep.c:3161 [inline] check_prevs_add kernel/locking/lockdep.c:3280 [inline] validate_chain+0x18ef/0x5920 kernel/locking/lockdep.c:3904 __lock_acquire+0x1397/0x2100 kernel/locking/lockdep.c:5226 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849 down_read_nested+0xb5/0xa50 
kernel/locking/rwsem.c:1649 btrfs_tree_read_lock_nested+0x2f/0x250 fs/btrfs/locking.c:146 btrfs_tree_read_lock fs/btrfs/locking.h:188 [inline] read_block_for_search+0x718/0xbb0 fs/btrfs/ctree.c:1610 btrfs_search_slot+0x1274/0x3180 fs/btrfs/ctree.c:2237 replace_path+0x1243/0x2740 fs/btrfs/relocation.c:1224 merge_reloc_root+0xc46/0x1ad0 fs/btrfs/relocation.c:1692 merge_reloc_roots+0x3b3/0x980 fs/btrfs/relocation.c:1942 relocate_block_group+0xb0a/0xd40 fs/btrfs/relocation.c:3754 btrfs_relocate_block_group+0x77d/0xd90 fs/btrfs/relocation.c:4087 btrfs_relocate_chunk+0x12c/0x3b0 fs/btrfs/volumes.c:3494 __btrfs_balance+0x1b0f/0x26b0 fs/btrfs/volumes.c:4278 btrfs_balance+0xbdc/0x10c0 fs/btrfs/volumes.c:4655 btrfs_ioctl_balance+0x493/0x7c0 fs/btrfs/ioctl.c:3670 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:906 [inline] __se_sys_ioctl+0xf5/0x170 fs/ioctl.c:892 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f1ac6985d29 Code: ff ff c3 (...) RSP: 002b:00007f1ac63fe038 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007f1ac6b76160 RCX: 00007f1ac6985d29 RDX: 0000000020000180 RSI: 00000000c4009420 RDI: 0000000000000007 RBP: 00007f1ac6a01b08 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000001 R14: 00007f1ac6b76160 R15: 00007fffda145a88 Reported-by: syzbot+63913e558c084f7f8fdc@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/677b3014.050a0220.3b53b0.0064.GAE@google.com/ Fixes: 99785998ed1c ("btrfs: reduce lock contention when eb cache miss for btree search") Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 92071ca0655f0..3dc5a35dd19b3 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1496,6 +1496,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (!p->skip_locking) { btrfs_unlock_up_safe(p, parent_level + 1); + btrfs_maybe_reset_lockdep_class(root, tmp); tmp_locked = true; btrfs_tree_read_lock(tmp); btrfs_release_path(p); @@ -1539,6 +1540,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (!p->skip_locking) { ASSERT(ret == -EAGAIN); + btrfs_maybe_reset_lockdep_class(root, tmp); tmp_locked = true; btrfs_tree_read_lock(tmp); btrfs_release_path(p); -- GitLab From 0d85f5c2dd91df6b5da454406756f463ba923b69 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 13 Jan 2025 15:01:08 +0000 Subject: [PATCH 023/989] btrfs: fix assertion failure when splitting ordered extent after transaction abort If while we are doing a direct IO write a transaction abort happens, we mark all existing ordered extents with the BTRFS_ORDERED_IOERR flag (done at btrfs_destroy_ordered_extents()), and then after that if we enter btrfs_split_ordered_extent() and the ordered extent has bytes left (meaning we have a bio that doesn't cover the whole ordered extent, see details at btrfs_extract_ordered_extent()), we will fail on the following assertion at btrfs_split_ordered_extent(): ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS)); because the BTRFS_ORDERED_IOERR flag is set and the definition of BTRFS_ORDERED_TYPE_FLAGS is just the union of all flags that identify the type of write (regular, nocow, prealloc, compressed, direct IO, encoded). 
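To make the failure mode concrete, the check that trips can be reduced to roughly the following (an illustrative sketch built from the names quoted above, not the exact kernel code; the access to the ordered extent's flags word is simplified):

    /* Suppose a transaction abort marked the ordered extent with IOERR;
     * BTRFS_ORDERED_TYPE_FLAGS only covers the write-type bits listed above.
     */
    unsigned long flags = ordered->flags | (1U << BTRFS_ORDERED_IOERR);

    /* The extra IOERR bit falls outside the type-only mask, so this fires: */
    ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS));
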
Fix this by returning an error from btrfs_extract_ordered_extent() if we find the BTRFS_ORDERED_IOERR flag in the ordered extent. The error will be the error that resulted in the transaction abort or -EIO if no transaction abort happened. This was recently reported by syzbot with the following trace: FAULT_INJECTION: forcing a failure. name failslab, interval 1, probability 0, space 0, times 1 CPU: 0 UID: 0 PID: 5321 Comm: syz.0.0 Not tainted 6.13.0-rc5-syzkaller #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 fail_dump lib/fault-inject.c:53 [inline] should_fail_ex+0x3b0/0x4e0 lib/fault-inject.c:154 should_failslab+0xac/0x100 mm/failslab.c:46 slab_pre_alloc_hook mm/slub.c:4072 [inline] slab_alloc_node mm/slub.c:4148 [inline] __do_kmalloc_node mm/slub.c:4297 [inline] __kmalloc_noprof+0xdd/0x4c0 mm/slub.c:4310 kmalloc_noprof include/linux/slab.h:905 [inline] kzalloc_noprof include/linux/slab.h:1037 [inline] btrfs_chunk_alloc_add_chunk_item+0x244/0x1100 fs/btrfs/volumes.c:5742 reserve_chunk_space+0x1ca/0x2c0 fs/btrfs/block-group.c:4292 check_system_chunk fs/btrfs/block-group.c:4319 [inline] do_chunk_alloc fs/btrfs/block-group.c:3891 [inline] btrfs_chunk_alloc+0x77b/0xf80 fs/btrfs/block-group.c:4187 find_free_extent_update_loop fs/btrfs/extent-tree.c:4166 [inline] find_free_extent+0x42d1/0x5810 fs/btrfs/extent-tree.c:4579 btrfs_reserve_extent+0x422/0x810 fs/btrfs/extent-tree.c:4672 btrfs_new_extent_direct fs/btrfs/direct-io.c:186 [inline] btrfs_get_blocks_direct_write+0x706/0xfa0 fs/btrfs/direct-io.c:321 btrfs_dio_iomap_begin+0xbb7/0x1180 fs/btrfs/direct-io.c:525 iomap_iter+0x697/0xf60 fs/iomap/iter.c:90 __iomap_dio_rw+0xeb9/0x25b0 fs/iomap/direct-io.c:702 btrfs_dio_write fs/btrfs/direct-io.c:775 [inline] btrfs_direct_write+0x610/0xa30 fs/btrfs/direct-io.c:880 btrfs_do_write_iter+0x2a0/0x760 fs/btrfs/file.c:1397 do_iter_readv_writev+0x600/0x880 vfs_writev+0x376/0xba0 fs/read_write.c:1050 do_pwritev fs/read_write.c:1146 [inline] __do_sys_pwritev2 fs/read_write.c:1204 [inline] __se_sys_pwritev2+0x196/0x2b0 fs/read_write.c:1195 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f1281f85d29 RSP: 002b:00007f12819fe038 EFLAGS: 00000246 ORIG_RAX: 0000000000000148 RAX: ffffffffffffffda RBX: 00007f1282176080 RCX: 00007f1281f85d29 RDX: 0000000000000001 RSI: 0000000020000240 RDI: 0000000000000005 RBP: 00007f12819fe090 R08: 0000000000000000 R09: 0000000000000003 R10: 0000000000007000 R11: 0000000000000246 R12: 0000000000000002 R13: 0000000000000000 R14: 00007f1282176080 R15: 00007ffcb9e23328 BTRFS error (device loop0 state A): Transaction aborted (error -12) BTRFS: error (device loop0 state A) in btrfs_chunk_alloc_add_chunk_item:5745: errno=-12 Out of memory BTRFS info (device loop0 state EA): forced readonly assertion failed: !(flags & ~BTRFS_ORDERED_TYPE_FLAGS), in fs/btrfs/ordered-data.c:1234 ------------[ cut here ]------------ kernel BUG at fs/btrfs/ordered-data.c:1234! 
Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI CPU: 0 UID: 0 PID: 5321 Comm: syz.0.0 Not tainted 6.13.0-rc5-syzkaller #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 RIP: 0010:btrfs_split_ordered_extent+0xd8d/0xe20 fs/btrfs/ordered-data.c:1234 RSP: 0018:ffffc9000d1df2b8 EFLAGS: 00010246 RAX: 0000000000000057 RBX: 000000000006a000 RCX: 9ce21886c4195300 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: 0000000000000091 R08: ffffffff817f0a3c R09: 1ffff92001a3bdf4 R10: dffffc0000000000 R11: fffff52001a3bdf5 R12: 1ffff1100a45f401 R13: ffff8880522fa018 R14: dffffc0000000000 R15: 000000000006a000 FS: 00007f12819fe6c0(0000) GS:ffff88801fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000557750bd7da8 CR3: 00000000400ea000 CR4: 0000000000352ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: btrfs_extract_ordered_extent fs/btrfs/direct-io.c:702 [inline] btrfs_dio_submit_io+0x4be/0x6d0 fs/btrfs/direct-io.c:737 iomap_dio_submit_bio fs/iomap/direct-io.c:85 [inline] iomap_dio_bio_iter+0x1022/0x1740 fs/iomap/direct-io.c:447 __iomap_dio_rw+0x13b7/0x25b0 fs/iomap/direct-io.c:703 btrfs_dio_write fs/btrfs/direct-io.c:775 [inline] btrfs_direct_write+0x610/0xa30 fs/btrfs/direct-io.c:880 btrfs_do_write_iter+0x2a0/0x760 fs/btrfs/file.c:1397 do_iter_readv_writev+0x600/0x880 vfs_writev+0x376/0xba0 fs/read_write.c:1050 do_pwritev fs/read_write.c:1146 [inline] __do_sys_pwritev2 fs/read_write.c:1204 [inline] __se_sys_pwritev2+0x196/0x2b0 fs/read_write.c:1195 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f1281f85d29 RSP: 002b:00007f12819fe038 EFLAGS: 00000246 ORIG_RAX: 0000000000000148 RAX: ffffffffffffffda RBX: 00007f1282176080 RCX: 00007f1281f85d29 RDX: 0000000000000001 RSI: 0000000020000240 RDI: 0000000000000005 RBP: 00007f12819fe090 R08: 0000000000000000 R09: 0000000000000003 R10: 0000000000007000 R11: 0000000000000246 R12: 0000000000000002 R13: 0000000000000000 R14: 00007f1282176080 R15: 00007ffcb9e23328 Modules linked in: ---[ end trace 0000000000000000 ]--- RIP: 0010:btrfs_split_ordered_extent+0xd8d/0xe20 fs/btrfs/ordered-data.c:1234 RSP: 0018:ffffc9000d1df2b8 EFLAGS: 00010246 RAX: 0000000000000057 RBX: 000000000006a000 RCX: 9ce21886c4195300 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: 0000000000000091 R08: ffffffff817f0a3c R09: 1ffff92001a3bdf4 R10: dffffc0000000000 R11: fffff52001a3bdf5 R12: 1ffff1100a45f401 R13: ffff8880522fa018 R14: dffffc0000000000 R15: 000000000006a000 FS: 00007f12819fe6c0(0000) GS:ffff88801fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000557750bd7da8 CR3: 00000000400ea000 CR4: 0000000000352ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 In this case the transaction abort was due to (an injected) memory allocation failure when attempting to allocate a new chunk. 
Reported-by: syzbot+f60d8337a5c8e8d92a77@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/6777f2dd.050a0220.178762.0045.GAE@google.com/ Fixes: 52b1fdca23ac ("btrfs: handle completed ordered extents in btrfs_split_ordered_extent") Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 30eceaf829a7e..4aca7475fd82c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1229,6 +1229,18 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( */ if (WARN_ON_ONCE(len >= ordered->num_bytes)) return ERR_PTR(-EINVAL); + /* + * If our ordered extent had an error there's no point in continuing. + * The error may have come from a transaction abort done either by this + * task or some other concurrent task, and the transaction abort path + * iterates over all existing ordered extents and sets the flag + * BTRFS_ORDERED_IOERR on them. + */ + if (unlikely(flags & (1U << BTRFS_ORDERED_IOERR))) { + const int fs_error = BTRFS_FS_ERROR(fs_info); + + return fs_error ? ERR_PTR(fs_error) : ERR_PTR(-EIO); + } /* We cannot split partially completed ordered extents. */ if (ordered->bytes_left) { ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS)); -- GitLab From c9c863793395cf0a66c2778a29d72c48c02fbb66 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 20 Jan 2025 09:40:43 +1030 Subject: [PATCH 024/989] btrfs: do not output error message if a qgroup has been already cleaned up [BUG] There is a bug report that btrfs outputs the following error message: BTRFS info (device nvme0n1p2): qgroup scan completed (inconsistency flag cleared) BTRFS warning (device nvme0n1p2): failed to cleanup qgroup 0/1179: -2 [CAUSE] The error itself is pretty harmless, and the end user should ignore it. When a subvolume is fully dropped, btrfs will call btrfs_qgroup_cleanup_dropped_subvolume() to delete the qgroup. However if a qgroup rescan happened before a subvolume fully dropped, qgroup for that subvolume will not be re-created, as rescan will only create new qgroup if there is a BTRFS_ROOT_REF_KEY found. But before we drop a subvolume, the subvolume is unlinked thus there is no BTRFS_ROOT_REF_KEY. In that case, btrfs_remove_qgroup() will fail with -ENOENT and trigger the above error message. [FIX] Just ignore -ENOENT error from btrfs_remove_qgroup() inside btrfs_qgroup_cleanup_dropped_subvolume(). Reported-by: John Shand Link: https://bugzilla.suse.com/show_bug.cgi?id=1236056 Fixes: 839d6ea4f86d ("btrfs: automatically remove the subvolume qgroup") Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index b90fabe302e61..aaf16019d829a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1897,8 +1897,11 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su /* * It's squota and the subvolume still has numbers needed for future * accounting, in this case we can not delete it. Just skip it. + * + * Or the qgroup is already removed by a qgroup rescan. For both cases we're + * safe to ignore them. 
*/ - if (ret == -EBUSY) + if (ret == -EBUSY || ret == -ENOENT) ret = 0; return ret; } -- GitLab From e2f0943cf37305dbdeaf9846e3c941451bcdef63 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 20 Jan 2025 17:26:10 +0000 Subject: [PATCH 025/989] btrfs: fix use-after-free when attempting to join an aborted transaction When we are trying to join the current transaction and if it's aborted, we read its 'aborted' field after unlocking fs_info->trans_lock and without holding any extra reference count on it. This means that a concurrent task that is aborting the transaction may free the transaction before we read its 'aborted' field, leading to a use-after-free. Fix this by reading the 'aborted' field while holding fs_info->trans_lock since any freeing task must first acquire that lock and set fs_info->running_transaction to NULL before freeing the transaction. This was reported by syzbot and Dmitry with the following stack traces from KASAN: ================================================================== BUG: KASAN: slab-use-after-free in join_transaction+0xd9b/0xda0 fs/btrfs/transaction.c:278 Read of size 4 at addr ffff888011839024 by task kworker/u4:9/1128 CPU: 0 UID: 0 PID: 1128 Comm: kworker/u4:9 Not tainted 6.13.0-rc7-syzkaller-00019-gc45323b7560e #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 Workqueue: events_unbound btrfs_async_reclaim_data_space Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0x169/0x550 mm/kasan/report.c:489 kasan_report+0x143/0x180 mm/kasan/report.c:602 join_transaction+0xd9b/0xda0 fs/btrfs/transaction.c:278 start_transaction+0xaf8/0x1670 fs/btrfs/transaction.c:697 flush_space+0x448/0xcf0 fs/btrfs/space-info.c:803 btrfs_async_reclaim_data_space+0x159/0x510 fs/btrfs/space-info.c:1321 process_one_work kernel/workqueue.c:3236 [inline] process_scheduled_works+0xa66/0x1840 kernel/workqueue.c:3317 worker_thread+0x870/0xd30 kernel/workqueue.c:3398 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Allocated by task 5315: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x98/0xb0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __kmalloc_cache_noprof+0x243/0x390 mm/slub.c:4329 kmalloc_noprof include/linux/slab.h:901 [inline] join_transaction+0x144/0xda0 fs/btrfs/transaction.c:308 start_transaction+0xaf8/0x1670 fs/btrfs/transaction.c:697 btrfs_create_common+0x1b2/0x2e0 fs/btrfs/inode.c:6572 lookup_open fs/namei.c:3649 [inline] open_last_lookups fs/namei.c:3748 [inline] path_openat+0x1c03/0x3590 fs/namei.c:3984 do_filp_open+0x27f/0x4e0 fs/namei.c:4014 do_sys_openat2+0x13e/0x1d0 fs/open.c:1402 do_sys_open fs/open.c:1417 [inline] __do_sys_creat fs/open.c:1495 [inline] __se_sys_creat fs/open.c:1489 [inline] __x64_sys_creat+0x123/0x170 fs/open.c:1489 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 5336: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x40/0x50 mm/kasan/generic.c:582 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x59/0x70 mm/kasan/common.c:264 
kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2353 [inline] slab_free mm/slub.c:4613 [inline] kfree+0x196/0x430 mm/slub.c:4761 cleanup_transaction fs/btrfs/transaction.c:2063 [inline] btrfs_commit_transaction+0x2c97/0x3720 fs/btrfs/transaction.c:2598 insert_balance_item+0x1284/0x20b0 fs/btrfs/volumes.c:3757 btrfs_balance+0x992/0x10c0 fs/btrfs/volumes.c:4633 btrfs_ioctl_balance+0x493/0x7c0 fs/btrfs/ioctl.c:3670 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:906 [inline] __se_sys_ioctl+0xf5/0x170 fs/ioctl.c:892 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f The buggy address belongs to the object at ffff888011839000 which belongs to the cache kmalloc-2k of size 2048 The buggy address is located 36 bytes inside of freed 2048-byte region [ffff888011839000, ffff888011839800) The buggy address belongs to the physical page: page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x11838 head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 flags: 0xfff00000000040(head|node=0|zone=1|lastcpupid=0x7ff) page_type: f5(slab) raw: 00fff00000000040 ffff88801ac42000 ffffea0000493400 dead000000000002 raw: 0000000000000000 0000000000080008 00000001f5000000 0000000000000000 head: 00fff00000000040 ffff88801ac42000 ffffea0000493400 dead000000000002 head: 0000000000000000 0000000000080008 00000001f5000000 0000000000000000 head: 00fff00000000003 ffffea0000460e01 ffffffffffffffff 0000000000000000 head: 0000000000000008 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 3, migratetype Unmovable, gfp_mask 0xd20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC), pid 57, tgid 57 (kworker/0:2), ts 67248182943, free_ts 67229742023 set_page_owner include/linux/page_owner.h:32 [inline] post_alloc_hook+0x1f3/0x230 mm/page_alloc.c:1558 prep_new_page mm/page_alloc.c:1566 [inline] get_page_from_freelist+0x365c/0x37a0 mm/page_alloc.c:3476 __alloc_pages_noprof+0x292/0x710 mm/page_alloc.c:4753 alloc_pages_mpol_noprof+0x3e1/0x780 mm/mempolicy.c:2269 alloc_slab_page+0x6a/0x110 mm/slub.c:2423 allocate_slab+0x5a/0x2b0 mm/slub.c:2589 new_slab mm/slub.c:2642 [inline] ___slab_alloc+0xc27/0x14a0 mm/slub.c:3830 __slab_alloc+0x58/0xa0 mm/slub.c:3920 __slab_alloc_node mm/slub.c:3995 [inline] slab_alloc_node mm/slub.c:4156 [inline] __do_kmalloc_node mm/slub.c:4297 [inline] __kmalloc_node_track_caller_noprof+0x2e9/0x4c0 mm/slub.c:4317 kmalloc_reserve+0x111/0x2a0 net/core/skbuff.c:609 __alloc_skb+0x1f3/0x440 net/core/skbuff.c:678 alloc_skb include/linux/skbuff.h:1323 [inline] alloc_skb_with_frags+0xc3/0x820 net/core/skbuff.c:6612 sock_alloc_send_pskb+0x91a/0xa60 net/core/sock.c:2884 sock_alloc_send_skb include/net/sock.h:1803 [inline] mld_newpack+0x1c3/0xaf0 net/ipv6/mcast.c:1747 add_grhead net/ipv6/mcast.c:1850 [inline] add_grec+0x1492/0x19a0 net/ipv6/mcast.c:1988 mld_send_cr net/ipv6/mcast.c:2114 [inline] mld_ifc_work+0x691/0xd90 net/ipv6/mcast.c:2651 page last free pid 5300 tgid 5300 stack trace: reset_page_owner include/linux/page_owner.h:25 [inline] free_pages_prepare mm/page_alloc.c:1127 [inline] free_unref_page+0xd3f/0x1010 mm/page_alloc.c:2659 __slab_free+0x2c2/0x380 mm/slub.c:4524 qlink_free mm/kasan/quarantine.c:163 [inline] qlist_free_all+0x9a/0x140 mm/kasan/quarantine.c:179 kasan_quarantine_reduce+0x14f/0x170 
mm/kasan/quarantine.c:286 __kasan_slab_alloc+0x23/0x80 mm/kasan/common.c:329 kasan_slab_alloc include/linux/kasan.h:250 [inline] slab_post_alloc_hook mm/slub.c:4119 [inline] slab_alloc_node mm/slub.c:4168 [inline] __do_kmalloc_node mm/slub.c:4297 [inline] __kmalloc_noprof+0x236/0x4c0 mm/slub.c:4310 kmalloc_noprof include/linux/slab.h:905 [inline] kzalloc_noprof include/linux/slab.h:1037 [inline] fib_create_info+0xc14/0x25b0 net/ipv4/fib_semantics.c:1435 fib_table_insert+0x1f6/0x1f20 net/ipv4/fib_trie.c:1231 fib_magic+0x3d8/0x620 net/ipv4/fib_frontend.c:1112 fib_add_ifaddr+0x40c/0x5e0 net/ipv4/fib_frontend.c:1156 fib_netdev_event+0x375/0x490 net/ipv4/fib_frontend.c:1494 notifier_call_chain+0x1a5/0x3f0 kernel/notifier.c:85 __dev_notify_flags+0x207/0x400 dev_change_flags+0xf0/0x1a0 net/core/dev.c:9045 do_setlink+0xc90/0x4210 net/core/rtnetlink.c:3109 rtnl_changelink net/core/rtnetlink.c:3723 [inline] __rtnl_newlink net/core/rtnetlink.c:3875 [inline] rtnl_newlink+0x1bb6/0x2210 net/core/rtnetlink.c:4012 Memory state around the buggy address: ffff888011838f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff888011838f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff888011839000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888011839080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888011839100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== Reported-by: syzbot+45212e9d87a98c3f5b42@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/678e7da5.050a0220.303755.007c.GAE@google.com/ Reported-by: Dmitry Vyukov Link: https://lore.kernel.org/linux-btrfs/CACT4Y+ZFBdo7pT8L2AzM=vegZwjp-wNkVJZQf0Ta3vZqtExaSw@mail.gmail.com/ Fixes: 871383be592b ("btrfs: add missing unlocks to transaction abort paths") Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 15312013f2a34..aca83a98b75a2 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -274,8 +274,10 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info, cur_trans = fs_info->running_transaction; if (cur_trans) { if (TRANS_ABORTED(cur_trans)) { + const int abort_error = cur_trans->aborted; + spin_unlock(&fs_info->trans_lock); - return cur_trans->aborted; + return abort_error; } if (btrfs_blocked_trans_types[cur_trans->state] & type) { spin_unlock(&fs_info->trans_lock); -- GitLab From fdef89ce6fada462aef9cb90a140c93c8c209f0f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 21 Jan 2025 12:24:39 +0000 Subject: [PATCH 026/989] btrfs: avoid starting new transaction when cleaning qgroup during subvolume drop At btrfs_qgroup_cleanup_dropped_subvolume() all we want to commit the current transaction in order to have all the qgroup rfer/excl numbers up to date. However we are using btrfs_start_transaction(), which joins the current transaction if there is one that is not yet committing, but also starts a new one if there is none or if the current one is already committing (its state is >= TRANS_STATE_COMMIT_START). This later case results in unnecessary IO, wasting time and a pointless rotation of the backup roots in the super block. 
So instead of using btrfs_start_transaction() followed by a btrfs_commit_transaction(), use btrfs_commit_current_transaction() which achieves our purpose and avoids starting and committing new transactions. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index aaf16019d829a..f9d3766c809b4 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1880,11 +1880,7 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su * Commit current transaction to make sure all the rfer/excl numbers * get updated. */ - trans = btrfs_start_transaction(fs_info->quota_root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = btrfs_commit_transaction(trans); + ret = btrfs_commit_current_transaction(fs_info->quota_root); if (ret < 0) return ret; -- GitLab From 5e0e02f0d7e52cfc8b1adfc778dd02181d8b47b4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 15 Jan 2025 09:05:15 -0700 Subject: [PATCH 027/989] futex: Pass in task to futex_queue() futex_queue() -> __futex_queue() uses 'current' as the task to store in the struct futex_q->task field. This is fine for synchronous usage of the futex infrastructure, but it's not always correct when used by io_uring where the task doing the initial futex_queue() might not be available later on. This doesn't lead to any issues currently, as the io_uring side doesn't support PI futexes, but it does leave a potentially dangling pointer which is never a good idea. Have futex_queue() take a task_struct argument, and have the regular callers pass in 'current' for that. Meanwhile io_uring can just pass in NULL, as the task should never be used off that path. In theory req->tctx->task could be used here, but there's no point populating it with a task field that will never be used anyway. 
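For illustration, the resulting call pattern (a sketch taken from the hunks below) is that synchronous waiters keep storing the current task while the async io_uring path stores no task at all:

    /* synchronous wait path (kernel/futex/waitwake.c) */
    futex_queue(q, hb, current);

    /* async io_uring path (io_uring/futex.c): no task is recorded */
    futex_queue(&ifd->q, hb, NULL);
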
Reported-by: Jann Horn Signed-off-by: Jens Axboe Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/22484a23-542c-4003-b721-400688a0d055@kernel.dk --- io_uring/futex.c | 2 +- kernel/futex/core.c | 5 +++-- kernel/futex/futex.h | 11 ++++++++--- kernel/futex/pi.c | 2 +- kernel/futex/waitwake.c | 4 ++-- 5 files changed, 15 insertions(+), 9 deletions(-) diff --git a/io_uring/futex.c b/io_uring/futex.c index 30139cc150f22..e5cc208810ad5 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -338,7 +338,7 @@ int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags) hlist_add_head(&req->hash_node, &ctx->futex_list); io_ring_submit_unlock(ctx, issue_flags); - futex_queue(&ifd->q, hb); + futex_queue(&ifd->q, hb, NULL); return IOU_ISSUE_SKIP_COMPLETE; } diff --git a/kernel/futex/core.c b/kernel/futex/core.c index ebdd76b4ecbba..3db8567f5a44e 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -532,7 +532,8 @@ void futex_q_unlock(struct futex_hash_bucket *hb) futex_hb_waiters_dec(hb); } -void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) +void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, + struct task_struct *task) { int prio; @@ -548,7 +549,7 @@ void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) plist_node_init(&q->list, prio); plist_add(&q->list, &hb->chain); - q->task = current; + q->task = task; } /** diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index 99b32e728c4ad..6b2f4c7eb720f 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -285,13 +285,15 @@ static inline int futex_get_value_locked(u32 *dest, u32 __user *from) } extern void __futex_unqueue(struct futex_q *q); -extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb); +extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, + struct task_struct *task); extern int futex_unqueue(struct futex_q *q); /** * futex_queue() - Enqueue the futex_q on the futex_hash_bucket * @q: The futex_q to enqueue * @hb: The destination hash bucket + * @task: Task queueing this futex * * The hb->lock must be held by the caller, and is released here. A call to * futex_queue() is typically paired with exactly one call to futex_unqueue(). The @@ -299,11 +301,14 @@ extern int futex_unqueue(struct futex_q *q); * or nothing if the unqueue is done as part of the wake process and the unqueue * state is implicit in the state of woken task (see futex_wait_requeue_pi() for * an example). + * + * Note that @task may be NULL, for async usage of futexes. 
*/ -static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) +static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, + struct task_struct *task) __releases(&hb->lock) { - __futex_queue(q, hb); + __futex_queue(q, hb, task); spin_unlock(&hb->lock); } diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c index daea650b16f51..7a941845f7eee 100644 --- a/kernel/futex/pi.c +++ b/kernel/futex/pi.c @@ -982,7 +982,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl /* * Only actually queue now that the atomic ops are done: */ - __futex_queue(&q, hb); + __futex_queue(&q, hb, current); if (trylock) { ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index 3a10375d95218..a9056acb75eef 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -350,7 +350,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, * access to the hash list and forcing another memory barrier. */ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); - futex_queue(q, hb); + futex_queue(q, hb, current); /* Arm the timer */ if (timeout) @@ -461,7 +461,7 @@ int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) * next futex. Queue each futex at this moment so hb can * be unlocked. */ - futex_queue(q, hb); + futex_queue(q, hb, current); continue; } -- GitLab From 915175b49f65d9edeb81659e82cbb27b621dbc17 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Wed, 15 Jan 2025 20:35:25 +0800 Subject: [PATCH 028/989] xfs: fix the entry condition of exact EOF block allocation optimization When we call create(), lseek() and write() sequentially, offset != 0 cannot be used as a judgment condition for whether the file already has extents. Furthermore, when xfs_bmap_adjacent() has not given a better blkno, it is not necessary to use exact EOF block allocation. Suggested-by: Dave Chinner Signed-off-by: Jinliang Zheng Reviewed-by: Dave Chinner Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_bmap.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 40ad22fb808b9..0ef19f1469ec9 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3563,12 +3563,12 @@ xfs_bmap_btalloc_at_eof( int error; /* - * If there are already extents in the file, try an exact EOF block - * allocation to extend the file as a contiguous extent. If that fails, - * or it's the first allocation in a file, just try for a stripe aligned - * allocation. + * If there are already extents in the file, and xfs_bmap_adjacent() has + * given a better blkno, try an exact EOF block allocation to extend the + * file as a contiguous extent. If that fails, or it's the first + * allocation in a file, just try for a stripe aligned allocation. */ - if (ap->offset) { + if (ap->eof) { xfs_extlen_t nextminlen = 0; /* @@ -3736,7 +3736,8 @@ xfs_bmap_btalloc_best_length( int error; ap->blkno = XFS_INO_TO_FSB(args->mp, ap->ip->i_ino); - xfs_bmap_adjacent(ap); + if (!xfs_bmap_adjacent(ap)) + ap->eof = false; /* * Search for an allocation group with a single extent large enough for -- GitLab From 89841b23809f5fb12cbead142204064739fef25a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Jan 2025 07:03:35 +0100 Subject: [PATCH 029/989] xfs: remove an out of data comment in _xfs_buf_alloc There hasn't been anything like an io_length for a long time. 
Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_buf.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 7fbdd4b30676c..f1252ed8bd0a7 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -232,11 +232,6 @@ _xfs_buf_alloc( bp->b_mount = target->bt_mount; bp->b_flags = flags; - /* - * Set length and io_length to the same value initially. - * I/O routines should use io_length, which will be the same in - * most cases but may be reset (e.g. XFS recovery). - */ error = xfs_buf_get_maps(bp, nmaps); if (error) { kmem_cache_free(xfs_buf_cache, bp); -- GitLab From f5f0ed89f13e3e5246404a322ee85169a226bfb5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 22 Jan 2025 06:43:21 +0100 Subject: [PATCH 030/989] xfs: don't call remap_verify_area with sb write protection held The XFS_IOC_EXCHANGE_RANGE ioctl with the XFS_EXCHANGE_RANGE_TO_EOF flag operates on a range bounded by the end of the file. This means the actual amount of blocks exchanged is derived from the inode size, which is only stable with the IOLOCK (i_rwsem) held. Do that, it currently calls remap_verify_area from inside the sb write protection which nests outside the IOLOCK. But this makes fsnotify_file_area_perm which is called from remap_verify_area unhappy when the kernel is built with lockdep and the recently added CONFIG_FANOTIFY_ACCESS_PERMISSIONS option. Fix this by always calling remap_verify_area before taking the write protection, and passing a 0 size to remap_verify_area similar to the FICLONE/FICLONERANGE ioctls when they are asked to clone until the file end. (Note: the size argument gets passed to fsnotify_file_area_perm, but then isn't actually used there). Fixes: 9a64d9b3109d ("xfs: introduce new file range exchange ioctl") Cc: # v6.10 Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_exchrange.c | 71 ++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 44 deletions(-) diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c index f340a2015c4c7..0b41bdfecdfbc 100644 --- a/fs/xfs/xfs_exchrange.c +++ b/fs/xfs/xfs_exchrange.c @@ -329,22 +329,6 @@ xfs_exchrange_mappings( * successfully but before locks are dropped. */ -/* Verify that we have security clearance to perform this operation. */ -static int -xfs_exchange_range_verify_area( - struct xfs_exchrange *fxr) -{ - int ret; - - ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length, - true); - if (ret) - return ret; - - return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length, - true); -} - /* * Performs necessary checks before doing a range exchange, having stabilized * mutable inode attributes via i_rwsem. @@ -355,11 +339,13 @@ xfs_exchange_range_checks( unsigned int alloc_unit) { struct inode *inode1 = file_inode(fxr->file1); + loff_t size1 = i_size_read(inode1); struct inode *inode2 = file_inode(fxr->file2); + loff_t size2 = i_size_read(inode2); uint64_t allocmask = alloc_unit - 1; int64_t test_len; uint64_t blen; - loff_t size1, size2, tmp; + loff_t tmp; int error; /* Don't touch certain kinds of inodes */ @@ -368,24 +354,25 @@ xfs_exchange_range_checks( if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2)) return -ETXTBSY; - size1 = i_size_read(inode1); - size2 = i_size_read(inode2); - /* Ranges cannot start after EOF. 
*/ if (fxr->file1_offset > size1 || fxr->file2_offset > size2) return -EINVAL; - /* - * If the caller said to exchange to EOF, we set the length of the - * request large enough to cover everything to the end of both files. - */ if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) { + /* + * If the caller said to exchange to EOF, we set the length of + * the request large enough to cover everything to the end of + * both files. + */ fxr->length = max_t(int64_t, size1 - fxr->file1_offset, size2 - fxr->file2_offset); - - error = xfs_exchange_range_verify_area(fxr); - if (error) - return error; + } else { + /* + * Otherwise we require both ranges to end within EOF. + */ + if (fxr->file1_offset + fxr->length > size1 || + fxr->file2_offset + fxr->length > size2) + return -EINVAL; } /* @@ -401,15 +388,6 @@ xfs_exchange_range_checks( check_add_overflow(fxr->file2_offset, fxr->length, &tmp)) return -EINVAL; - /* - * We require both ranges to end within EOF, unless we're exchanging - * to EOF. - */ - if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) && - (fxr->file1_offset + fxr->length > size1 || - fxr->file2_offset + fxr->length > size2)) - return -EINVAL; - /* * Make sure we don't hit any file size limits. If we hit any size * limits such that test_length was adjusted, we abort the whole @@ -747,6 +725,7 @@ xfs_exchange_range( { struct inode *inode1 = file_inode(fxr->file1); struct inode *inode2 = file_inode(fxr->file2); + loff_t check_len = fxr->length; int ret; BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS & @@ -779,14 +758,18 @@ xfs_exchange_range( return -EBADF; /* - * If we're not exchanging to EOF, we can check the areas before - * stabilizing both files' i_size. + * If we're exchanging to EOF we can't calculate the length until taking + * the iolock. Pass a 0 length to remap_verify_area similar to the + * FICLONE and FICLONERANGE ioctls that support cloning to EOF as well. */ - if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) { - ret = xfs_exchange_range_verify_area(fxr); - if (ret) - return ret; - } + if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) + check_len = 0; + ret = remap_verify_area(fxr->file1, fxr->file1_offset, check_len, true); + if (ret) + return ret; + ret = remap_verify_area(fxr->file2, fxr->file2_offset, check_len, true); + if (ret) + return ret; /* Update cmtime if the fd/inode don't forbid it. */ if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1)) -- GitLab From 58f5c8d5ca07a2f9fa93fb073f5b1646ec482ff2 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Fri, 24 Jan 2025 13:00:33 +0200 Subject: [PATCH 031/989] nvmet: fix a memory leak in controller identify Simply free an allocated buffer once we copied its content to the request sgl. kmemleak complaint: unreferenced object 0xffff8cd40c388000 (size 4096): comm "kworker/2:2H", pid 14739, jiffies 4401313113 hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 
backtrace (crc 0): [] kmemleak_alloc+0x4a/0x90 [] __kmalloc_cache_noprof+0x35a/0x420 [] nvmet_execute_identify+0x912/0x9f0 [nvmet] [] nvmet_tcp_try_recv_pdu+0x84c/0xc90 [nvmet_tcp] [] nvmet_tcp_io_work+0x82/0x8b0 [nvmet_tcp] [] process_one_work+0x178/0x3e0 [] worker_thread+0x2ec/0x420 [] kthread+0xf0/0x120 [] ret_from_fork+0x44/0x70 [] ret_from_fork_asm+0x1a/0x30 Fixes: 84909f7decbd ("nvmet: use kzalloc instead of ZERO_PAGE in nvme_execute_identify_ns_nvm()") Signed-off-by: Sagi Grimberg Reviewed-by: Nilay Shroff Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 3ddd8e44e148c..ec7f70be6daa7 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -1067,6 +1067,7 @@ static void nvme_execute_identify_ns_nvm(struct nvmet_req *req) goto out; } status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + kfree(id); out: nvmet_req_complete(req, status); } -- GitLab From be8ee18152b0523752f3a44900363838bd1573bb Mon Sep 17 00:00:00 2001 From: Atul Kumar Pant Date: Sat, 18 Jan 2025 14:09:27 +0530 Subject: [PATCH 032/989] sched_ext: Fixes typos in comments Fixes some spelling errors in the comments. Signed-off-by: Atul Kumar Pant Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 8857c0709bdde..283d7f1addc50 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -416,7 +416,7 @@ struct sched_ext_ops { /** * @update_idle: Update the idle state of a CPU - * @cpu: CPU to udpate the idle state for + * @cpu: CPU to update the idle state for * @idle: whether entering or exiting the idle state * * This operation is called when @rq's CPU goes or leaves the idle @@ -1214,7 +1214,7 @@ static bool scx_kf_allowed_if_unlocked(void) /** * nldsq_next_task - Iterate to the next task in a non-local DSQ - * @dsq: user dsq being interated + * @dsq: user dsq being iterated * @cur: current position, %NULL to start iteration * @rev: walk backwards * @@ -2078,7 +2078,7 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p) /* * list_add_tail() must be used. scx_ops_bypass() depends on tasks being - * appened to the runnable_list. + * appended to the runnable_list. */ list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); } @@ -2480,7 +2480,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, /* * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly * banging on the same DSQ on a large NUMA system to the point where switching - * to the bypass mode can take a long time. Inject artifical delays while the + * to the bypass mode can take a long time. Inject artificial delays while the * bypass mode is switching to guarantee timely completion. */ static void scx_ops_breather(struct rq *rq) @@ -3144,7 +3144,7 @@ static struct task_struct *pick_task_scx(struct rq *rq) * * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used * to implement the default task ordering. The older the timestamp, the higher - * prority the task - the global FIFO ordering matching the default scheduling + * priority the task - the global FIFO ordering matching the default scheduling * behavior. 
* * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to @@ -4590,7 +4590,7 @@ static int scx_cgroup_init(void) cgroup_warned_missing_idle = false; /* - * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk + * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk * cgroups and init, all online cgroups are initialized. */ rcu_read_lock(); -- GitLab From 2279563e3a8cac367b267b09c15cf1e39c06c5cc Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 22 Jan 2025 10:05:25 +0100 Subject: [PATCH 033/989] sched_ext: Include task weight in the error state dump Report the task weight when dumping the task state during an error exit. Moreover, adjust the output format to display dsq_vtime, slice, and weight on the same line. This can help identify whether certain tasks were excessively prioritized or de-prioritized due to large niceness gaps. Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 283d7f1addc50..7081c7be5f622 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5277,9 +5277,10 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK, ops_state >> SCX_OPSS_QSEQ_SHIFT); - dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu slice=%llu", - p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf, - p->scx.dsq_vtime, p->scx.slice); + dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", + p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); + dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u", + p->scx.dsq_vtime, p->scx.slice, p->scx.weight); dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); if (SCX_HAS_OP(dump_task)) { -- GitLab From 74ca334338a4489173d9e50775b13fa20cbd5958 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Thu, 23 Jan 2025 13:46:06 +0100 Subject: [PATCH 034/989] selftests/sched_ext: Fix enum resolution All scx enums are now automatically generated from vmlinux.h and they must be initialized using the SCX_ENUM_INIT() macro. Fix the scx selftests to use this macro to properly initialize these values. 
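The common setup sequence that each selftest now follows looks roughly like this (a sketch of the pattern repeated in the hunks below; <testname> is a placeholder for the individual skeleton name):

    skel = <testname>__open();
    SCX_FAIL_IF(!skel, "Failed to open");
    SCX_ENUM_INIT(skel);    /* initialize the scx enums generated from vmlinux.h */
    SCX_FAIL_IF(<testname>__load(skel), "Failed to load skel");
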
Fixes: 8da7bf2cee27 ("tools/sched_ext: Receive updates from SCX repo") Reported-by: Ihor Solodrai Closes: https://lore.kernel.org/all/Z2tNK2oFDX1OPp8C@slm.duckdns.org/ Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- .../testing/selftests/sched_ext/create_dsq.c | 10 ++++---- .../selftests/sched_ext/ddsp_bogus_dsq_fail.c | 7 ++++-- .../sched_ext/ddsp_vtimelocal_fail.c | 7 ++++-- .../selftests/sched_ext/dsp_local_on.c | 1 + .../sched_ext/enq_last_no_enq_fails.c | 10 ++++---- .../sched_ext/enq_select_cpu_fails.c | 10 ++++---- tools/testing/selftests/sched_ext/exit.c | 1 + tools/testing/selftests/sched_ext/hotplug.c | 6 +++-- .../selftests/sched_ext/init_enable_count.c | 25 ++++++------------- tools/testing/selftests/sched_ext/maximal.c | 7 ++++-- tools/testing/selftests/sched_ext/minimal.c | 10 ++++---- tools/testing/selftests/sched_ext/prog_run.c | 10 ++++---- .../testing/selftests/sched_ext/reload_loop.c | 9 +++---- .../selftests/sched_ext/select_cpu_dfl.c | 7 ++++-- .../sched_ext/select_cpu_dfl_nodispatch.c | 7 ++++-- .../selftests/sched_ext/select_cpu_dispatch.c | 7 ++++-- .../sched_ext/select_cpu_dispatch_bad_dsq.c | 7 ++++-- .../sched_ext/select_cpu_dispatch_dbl_dsp.c | 7 ++++-- .../selftests/sched_ext/select_cpu_vtime.c | 7 ++++-- 19 files changed, 88 insertions(+), 67 deletions(-) diff --git a/tools/testing/selftests/sched_ext/create_dsq.c b/tools/testing/selftests/sched_ext/create_dsq.c index fa946d9146d4d..d67431f57ac65 100644 --- a/tools/testing/selftests/sched_ext/create_dsq.c +++ b/tools/testing/selftests/sched_ext/create_dsq.c @@ -14,11 +14,11 @@ static enum scx_test_status setup(void **ctx) { struct create_dsq *skel; - skel = create_dsq__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load skel"); - return SCX_TEST_FAIL; - } + skel = create_dsq__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(create_dsq__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c index e65d22f23f3bc..b6d13496b24e8 100644 --- a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c +++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c @@ -15,8 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct ddsp_bogus_dsq_fail *skel; - skel = ddsp_bogus_dsq_fail__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = ddsp_bogus_dsq_fail__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(ddsp_bogus_dsq_fail__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c index abafee587cd60..af9ce4ee8baac 100644 --- a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c +++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c @@ -14,8 +14,11 @@ static enum scx_test_status setup(void **ctx) { struct ddsp_vtimelocal_fail *skel; - skel = ddsp_vtimelocal_fail__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = ddsp_vtimelocal_fail__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(ddsp_vtimelocal_fail__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.c b/tools/testing/selftests/sched_ext/dsp_local_on.c index 0ff27e57fe430..e1f2ce4abfe64 100644 --- 
a/tools/testing/selftests/sched_ext/dsp_local_on.c +++ b/tools/testing/selftests/sched_ext/dsp_local_on.c @@ -15,6 +15,7 @@ static enum scx_test_status setup(void **ctx) skel = dsp_local_on__open(); SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); skel->rodata->nr_cpus = libbpf_num_possible_cpus(); SCX_FAIL_IF(dsp_local_on__load(skel), "Failed to load skel"); diff --git a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c index 73e679953e27a..d3387ae036794 100644 --- a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c +++ b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c @@ -15,11 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct enq_last_no_enq_fails *skel; - skel = enq_last_no_enq_fails__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load skel"); - return SCX_TEST_FAIL; - } + skel = enq_last_no_enq_fails__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(enq_last_no_enq_fails__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c index dd1350e5f002d..a80e3a3b3698c 100644 --- a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c +++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c @@ -15,11 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct enq_select_cpu_fails *skel; - skel = enq_select_cpu_fails__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load skel"); - return SCX_TEST_FAIL; - } + skel = enq_select_cpu_fails__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(enq_select_cpu_fails__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/exit.c b/tools/testing/selftests/sched_ext/exit.c index 31bcd06e21cd3..9451782689de1 100644 --- a/tools/testing/selftests/sched_ext/exit.c +++ b/tools/testing/selftests/sched_ext/exit.c @@ -23,6 +23,7 @@ static enum scx_test_status run(void *ctx) char buf[16]; skel = exit__open(); + SCX_ENUM_INIT(skel); skel->rodata->exit_point = tc; exit__load(skel); link = bpf_map__attach_struct_ops(skel->maps.exit_ops); diff --git a/tools/testing/selftests/sched_ext/hotplug.c b/tools/testing/selftests/sched_ext/hotplug.c index 87bf220b1bcee..1c9ceb661c43e 100644 --- a/tools/testing/selftests/sched_ext/hotplug.c +++ b/tools/testing/selftests/sched_ext/hotplug.c @@ -49,8 +49,10 @@ static enum scx_test_status test_hotplug(bool onlining, bool cbs_defined) SCX_ASSERT(is_cpu_online()); - skel = hotplug__open_and_load(); - SCX_ASSERT(skel); + skel = hotplug__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(hotplug__load(skel), "Failed to load skel"); /* Testing the offline -> online path, so go offline before starting */ if (onlining) diff --git a/tools/testing/selftests/sched_ext/init_enable_count.c b/tools/testing/selftests/sched_ext/init_enable_count.c index 97d45f1e5597e..0f3eddc7a17a0 100644 --- a/tools/testing/selftests/sched_ext/init_enable_count.c +++ b/tools/testing/selftests/sched_ext/init_enable_count.c @@ -15,22 +15,6 @@ #define SCHED_EXT 7 -static struct init_enable_count * -open_load_prog(bool global) -{ - struct init_enable_count *skel; - - skel = init_enable_count__open(); - SCX_BUG_ON(!skel, "Failed to open skel"); - - if (!global) - 
skel->struct_ops.init_enable_count_ops->flags |= SCX_OPS_SWITCH_PARTIAL; - - SCX_BUG_ON(init_enable_count__load(skel), "Failed to load skel"); - - return skel; -} - static enum scx_test_status run_test(bool global) { struct init_enable_count *skel; @@ -40,7 +24,14 @@ static enum scx_test_status run_test(bool global) struct sched_param param = {}; pid_t pids[num_pre_forks]; - skel = open_load_prog(global); + skel = init_enable_count__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + + if (!global) + skel->struct_ops.init_enable_count_ops->flags |= SCX_OPS_SWITCH_PARTIAL; + + SCX_FAIL_IF(init_enable_count__load(skel), "Failed to load skel"); /* * Fork a bunch of children before we attach the scheduler so that we diff --git a/tools/testing/selftests/sched_ext/maximal.c b/tools/testing/selftests/sched_ext/maximal.c index f38fc973c3800..c6be50a9941d5 100644 --- a/tools/testing/selftests/sched_ext/maximal.c +++ b/tools/testing/selftests/sched_ext/maximal.c @@ -14,8 +14,11 @@ static enum scx_test_status setup(void **ctx) { struct maximal *skel; - skel = maximal__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = maximal__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(maximal__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/minimal.c b/tools/testing/selftests/sched_ext/minimal.c index 6c5db8ebbf8ac..89f7261757ffb 100644 --- a/tools/testing/selftests/sched_ext/minimal.c +++ b/tools/testing/selftests/sched_ext/minimal.c @@ -15,11 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct minimal *skel; - skel = minimal__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load skel"); - return SCX_TEST_FAIL; - } + skel = minimal__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(minimal__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/prog_run.c b/tools/testing/selftests/sched_ext/prog_run.c index 3cd57ef8daaa5..05974820ca69d 100644 --- a/tools/testing/selftests/sched_ext/prog_run.c +++ b/tools/testing/selftests/sched_ext/prog_run.c @@ -15,11 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct prog_run *skel; - skel = prog_run__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load skel"); - return SCX_TEST_FAIL; - } + skel = prog_run__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(prog_run__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/reload_loop.c b/tools/testing/selftests/sched_ext/reload_loop.c index 5cfba2d6e0568..308211d804364 100644 --- a/tools/testing/selftests/sched_ext/reload_loop.c +++ b/tools/testing/selftests/sched_ext/reload_loop.c @@ -18,11 +18,10 @@ bool force_exit = false; static enum scx_test_status setup(void **ctx) { - skel = maximal__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load skel"); - return SCX_TEST_FAIL; - } + skel = maximal__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(maximal__load(skel), "Failed to load skel"); return SCX_TEST_PASS; } diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.c index a53a40c2d2f0f..5b6e045e1109b 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dfl.c +++ 
b/tools/testing/selftests/sched_ext/select_cpu_dfl.c @@ -17,8 +17,11 @@ static enum scx_test_status setup(void **ctx) { struct select_cpu_dfl *skel; - skel = select_cpu_dfl__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = select_cpu_dfl__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(select_cpu_dfl__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c index 1d85bf4bf3a39..9b5d232efb7f6 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c @@ -17,8 +17,11 @@ static enum scx_test_status setup(void **ctx) { struct select_cpu_dfl_nodispatch *skel; - skel = select_cpu_dfl_nodispatch__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = select_cpu_dfl_nodispatch__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(select_cpu_dfl_nodispatch__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.c index 0309ca8785b36..80283dbc41b7e 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dispatch.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.c @@ -17,8 +17,11 @@ static enum scx_test_status setup(void **ctx) { struct select_cpu_dispatch *skel; - skel = select_cpu_dispatch__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = select_cpu_dispatch__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(select_cpu_dispatch__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c index 47eb6ed7627d9..5e72ebbc90a5a 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c @@ -15,8 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct select_cpu_dispatch_bad_dsq *skel; - skel = select_cpu_dispatch_bad_dsq__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = select_cpu_dispatch_bad_dsq__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(select_cpu_dispatch_bad_dsq__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c index 48ff028a3c46d..aa85949478bcf 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c +++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c @@ -15,8 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct select_cpu_dispatch_dbl_dsp *skel; - skel = select_cpu_dispatch_dbl_dsp__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = select_cpu_dispatch_dbl_dsp__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(select_cpu_dispatch_dbl_dsp__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.c 
b/tools/testing/selftests/sched_ext/select_cpu_vtime.c index b4629c2364f5d..1e9b5c9bfff1d 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_vtime.c +++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.c @@ -15,8 +15,11 @@ static enum scx_test_status setup(void **ctx) { struct select_cpu_vtime *skel; - skel = select_cpu_vtime__open_and_load(); - SCX_FAIL_IF(!skel, "Failed to open and load skel"); + skel = select_cpu_vtime__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(select_cpu_vtime__load(skel), "Failed to load skel"); + *ctx = skel; return SCX_TEST_PASS; -- GitLab From 64a1ba4072b34af1b76bf15fca5c2075b8cc4d64 Mon Sep 17 00:00:00 2001 From: Aditya Kumar Singh Date: Thu, 23 Jan 2025 21:51:38 +0530 Subject: [PATCH 035/989] wifi: ath12k: fix handling of 6 GHz rules In the US country code, to avoid including 6 GHz rules in the 5 GHz rules list, the number of 5 GHz rules is set to a default constant value of 4 (REG_US_5G_NUM_REG_RULES). However, if there are more than 4 valid 5 GHz rules, the current logic will bypass the legitimate 6 GHz rules. For example, if there are 5 valid 5 GHz rules and 1 valid 6 GHz rule, the current logic will only consider 4 of the 5 GHz rules, treating the last valid rule as a 6 GHz rule. Consequently, the actual 6 GHz rule is never processed, leading to the eventual disabling of 6 GHz channels. To fix this issue, instead of hardcoding the value to 4, use a helper function to determine the number of 6 GHz rules present in the 5 GHz rules list and ignore only those rules. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00173-QCAHKSWPL_SILICONZ-1 Cc: stable@vger.kernel.org Fixes: d889913205cf ("wifi: ath12k: driver for Qualcomm Wi-Fi 7 devices") Signed-off-by: Aditya Kumar Singh Link: https://patch.msgid.link/20250123-fix_6ghz_rules_handling-v1-1-d734bfa58ff4@oss.qualcomm.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/wmi.c | 61 ++++++++++++++++++++------- drivers/net/wireless/ath/ath12k/wmi.h | 1 - 2 files changed, 45 insertions(+), 17 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c index dced2aa9ba1a3..d953742b67e14 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.c +++ b/drivers/net/wireless/ath/ath12k/wmi.c @@ -4681,6 +4681,22 @@ static struct ath12k_reg_rule return reg_rule_ptr; } +static u8 ath12k_wmi_ignore_num_extra_rules(struct ath12k_wmi_reg_rule_ext_params *rule, + u32 num_reg_rules) +{ + u8 num_invalid_5ghz_rules = 0; + u32 count, start_freq; + + for (count = 0; count < num_reg_rules; count++) { + start_freq = le32_get_bits(rule[count].freq_info, REG_RULE_START_FREQ); + + if (start_freq >= ATH12K_MIN_6G_FREQ) + num_invalid_5ghz_rules++; + } + + return num_invalid_5ghz_rules; +} + static int ath12k_pull_reg_chan_list_ext_update_ev(struct ath12k_base *ab, struct sk_buff *skb, struct ath12k_reg_info *reg_info) @@ -4691,6 +4707,7 @@ static int ath12k_pull_reg_chan_list_ext_update_ev(struct ath12k_base *ab, u32 num_2g_reg_rules, num_5g_reg_rules; u32 num_6g_reg_rules_ap[WMI_REG_CURRENT_MAX_AP_TYPE]; u32 num_6g_reg_rules_cl[WMI_REG_CURRENT_MAX_AP_TYPE][WMI_REG_MAX_CLIENT_TYPE]; + u8 num_invalid_5ghz_ext_rules; u32 total_reg_rules = 0; int ret, i, j; @@ -4784,20 +4801,6 @@ static int ath12k_pull_reg_chan_list_ext_update_ev(struct ath12k_base *ab, memcpy(reg_info->alpha2, &ev->alpha2, REG_ALPHA2_LEN); - /* FIXME: Currently FW includes 6G reg rule also in 5G rule - * list for country US. 
- * Having same 6G reg rule in 5G and 6G rules list causes - * intersect check to be true, and same rules will be shown - * multiple times in iw cmd. So added hack below to avoid - * parsing 6G rule from 5G reg rule list, and this can be - * removed later, after FW updates to remove 6G reg rule - * from 5G rules list. - */ - if (memcmp(reg_info->alpha2, "US", 2) == 0) { - reg_info->num_5g_reg_rules = REG_US_5G_NUM_REG_RULES; - num_5g_reg_rules = reg_info->num_5g_reg_rules; - } - reg_info->dfs_region = le32_to_cpu(ev->dfs_region); reg_info->phybitmap = le32_to_cpu(ev->phybitmap); reg_info->num_phy = le32_to_cpu(ev->num_phy); @@ -4900,8 +4903,29 @@ static int ath12k_pull_reg_chan_list_ext_update_ev(struct ath12k_base *ab, } } + ext_wmi_reg_rule += num_2g_reg_rules; + + /* Firmware might include 6 GHz reg rule in 5 GHz rule list + * for few countries along with separate 6 GHz rule. + * Having same 6 GHz reg rule in 5 GHz and 6 GHz rules list + * causes intersect check to be true, and same rules will be + * shown multiple times in iw cmd. + * Hence, avoid parsing 6 GHz rule from 5 GHz reg rule list + */ + num_invalid_5ghz_ext_rules = ath12k_wmi_ignore_num_extra_rules(ext_wmi_reg_rule, + num_5g_reg_rules); + + if (num_invalid_5ghz_ext_rules) { + ath12k_dbg(ab, ATH12K_DBG_WMI, + "CC: %s 5 GHz reg rules number %d from fw, %d number of invalid 5 GHz rules", + reg_info->alpha2, reg_info->num_5g_reg_rules, + num_invalid_5ghz_ext_rules); + + num_5g_reg_rules = num_5g_reg_rules - num_invalid_5ghz_ext_rules; + reg_info->num_5g_reg_rules = num_5g_reg_rules; + } + if (num_5g_reg_rules) { - ext_wmi_reg_rule += num_2g_reg_rules; reg_info->reg_rules_5g_ptr = create_ext_reg_rules_from_wmi(num_5g_reg_rules, ext_wmi_reg_rule); @@ -4913,7 +4937,12 @@ static int ath12k_pull_reg_chan_list_ext_update_ev(struct ath12k_base *ab, } } - ext_wmi_reg_rule += num_5g_reg_rules; + /* We have adjusted the number of 5 GHz reg rules above. But still those + * many rules needs to be adjusted in ext_wmi_reg_rule. + * + * NOTE: num_invalid_5ghz_ext_rules will be 0 for rest other cases. + */ + ext_wmi_reg_rule += (num_5g_reg_rules + num_invalid_5ghz_ext_rules); for (i = 0; i < WMI_REG_CURRENT_MAX_AP_TYPE; i++) { reg_info->reg_rules_6g_ap_ptr[i] = diff --git a/drivers/net/wireless/ath/ath12k/wmi.h b/drivers/net/wireless/ath/ath12k/wmi.h index 6f55dbdf629db..b16615b116ae7 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.h +++ b/drivers/net/wireless/ath/ath12k/wmi.h @@ -3943,7 +3943,6 @@ struct ath12k_wmi_eht_rate_set_params { #define MAX_REG_RULES 10 #define REG_ALPHA2_LEN 2 #define MAX_6G_REG_RULES 5 -#define REG_US_5G_NUM_REG_RULES 4 enum wmi_start_event_param { WMI_VDEV_START_RESP_EVENT = 0, -- GitLab From e76946110137703c16423baf6ee177b751a34b7e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 23 Jan 2025 16:25:35 +0800 Subject: [PATCH 036/989] workqueue: Put the pwq after detaching the rescuer from the pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 68f83057b913("workqueue: Reap workers via kthread_stop() and remove detach_completion") adds code to reap the normal workers but mistakenly does not handle the rescuer and also removes the code waiting for the rescuer in put_unbound_pool(), which caused a use-after-free bug reported by Cheung Wall. To avoid the use-after-free bug, the pool’s reference must be held until the detachment is complete. Therefore, move the code that puts the pwq after detaching the rescuer from the pool. 
Reported-by: cheung wall Cc: cheung wall Link: https://lore.kernel.org/lkml/CAKHoSAvP3iQW+GwmKzWjEAOoPvzeWeoMO0Gz7Pp3_4kxt-RMoA@mail.gmail.com/ Fixes: 68f83057b913 ("workqueue: Reap workers via kthread_stop() and remove detach_completion") Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 33a23c7b22747..ccad33001c58c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3516,12 +3516,6 @@ static int rescuer_thread(void *__rescuer) } } - /* - * Put the reference grabbed by send_mayday(). @pool won't - * go away while we're still attached to it. - */ - put_pwq(pwq); - /* * Leave this pool. Notify regular workers; otherwise, we end up * with 0 concurrency and stalling the execution. @@ -3532,6 +3526,12 @@ static int rescuer_thread(void *__rescuer) worker_detach_from_pool(rescuer); + /* + * Put the reference grabbed by send_mayday(). @pool might + * go away any time after it. + */ + put_pwq_unlocked(pwq); + raw_spin_lock_irq(&wq_mayday_lock); } -- GitLab From e9fe182772dcb2630964724fd93e9c90b68ea0fd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Jan 2025 10:48:25 -1000 Subject: [PATCH 037/989] sched_ext: selftests/dsp_local_on: Fix sporadic failures dsp_local_on has several incorrect assumptions, one of which is that p->nr_cpus_allowed always tracks p->cpus_ptr. This is not true when a task is scheduled out while migration is disabled - p->cpus_ptr is temporarily overridden to the previous CPU while p->nr_cpus_allowed remains unchanged. This led to sporadic test failures when dsp_local_on_dispatch() tries to put a migration-disabled task to a different CPU. Fix it by keeping the previous CPU when migration is disabled. There are SCX schedulers that make use of p->nr_cpus_allowed. They should also implement explicit handling for p->migration_disabled. Signed-off-by: Tejun Heo Reported-by: Ihor Solodrai Cc: Andrea Righi Cc: Changwoo Min --- tools/testing/selftests/sched_ext/dsp_local_on.bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c index fbda6bf546712..758b479bd1ee1 100644 --- a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c +++ b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c @@ -43,7 +43,7 @@ void BPF_STRUCT_OPS(dsp_local_on_dispatch, s32 cpu, struct task_struct *prev) if (!p) return; - if (p->nr_cpus_allowed == nr_cpus) + if (p->nr_cpus_allowed == nr_cpus && !p->migration_disabled) target = bpf_get_prandom_u32() % nr_cpus; else target = scx_bpf_task_cpu(p); -- GitLab From e0f63bc68f59d281e2d06e596f6c1bd9382a15cd Mon Sep 17 00:00:00 2001 From: Gustavo Sousa Date: Tue, 21 Jan 2025 18:09:25 -0300 Subject: [PATCH 038/989] drm/print: Include drm_device.h The header drm_print.h accesses members of struct drm_device through pointers, so it should include drm_device.h to give the compiler the full type definition. Without that include, users of drm_print.h that don't otherwise need drm_device.h run into build errors and are forced to include it themselves.
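For illustration only (this snippet is not part of the patch and the function name is made up): a file that includes only drm_print.h and uses the drm_*() logging helpers dereferences the drm_device pointer inside the macro expansion, so it fails to build unless drm_device.h is visible:

    #include <drm/drm_print.h>

    /* drm_err() expands to a dereference of drm->dev; if drm_print.h does
     * not pull in drm_device.h, the compiler only sees a forward declaration
     * of struct drm_device and rejects the dereference.
     */
    static void report_timeout(struct drm_device *drm)
    {
            drm_err(drm, "request timed out\n");
    }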
Signed-off-by: Gustavo Sousa Reviewed-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20250121210935.84357-1-gustavo.sousa@intel.com Signed-off-by: Lucas De Marchi --- include/drm/drm_print.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h index b3906dc043886..8d3e17d7554a8 100644 --- a/include/drm/drm_print.h +++ b/include/drm/drm_print.h @@ -32,6 +32,7 @@ #include #include +#include struct debugfs_regset32; struct drm_device; -- GitLab From 41f198d58b6f2b36f9f8a4481d517369b324e773 Mon Sep 17 00:00:00 2001 From: Tanya Agarwal Date: Fri, 24 Jan 2025 01:18:00 +0530 Subject: [PATCH 039/989] tomoyo: fix spelling error Fix spelling error in security/tomoyo module comments that were identified using the codespell tool. No functional changes - documentation only. Signed-off-by: Tanya Agarwal Reviewed-by: Mimi Zohar Signed-off-by: Tetsuo Handa --- security/tomoyo/domain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index 3a7b0874cf44d..5f9ccab26e9ab 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c @@ -920,7 +920,7 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, #ifdef CONFIG_MMU /* * This is called at execve() time in order to dig around - * in the argv/environment of the new proceess + * in the argv/environment of the new process * (represented by bprm). */ mmap_read_lock(bprm->mm); -- GitLab From 691a1f3f180133965d01e0ab0f332248d0345554 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 26 Jan 2025 19:13:58 +0900 Subject: [PATCH 040/989] tomoyo: fix spelling errors No functional changes. Signed-off-by: Tetsuo Handa --- security/tomoyo/securityfs_if.c | 6 +++--- security/tomoyo/tomoyo.c | 5 +---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/security/tomoyo/securityfs_if.c b/security/tomoyo/securityfs_if.c index a2705798476f9..7e69747b2f771 100644 --- a/security/tomoyo/securityfs_if.c +++ b/security/tomoyo/securityfs_if.c @@ -229,11 +229,11 @@ static void __init tomoyo_create_entry(const char *name, const umode_t mode, } /** - * tomoyo_initerface_init - Initialize /sys/kernel/security/tomoyo/ interface. + * tomoyo_interface_init - Initialize /sys/kernel/security/tomoyo/ interface. * * Returns 0. */ -static int __init tomoyo_initerface_init(void) +static int __init tomoyo_interface_init(void) { struct tomoyo_domain_info *domain; struct dentry *tomoyo_dir; @@ -270,4 +270,4 @@ static int __init tomoyo_initerface_init(void) return 0; } -fs_initcall(tomoyo_initerface_init); +fs_initcall(tomoyo_interface_init); diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c index 04a92c3d65d44..d6ebcd9db80a3 100644 --- a/security/tomoyo/tomoyo.c +++ b/security/tomoyo/tomoyo.c @@ -549,10 +549,7 @@ static const struct lsm_id tomoyo_lsmid = { .id = LSM_ID_TOMOYO, }; -/* - * tomoyo_security_ops is a "struct security_operations" which is used for - * registering TOMOYO. - */ +/* tomoyo_hooks is used for registering TOMOYO. 
*/ static struct security_hook_list tomoyo_hooks[] __ro_after_init = { LSM_HOOK_INIT(cred_prepare, tomoyo_cred_prepare), LSM_HOOK_INIT(bprm_committed_creds, tomoyo_bprm_committed_creds), -- GitLab From 1f566840a82982141f94086061927a90e79440e5 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 24 Jan 2025 20:54:41 -0500 Subject: [PATCH 041/989] clocksource: Use pr_info() for "Checking clocksource synchronization" message The "Checking clocksource synchronization" message is normally printed when clocksource_verify_percpu() is called for a given clocksource if both the CLOCK_SOURCE_UNSTABLE and CLOCK_SOURCE_VERIFY_PERCPU flags are set. It is an informational message and so pr_info() is the correct choice. Signed-off-by: Waiman Long Signed-off-by: Thomas Gleixner Reviewed-by: Paul E. McKenney Acked-by: John Stultz Link: https://lore.kernel.org/all/20250125015442.3740588-1-longman@redhat.com --- kernel/time/clocksource.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 7304d7cf47f2d..77d9566d3aa68 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -382,7 +382,8 @@ void clocksource_verify_percpu(struct clocksource *cs) return; } testcpu = smp_processor_id(); - pr_warn("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); + pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", + cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); for_each_cpu(cpu, &cpus_chosen) { if (cpu == testcpu) continue; -- GitLab From 825c78e6a60c309a59d18d5ac5968aa79cef0bd6 Mon Sep 17 00:00:00 2001 From: Xu Lu Date: Mon, 27 Jan 2025 17:38:46 +0800 Subject: [PATCH 042/989] irqchip/riscv: Ensure ordering of memory writes and IPI writes RISC-V distinguishes between memory accesses and device I/O and uses FENCE instruction to order them as viewed by other RISC-V harts and external devices or coprocessors. The FENCE instruction can order any combination of device input(I), device output(O), memory reads(R) and memory writes(W). For example, 'fence w, o' is used to ensure all memory writes from instructions preceding the FENCE instruction appear earlier in the global memory order than device output writes from instructions after the FENCE instruction. RISC-V issues IPIs by writing to the IMSIC/ACLINT MMIO registers, which is regarded as device output operation. However, the existing implementation of the IMSIC/ACLINT drivers issue the IPI via writel_relaxed(), which does not guarantee the order of device output operation and preceding memory writes. As a consequence the hart receiving the IPI might not observe the IPI related data. Fix this by replacing writel_relaxed() with writel() when issuing IPIs, which uses 'fence w, o' to ensure all previous writes made by the current hart are visible to other harts before they receive the IPI. 
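For illustration only (not part of the patch; the function and variable names are made up): on RISC-V, writel() behaves roughly like a "fence w, o" followed by writel_relaxed(), so the ordering the fix relies on can be spelled out as:

    static void send_ipi_sketch(void __iomem *ipi_reg, u32 *ipi_payload)
    {
            *ipi_payload = 1;                       /* regular memory write (W) */
            __asm__ __volatile__("fence w, o" : : : "memory");
            writel_relaxed(0x1, ipi_reg);           /* MMIO write raising the IPI (O) */
    }

The fence guarantees that the payload store is globally visible before the device output write that raises the IPI reaches the IMSIC/ACLINT.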
Signed-off-by: Xu Lu Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250127093846.98625-1-luxu.kernel@bytedance.com --- drivers/irqchip/irq-riscv-imsic-early.c | 2 +- drivers/irqchip/irq-thead-c900-aclint-sswi.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-riscv-imsic-early.c b/drivers/irqchip/irq-riscv-imsic-early.c index c5c2e6929a2f5..275df50057057 100644 --- a/drivers/irqchip/irq-riscv-imsic-early.c +++ b/drivers/irqchip/irq-riscv-imsic-early.c @@ -27,7 +27,7 @@ static void imsic_ipi_send(unsigned int cpu) { struct imsic_local_config *local = per_cpu_ptr(imsic->global.local, cpu); - writel_relaxed(IMSIC_IPI_ID, local->msi_va); + writel(IMSIC_IPI_ID, local->msi_va); } static void imsic_ipi_starting_cpu(void) diff --git a/drivers/irqchip/irq-thead-c900-aclint-sswi.c b/drivers/irqchip/irq-thead-c900-aclint-sswi.c index b0e366ade4271..8ff6e7a1363bd 100644 --- a/drivers/irqchip/irq-thead-c900-aclint-sswi.c +++ b/drivers/irqchip/irq-thead-c900-aclint-sswi.c @@ -31,7 +31,7 @@ static DEFINE_PER_CPU(void __iomem *, sswi_cpu_regs); static void thead_aclint_sswi_ipi_send(unsigned int cpu) { - writel_relaxed(0x1, per_cpu(sswi_cpu_regs, cpu)); + writel(0x1, per_cpu(sswi_cpu_regs, cpu)); } static void thead_aclint_sswi_ipi_clear(void) -- GitLab From 987f379b54091cc1b1db986bde71cee1081350b3 Mon Sep 17 00:00:00 2001 From: Stefan Eichenberger Date: Fri, 24 Jan 2025 09:50:39 +0100 Subject: [PATCH 043/989] irqchip/irq-mvebu-icu: Fix access to msi_data from irq_domain::host_data mvebu_icu_translate() incorrectly casts irq_domain::host_data directly to mvebu_icu_msi_data. However, host_data actually points to a structure of type msi_domain_info. This incorrect cast causes issues such as the thermal sensors of the CP110 platform malfunctioning. Specifically, the translation of the SEI interrupt to IRQ_TYPE_EDGE_RISING fails, preventing proper interrupt handling. The following error was observed: genirq: Setting trigger mode 4 for irq 85 failed (irq_chip_set_type_parent+0x0/0x34) armada_thermal f2400000.system-controller:thermal-sensor@70: Cannot request threaded IRQ 85 Resolve the issue by first casting host_data to msi_domain_info and then accessing mvebu_icu_msi_data through msi_domain_info::chip_data. Fixes: d929e4db22b6 ("irqchip/irq-mvebu-icu: Prepare for real per device MSI") Signed-off-by: Stefan Eichenberger Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20250124085140.44792-1-eichest@gmail.com --- drivers/irqchip/irq-mvebu-icu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-mvebu-icu.c b/drivers/irqchip/irq-mvebu-icu.c index b337f6c05f184..4eebed39880a5 100644 --- a/drivers/irqchip/irq-mvebu-icu.c +++ b/drivers/irqchip/irq-mvebu-icu.c @@ -68,7 +68,8 @@ static int mvebu_icu_translate(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *hwirq, unsigned int *type) { unsigned int param_count = static_branch_unlikely(&legacy_bindings) ? 
3 : 2; - struct mvebu_icu_msi_data *msi_data = d->host_data; + struct msi_domain_info *info = d->host_data; + struct mvebu_icu_msi_data *msi_data = info->chip_data; struct mvebu_icu *icu = msi_data->icu; /* Check the count of the parameters in dt */ -- GitLab From fb95897b8c60653805aa09daec575ca30983f768 Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Fri, 24 Jan 2025 11:22:28 +0800 Subject: [PATCH 044/989] xfs: Propagate errors from xfs_reflink_cancel_cow_range in xfs_dax_write_iomap_end In xfs_dax_write_iomap_end(), directly return the result of xfs_reflink_cancel_cow_range() when !written, ensuring proper error propagation and improving code robustness. Fixes: ea6c49b784f0 ("xfs: support CoW in fsdax mode") Cc: stable@vger.kernel.org # v6.0 Reviewed-by: Darrick J. Wong Signed-off-by: Wentao Liang Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_iomap.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 50fa3ef89f6c9..d61460309a783 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -976,10 +976,8 @@ xfs_dax_write_iomap_end( if (!xfs_is_cow_inode(ip)) return 0; - if (!written) { - xfs_reflink_cancel_cow_range(ip, pos, length, true); - return 0; - } + if (!written) + return xfs_reflink_cancel_cow_range(ip, pos, length, true); return xfs_reflink_end_cow(ip, pos, written); } -- GitLab From 28aecef5b1015bf6023ddc12b1a67f6678271fcb Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Sun, 19 Jan 2025 22:02:38 +0530 Subject: [PATCH 045/989] selftests: livepatch: handle PRINTK_CALLER in check_result() Some arch configs (like ppc64) enable CONFIG_PRINTK_CALLER, which adds the caller id to each dmesg line. With the recent util-linux update 467a5b3192f16 ('dmesg: add caller_id support'), the standard "dmesg" tool has been enhanced to print the PRINTK_CALLER fields. Due to this, even though the expected and observed outputs match, the test cases are reported as failed.
-% insmod test_modules/test_klp_livepatch.ko -livepatch: enabling patch 'test_klp_livepatch' -livepatch: 'test_klp_livepatch': initializing patching transition -livepatch: 'test_klp_livepatch': starting patching transition -livepatch: 'test_klp_livepatch': completing patching transition -livepatch: 'test_klp_livepatch': patching complete -% echo 0 > /sys/kernel/livepatch/test_klp_livepatch/enabled -livepatch: 'test_klp_livepatch': initializing unpatching transition -livepatch: 'test_klp_livepatch': starting unpatching transition -livepatch: 'test_klp_livepatch': completing unpatching transition -livepatch: 'test_klp_livepatch': unpatching complete -% rmmod test_klp_livepatch +[ T3659] % insmod test_modules/test_klp_livepatch.ko +[ T3682] livepatch: enabling patch 'test_klp_livepatch' +[ T3682] livepatch: 'test_klp_livepatch': initializing patching transition +[ T3682] livepatch: 'test_klp_livepatch': starting patching transition +[ T826] livepatch: 'test_klp_livepatch': completing patching transition +[ T826] livepatch: 'test_klp_livepatch': patching complete +[ T3659] % echo 0 > /sys/kernel/livepatch/test_klp_livepatch/enabled +[ T3659] livepatch: 'test_klp_livepatch': initializing unpatching transition +[ T3659] livepatch: 'test_klp_livepatch': starting unpatching transition +[ T789] livepatch: 'test_klp_livepatch': completing unpatching transition +[ T789] livepatch: 'test_klp_livepatch': unpatching complete +[ T3659] % rmmod test_klp_livepatch ERROR: livepatch kselftest(s) failed not ok 1 selftests: livepatch: test-livepatch.sh # exit=1 Currently the check_result() handles the "[time]" removal from the dmesg. Enhance the check to also handle removal of "[Thread Id]" or "[CPU Id]". Signed-off-by: Madhavan Srinivasan Acked-by: Miroslav Benes Reviewed-by: Petr Mladek Tested-by: Petr Mladek Link: https://lore.kernel.org/r/20250119163238.749847-1-maddy@linux.ibm.com Signed-off-by: Petr Mladek --- tools/testing/selftests/livepatch/functions.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/livepatch/functions.sh b/tools/testing/selftests/livepatch/functions.sh index e5d06fb402335..15601402dee65 100644 --- a/tools/testing/selftests/livepatch/functions.sh +++ b/tools/testing/selftests/livepatch/functions.sh @@ -306,7 +306,8 @@ function check_result { result=$(dmesg | awk -v last_dmesg="$LAST_DMESG" 'p; $0 == last_dmesg { p=1 }' | \ grep -e 'livepatch:' -e 'test_klp' | \ grep -v '\(tainting\|taints\) kernel' | \ - sed 's/^\[[ 0-9.]*\] //') + sed 's/^\[[ 0-9.]*\] //' | \ + sed 's/^\[[ ]*[CT][0-9]*\] //') if [[ "$expect" == "$result" ]] ; then echo "ok" -- GitLab From 26b63bee2f6e711c5a169997fd126fddcfb90848 Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Fri, 24 Jan 2025 11:45:09 +0800 Subject: [PATCH 046/989] xfs: Add error handling for xfs_reflink_cancel_cow_range In xfs_inactive(), xfs_reflink_cancel_cow_range() is called without error handling, risking unnoticed failures and inconsistent behavior compared to other parts of the code. Fix this issue by adding an error handling for the xfs_reflink_cancel_cow_range(), improving code robustness. Fixes: 6231848c3aa5 ("xfs: check for cow blocks before trying to clear them") Cc: stable@vger.kernel.org # v4.17 Reviewed-by: Darrick J. 
Wong Signed-off-by: Wentao Liang Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_inode.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c95fe1b1de4e6..b1f9f156ec888 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1404,8 +1404,11 @@ xfs_inactive( goto out; /* Try to clean out the cow blocks if there are any. */ - if (xfs_inode_has_cow_data(ip)) - xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); + if (xfs_inode_has_cow_data(ip)) { + error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); + if (error) + goto out; + } if (VFS_I(ip)->i_nlink != 0) { /* -- GitLab From 698244bbb3bfd32ddf9a0b70a12b1c7d69056497 Mon Sep 17 00:00:00 2001 From: Nick Chan Date: Sun, 19 Jan 2025 00:31:42 +0800 Subject: [PATCH 047/989] irqchip/apple-aic: Only handle PMC interrupt as FIQ when configured so The CPU PMU in Apple SoCs can be configured to fire its interrupt in one of several ways, and since Apple A11 one of the methods is FIQ, but the check of the configuration register fails to test explicitly for FIQ mode. It tests whether the IMODE bitfield is zero or not and the PMCR0_IACT bit is set. That results in false positives when the IMODE bitfield is not zero, but does not have the mode PMCR0_IMODE_FIQ. Only handle the PMC interrupt as a FIQ when the CPU PMU has been configured to fire FIQs, i.e. the IMODE bitfield value is PMCR0_IMODE_FIQ and PMCR0_IACT is set. Fixes: c7708816c944 ("irqchip/apple-aic: Wire PMU interrupts") Signed-off-by: Nick Chan Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20250118163554.16733-1-towinchenmi@gmail.com --- drivers/irqchip/irq-apple-aic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-apple-aic.c b/drivers/irqchip/irq-apple-aic.c index da5250f0155cf..2b1684c60e3ca 100644 --- a/drivers/irqchip/irq-apple-aic.c +++ b/drivers/irqchip/irq-apple-aic.c @@ -577,7 +577,8 @@ static void __exception_irq_entry aic_handle_fiq(struct pt_regs *regs) AIC_FIQ_HWIRQ(AIC_TMR_EL02_VIRT)); } - if (read_sysreg_s(SYS_IMP_APL_PMCR0_EL1) & PMCR0_IACT) { + if ((read_sysreg_s(SYS_IMP_APL_PMCR0_EL1) & (PMCR0_IMODE | PMCR0_IACT)) == + (FIELD_PREP(PMCR0_IMODE, PMCR0_IMODE_FIQ) | PMCR0_IACT)) { int irq; if (cpumask_test_cpu(smp_processor_id(), &aic_irqc->fiq_aff[AIC_CPU_PMU_P]->aff)) -- GitLab From d6f3e7d564b2309e1f17e709a70eca78d7ca2bb8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 Jan 2025 12:22:12 -1000 Subject: [PATCH 048/989] sched_ext: Fix incorrect autogroup migration detection scx_move_task() is called from sched_move_task() and tells the BPF scheduler that cgroup migration is being committed. sched_move_task() is used by both cgroup and autogroup migrations and scx_move_task() tried to filter out autogroup migrations by testing the destination cgroup and PF_EXITING but this is not enough. In fact, without explicitly tagging the thread which is doing the cgroup migration, there is no good way to tell apart scx_move_task() invocations for a racing migration to the root cgroup and an autogroup migration. This led to scx_move_task() incorrectly ignoring a migration from a non-root cgroup to an autogroup of the root cgroup, triggering the following warning: WARNING: CPU: 7 PID: 1 at kernel/sched/ext.c:3725 scx_cgroup_can_attach+0x196/0x340 ...
Call Trace: cgroup_migrate_execute+0x5b1/0x700 cgroup_attach_task+0x296/0x400 __cgroup_procs_write+0x128/0x140 cgroup_procs_write+0x17/0x30 kernfs_fop_write_iter+0x141/0x1f0 vfs_write+0x31d/0x4a0 __x64_sys_write+0x72/0xf0 do_syscall_64+0x82/0x160 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fix it by adding an argument to sched_move_task() that indicates whether the moving is for a cgroup or autogroup migration. After the change, scx_move_task() is called only for cgroup migrations and renamed to scx_cgroup_move_task(). Link: https://github.com/sched-ext/scx/issues/370 Fixes: 819513666966 ("sched_ext: Add cgroup support") Cc: stable@vger.kernel.org # v6.12+ Acked-by: Peter Zijlstra (Intel) Signed-off-by: Tejun Heo --- kernel/sched/autogroup.c | 4 ++-- kernel/sched/core.c | 7 ++++--- kernel/sched/ext.c | 15 +-------------- kernel/sched/ext.h | 4 ++-- kernel/sched/sched.h | 2 +- 5 files changed, 10 insertions(+), 22 deletions(-) diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index db68a964e34e2..c4a3ccf6a8ace 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -150,7 +150,7 @@ void sched_autogroup_exit_task(struct task_struct *p) * see this thread after that: we can no longer use signal->autogroup. * See the PF_EXITING check in task_wants_autogroup(). */ - sched_move_task(p); + sched_move_task(p, true); } static void @@ -182,7 +182,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) * sched_autogroup_exit_task(). */ for_each_thread(p, t) - sched_move_task(t); + sched_move_task(t, true); unlock_task_sighand(p, &flags); autogroup_kref_put(prev); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 901170708e2a2..e77897a62442e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9042,7 +9042,7 @@ static void sched_change_group(struct task_struct *tsk, struct task_group *group * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect * its new group. */ -void sched_move_task(struct task_struct *tsk) +void sched_move_task(struct task_struct *tsk, bool for_autogroup) { int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; @@ -9071,7 +9071,8 @@ void sched_move_task(struct task_struct *tsk) put_prev_task(rq, tsk); sched_change_group(tsk, group); - scx_move_task(tsk); + if (!for_autogroup) + scx_cgroup_move_task(tsk); if (queued) enqueue_task(rq, tsk, queue_flags); @@ -9172,7 +9173,7 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; cgroup_taskset_for_each(task, css, tset) - sched_move_task(task); + sched_move_task(task, false); scx_cgroup_finish_attach(); } diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7081c7be5f622..c7b159f488343 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4323,24 +4323,11 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) return ops_sanitize_err("cgroup_prep_move", ret); } -void scx_move_task(struct task_struct *p) +void scx_cgroup_move_task(struct task_struct *p) { if (!scx_cgroup_enabled) return; - /* - * We're called from sched_move_task() which handles both cgroup and - * autogroup moves. Ignore the latter. - * - * Also ignore exiting tasks, because in the exit path tasks transition - * from the autogroup to the root group, so task_group_is_autogroup() - * alone isn't able to catch exiting autogroup tasks. This is safe for - * cgroup_move(), because cgroup migrations never happen for PF_EXITING - * tasks. 
- */ - if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING)) - return; - /* * @p must have ops.cgroup_prep_move() called on it and thus * cgrp_moving_from set. diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 4d022d17ac7dd..1079b56b0f7ae 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -73,7 +73,7 @@ static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {} int scx_tg_online(struct task_group *tg); void scx_tg_offline(struct task_group *tg); int scx_cgroup_can_attach(struct cgroup_taskset *tset); -void scx_move_task(struct task_struct *p); +void scx_cgroup_move_task(struct task_struct *p); void scx_cgroup_finish_attach(void); void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); @@ -82,7 +82,7 @@ void scx_group_set_idle(struct task_group *tg, bool idle); static inline int scx_tg_online(struct task_group *tg) { return 0; } static inline void scx_tg_offline(struct task_group *tg) {} static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } -static inline void scx_move_task(struct task_struct *p) {} +static inline void scx_cgroup_move_task(struct task_struct *p) {} static inline void scx_cgroup_finish_attach(void) {} static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 38e0e323dda26..b93c8c3dc05a5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -572,7 +572,7 @@ extern void sched_online_group(struct task_group *tg, extern void sched_destroy_group(struct task_group *tg); extern void sched_release_group(struct task_group *tg); -extern void sched_move_task(struct task_struct *tsk); +extern void sched_move_task(struct task_struct *tsk, bool for_autogroup); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); -- GitLab From 5f52bbf2f6e0997394cf9c449d44e1c80ff4282c Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 25 Jan 2025 18:14:12 +0100 Subject: [PATCH 049/989] tools/sched_ext: Add helper to check task migration state Introduce a new helper for BPF schedulers to determine whether a task can migrate or not (supporting both SMP and UP systems). Fixes: e9fe182772dc ("sched_ext: selftests/dsp_local_on: Fix sporadic failures") Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- tools/sched_ext/include/scx/common.bpf.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index f3e15e9efa76b..f254a39b86a58 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -404,6 +404,17 @@ static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask) return (const struct cpumask *)mask; } +/* + * Return true if task @p cannot migrate to a different CPU, false + * otherwise. 
+ */ +static inline bool is_migration_disabled(const struct task_struct *p) +{ + if (bpf_core_field_exists(p->migration_disabled)) + return p->migration_disabled; + return false; +} + /* rcu */ void bpf_rcu_read_lock(void) __ksym; void bpf_rcu_read_unlock(void) __ksym; -- GitLab From 3c7d51b0d29954c40ea3a097e0ec7884b4344331 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 25 Jan 2025 10:36:07 +0100 Subject: [PATCH 050/989] sched_ext: selftests/dsp_local_on: Fix selftest on UP systems In UP systems p->migration_disabled is not available. Fix this by using the portable helper is_migration_disabled(p). Fixes: e9fe182772dc ("sched_ext: selftests/dsp_local_on: Fix sporadic failures") Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- tools/testing/selftests/sched_ext/dsp_local_on.bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c index 758b479bd1ee1..c02b2aa6fc641 100644 --- a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c +++ b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c @@ -43,7 +43,7 @@ void BPF_STRUCT_OPS(dsp_local_on_dispatch, s32 cpu, struct task_struct *prev) if (!p) return; - if (p->nr_cpus_allowed == nr_cpus && !p->migration_disabled) + if (p->nr_cpus_allowed == nr_cpus && !is_migration_disabled(p)) target = bpf_get_prandom_u32() % nr_cpus; else target = scx_bpf_task_cpu(p); -- GitLab From 1626e5ef0b00386a4fd083fa7c46c8edbd75f9b4 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Mon, 27 Jan 2025 23:06:16 +0100 Subject: [PATCH 051/989] sched_ext: Fix lock imbalance in dispatch_to_local_dsq() While performing the rq locking dance in dispatch_to_local_dsq(), we may trigger the following lock imbalance condition, in particular when multiple tasks are rapidly changing CPU affinity (i.e., running a `stress-ng --race-sched 0`): [ 13.413579] ===================================== [ 13.413660] WARNING: bad unlock balance detected! [ 13.413729] 6.13.0-virtme #15 Not tainted [ 13.413792] ------------------------------------- [ 13.413859] kworker/1:1/80 is trying to release lock (&rq->__lock) at: [ 13.413954] [] dispatch_to_local_dsq+0x108/0x1a0 [ 13.414111] but there are no more locks to release! [ 13.414176] [ 13.414176] other info that might help us debug this: [ 13.414258] 1 lock held by kworker/1:1/80: [ 13.414318] #0: ffff8b66feb41698 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x20/0x90 [ 13.414612] [ 13.414612] stack backtrace: [ 13.415255] CPU: 1 UID: 0 PID: 80 Comm: kworker/1:1 Not tainted 6.13.0-virtme #15 [ 13.415505] Workqueue: 0x0 (events) [ 13.415567] Sched_ext: dsp_local_on (enabled+all), task: runnable_at=-2ms [ 13.415570] Call Trace: [ 13.415700] [ 13.415744] dump_stack_lvl+0x78/0xe0 [ 13.415806] ? dispatch_to_local_dsq+0x108/0x1a0 [ 13.415884] print_unlock_imbalance_bug+0x11b/0x130 [ 13.415965] ? dispatch_to_local_dsq+0x108/0x1a0 [ 13.416226] lock_release+0x231/0x2c0 [ 13.416326] _raw_spin_unlock+0x1b/0x40 [ 13.416422] dispatch_to_local_dsq+0x108/0x1a0 [ 13.416554] flush_dispatch_buf+0x199/0x1d0 [ 13.416652] balance_one+0x194/0x370 [ 13.416751] balance_scx+0x61/0x1e0 [ 13.416848] prev_balance+0x43/0xb0 [ 13.416947] __pick_next_task+0x6b/0x1b0 [ 13.417052] __schedule+0x20d/0x1740 This happens because dispatch_to_local_dsq() is racing with dispatch_dequeue() and, when the latter wins, we incorrectly assume that the task has been moved to dst_rq. Fix by properly tracking the currently locked rq. 
Fixes: 4d3ca89bdd31 ("sched_ext: Refactor consume_remote_task()") Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index c7b159f488343..a6d6d6dadde51 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2575,6 +2575,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, { struct rq *src_rq = task_rq(p); struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); +#ifdef CONFIG_SMP + struct rq *locked_rq = rq; +#endif /* * We're synchronized against dequeue through DISPATCHING. As @p can't @@ -2611,8 +2614,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); /* switch to @src_rq lock */ - if (rq != src_rq) { - raw_spin_rq_unlock(rq); + if (locked_rq != src_rq) { + raw_spin_rq_unlock(locked_rq); + locked_rq = src_rq; raw_spin_rq_lock(src_rq); } @@ -2630,6 +2634,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, } else { move_remote_task_to_local_dsq(p, enq_flags, src_rq, dst_rq); + /* task has been moved to dst_rq, which is now locked */ + locked_rq = dst_rq; } /* if the destination CPU is idle, wake it up */ @@ -2638,8 +2644,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, } /* switch back to @rq lock */ - if (rq != dst_rq) { - raw_spin_rq_unlock(dst_rq); + if (locked_rq != rq) { + raw_spin_rq_unlock(locked_rq); raw_spin_rq_lock(rq); } #else /* CONFIG_SMP */ -- GitLab From a9ab28b3d21aec6d0f56fe722953e20ce470237b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 28 Jan 2025 06:22:58 +0100 Subject: [PATCH 052/989] xfs: remove xfs_buf_cache.bc_lock xfs_buf_cache.bc_lock serializes adding buffers to and removing them from the hashtable. But as the rhashtable code already uses fine grained internal locking for inserts and removals the extra protection isn't actually required. It also happens to fix a lock order inversion vs b_lock added by the recent lookup race fix. Fixes: ee10f6fcdb96 ("xfs: fix buffer lookup vs release race") Reported-by: Lai, Yi Signed-off-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Reviewed-by: Dave Chinner Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_buf.c | 31 +++++++++++++++++-------------- fs/xfs/xfs_buf.h | 1 - 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index f1252ed8bd0a7..ef207784876c8 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -41,8 +41,7 @@ struct kmem_cache *xfs_buf_cache; * * xfs_buf_rele: * b_lock - * pag_buf_lock - * lru_lock + * lru_lock * * xfs_buftarg_drain_rele * lru_lock @@ -220,14 +219,21 @@ _xfs_buf_alloc( */ flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); - spin_lock_init(&bp->b_lock); + /* + * A new buffer is held and locked by the owner. This ensures that the + * buffer is owned by the caller and racing RCU lookups right after + * inserting into the hash table are safe (and will have to wait for + * the unlock to do anything non-trivial). 
+ */ bp->b_hold = 1; + sema_init(&bp->b_sema, 0); /* held, no waiters */ + + spin_lock_init(&bp->b_lock); atomic_set(&bp->b_lru_ref, 1); init_completion(&bp->b_iowait); INIT_LIST_HEAD(&bp->b_lru); INIT_LIST_HEAD(&bp->b_list); INIT_LIST_HEAD(&bp->b_li_list); - sema_init(&bp->b_sema, 0); /* held, no waiters */ bp->b_target = target; bp->b_mount = target->bt_mount; bp->b_flags = flags; @@ -497,7 +503,6 @@ int xfs_buf_cache_init( struct xfs_buf_cache *bch) { - spin_lock_init(&bch->bc_lock); return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params); } @@ -647,17 +652,20 @@ xfs_buf_find_insert( if (error) goto out_free_buf; - spin_lock(&bch->bc_lock); + /* The new buffer keeps the perag reference until it is freed. */ + new_bp->b_pag = pag; + + rcu_read_lock(); bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash, &new_bp->b_rhash_head, xfs_buf_hash_params); if (IS_ERR(bp)) { + rcu_read_unlock(); error = PTR_ERR(bp); - spin_unlock(&bch->bc_lock); goto out_free_buf; } if (bp && xfs_buf_try_hold(bp)) { /* found an existing buffer */ - spin_unlock(&bch->bc_lock); + rcu_read_unlock(); error = xfs_buf_find_lock(bp, flags); if (error) xfs_buf_rele(bp); @@ -665,10 +673,8 @@ xfs_buf_find_insert( *bpp = bp; goto out_free_buf; } + rcu_read_unlock(); - /* The new buffer keeps the perag reference until it is freed. */ - new_bp->b_pag = pag; - spin_unlock(&bch->bc_lock); *bpp = new_bp; return 0; @@ -1085,7 +1091,6 @@ xfs_buf_rele_cached( } /* we are asked to drop the last reference */ - spin_lock(&bch->bc_lock); __xfs_buf_ioacct_dec(bp); if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { /* @@ -1097,7 +1102,6 @@ xfs_buf_rele_cached( bp->b_state &= ~XFS_BSTATE_DISPOSE; else bp->b_hold--; - spin_unlock(&bch->bc_lock); } else { bp->b_hold--; /* @@ -1115,7 +1119,6 @@ xfs_buf_rele_cached( ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head, xfs_buf_hash_params); - spin_unlock(&bch->bc_lock); if (pag) xfs_perag_put(pag); freebuf = true; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 7e73663c5d4a5..3b4ed42e11c01 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -80,7 +80,6 @@ typedef unsigned int xfs_buf_flags_t; #define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */ struct xfs_buf_cache { - spinlock_t bc_lock; struct rhashtable bc_hash; }; -- GitLab From fd39c41bcd82d5ebaaebadb944eab5598c668a90 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 27 Jan 2025 14:44:14 +0100 Subject: [PATCH 053/989] drm/ast: astdp: Fix timeout for enabling video signal The ASTDP transmitter sometimes takes up to 1 second for enabling the video signal, while the timeout is only 200 msec. This results in a kernel error message. Increase the timeout to 1 second. An example of the error message is shown below. [ 697.084433] ------------[ cut here ]------------ [ 697.091115] ast 0000:02:00.0: [drm] drm_WARN_ON(!__ast_dp_wait_enable(ast, enabled)) [ 697.091233] WARNING: CPU: 1 PID: 160 at drivers/gpu/drm/ast/ast_dp.c:232 ast_dp_set_enable+0x123/0x140 [ast] [...] [ 697.272469] RIP: 0010:ast_dp_set_enable+0x123/0x140 [ast] [...] [ 697.415283] Call Trace: [ 697.420727] [ 697.425908] ? show_trace_log_lvl+0x196/0x2c0 [ 697.433304] ? show_trace_log_lvl+0x196/0x2c0 [ 697.440693] ? drm_atomic_helper_commit_modeset_enables+0x30a/0x470 [ 697.450115] ? ast_dp_set_enable+0x123/0x140 [ast] [ 697.458059] ? __warn.cold+0xaf/0xca [ 697.464713] ? ast_dp_set_enable+0x123/0x140 [ast] [ 697.472633] ? report_bug+0x134/0x1d0 [ 697.479544] ? handle_bug+0x58/0x90 [ 697.486127] ? 
exc_invalid_op+0x13/0x40 [ 697.492975] ? asm_exc_invalid_op+0x16/0x20 [ 697.500224] ? preempt_count_sub+0x14/0xc0 [ 697.507473] ? ast_dp_set_enable+0x123/0x140 [ast] [ 697.515377] ? ast_dp_set_enable+0x123/0x140 [ast] [ 697.523227] drm_atomic_helper_commit_modeset_enables+0x30a/0x470 [ 697.532388] drm_atomic_helper_commit_tail+0x58/0x90 [ 697.540400] ast_mode_config_helper_atomic_commit_tail+0x30/0x40 [ast] [ 697.550009] commit_tail+0xfe/0x1d0 [ 697.556547] drm_atomic_helper_commit+0x198/0x1c0 This is a cosmetic problem. Enabling the video signal still works even with the error message. The problem has always been present, but only recent versions of the ast driver warn when the timeout is exceeded. Signed-off-by: Thomas Zimmermann Fixes: 4e29cc7c5c67 ("drm/ast: astdp: Replace ast_dp_set_on_off()") Cc: Thomas Zimmermann Cc: Jocelyn Falempe Cc: Dave Airlie Cc: dri-devel@lists.freedesktop.org Cc: # v6.13+ Reviewed-by: Jocelyn Falempe Link: https://patchwork.freedesktop.org/patch/msgid/20250127134423.84266-1-tzimmermann@suse.de --- drivers/gpu/drm/ast/ast_dp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/ast/ast_dp.c b/drivers/gpu/drm/ast/ast_dp.c index 0e282b7b167c6..b9eb67e3fa90e 100644 --- a/drivers/gpu/drm/ast/ast_dp.c +++ b/drivers/gpu/drm/ast/ast_dp.c @@ -195,7 +195,7 @@ static bool __ast_dp_wait_enable(struct ast_device *ast, bool enabled) if (enabled) vgacrdf_test |= AST_IO_VGACRDF_DP_VIDEO_ENABLE; - for (i = 0; i < 200; ++i) { + for (i = 0; i < 1000; ++i) { if (i) mdelay(1); vgacrdf = ast_get_index_reg_mask(ast, AST_IO_VGACRI, 0xdf, -- GitLab From 7bf6b497a747b0e28a411beacdd62f1488d0781c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 28 Jan 2025 08:55:33 +0100 Subject: [PATCH 054/989] nvmet: the result field in nvmet_alloc_ctrl_args is little endian So use the __le32 type for it. Fixes: 6202783184bf ("nvmet: Improve nvmet_alloc_ctrl() interface and implementation") Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/nvmet.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index f4df458df9dbb..6a9af4e4d7325 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -582,7 +582,7 @@ struct nvmet_alloc_ctrl_args { const struct nvmet_fabrics_ops *ops; struct device *p2p_client; u32 kato; - u32 result; + __le32 result; u16 error_loc; u16 status; }; -- GitLab From cc3d4671a0db9499b201c43faba6c46e1a21274c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 28 Jan 2025 08:55:34 +0100 Subject: [PATCH 055/989] nvmet: add a missing endianness conversion in nvmet_execute_admin_connect The kato field is little endian on the wire, but native endian in the in-core structure; add the missing byte swap.
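For illustration only (the structure and function names below are made up and are not the actual nvmet definitions): wire-visible fields carry the __le32 type so that sparse can flag a missing conversion, and the byte swap is done once when the value is copied into the native-endian in-core structure:

    struct wire_connect_data {
            __le32 kato;                    /* little endian on the wire */
    };

    struct ctrl_args {
            u32 kato;                       /* native endian in core */
    };

    static void fill_args(struct ctrl_args *args, const struct wire_connect_data *d)
    {
            args->kato = le32_to_cpu(d->kato);      /* no-op on LE hosts, swap on BE */
    }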
Fixes: 6202783184bf ("nvmet: Improve nvmet_alloc_ctrl() interface and implementation") Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/fabrics-cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index a7ff05b3be29f..eb406c90c1679 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -287,7 +287,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) args.subsysnqn = d->subsysnqn; args.hostnqn = d->hostnqn; args.hostid = &d->hostid; - args.kato = c->kato; + args.kato = le32_to_cpu(c->kato); ctrl = nvmet_alloc_ctrl(&args); if (!ctrl) -- GitLab From 0e9724d0f89e8d77fa683e3129cadaed7c6e609d Mon Sep 17 00:00:00 2001 From: Aditya Garg Date: Mon, 20 Jan 2025 16:50:47 +0000 Subject: [PATCH 056/989] wifi: brcmfmac: use random seed flag for BCM4355 and BCM4364 firmware Before 6.13, the random seed was passed to the firmware based on whether the device had valid OTP or not, and such devices were found mainly on T2 and Apple Silicon Macs. In 6.13, the logic was changed, and the device table was used for this purpose, so as to cover the special case of the BCM43752 chip. During the transition, the device table for BCM4364 and BCM4355 Wi-Fi chips which had valid OTP was not modified, thus breaking Wi-Fi on these devices. This patch makes the necessary changes, similar to the ones done for other chips. Fixes: ea11a89c3ac6 ("wifi: brcmfmac: add flag for random seed during firmware download") Cc: stable@vger.kernel.org Signed-off-by: Aditya Garg Acked-by: Arend van Spriel Signed-off-by: Kalle Valo Link: https://patch.msgid.link/47E43F07-E11D-478C-86D4-23627154AC7C@live.com --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c index e4395b1f8c11e..d2caa80e94123 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c @@ -2712,7 +2712,7 @@ static const struct pci_device_id brcmf_pcie_devid_table[] = { BRCMF_PCIE_DEVICE(BRCM_PCIE_4350_DEVICE_ID, WCC), BRCMF_PCIE_DEVICE_SUB(0x4355, BRCM_PCIE_VENDOR_ID_BROADCOM, 0x4355, WCC), BRCMF_PCIE_DEVICE(BRCM_PCIE_4354_RAW_DEVICE_ID, WCC), - BRCMF_PCIE_DEVICE(BRCM_PCIE_4355_DEVICE_ID, WCC), + BRCMF_PCIE_DEVICE(BRCM_PCIE_4355_DEVICE_ID, WCC_SEED), BRCMF_PCIE_DEVICE(BRCM_PCIE_4356_DEVICE_ID, WCC), BRCMF_PCIE_DEVICE(BRCM_PCIE_43567_DEVICE_ID, WCC), BRCMF_PCIE_DEVICE(BRCM_PCIE_43570_DEVICE_ID, WCC), @@ -2723,7 +2723,7 @@ static const struct pci_device_id brcmf_pcie_devid_table[] = { BRCMF_PCIE_DEVICE(BRCM_PCIE_43602_2G_DEVICE_ID, WCC), BRCMF_PCIE_DEVICE(BRCM_PCIE_43602_5G_DEVICE_ID, WCC), BRCMF_PCIE_DEVICE(BRCM_PCIE_43602_RAW_DEVICE_ID, WCC), - BRCMF_PCIE_DEVICE(BRCM_PCIE_4364_DEVICE_ID, WCC), + BRCMF_PCIE_DEVICE(BRCM_PCIE_4364_DEVICE_ID, WCC_SEED), BRCMF_PCIE_DEVICE(BRCM_PCIE_4365_DEVICE_ID, BCA), BRCMF_PCIE_DEVICE(BRCM_PCIE_4365_2G_DEVICE_ID, BCA), BRCMF_PCIE_DEVICE(BRCM_PCIE_4365_5G_DEVICE_ID, BCA), -- GitLab From f4c9c2cc827d803159730b1da813a0c595969831 Mon Sep 17 00:00:00 2001 From: Remi Pommarel Date: Tue, 28 Jan 2025 16:11:06 +0100 Subject: [PATCH 057/989] batman-adv: Fix incorrect offset in batadv_tt_tvlv_ogm_handler_v1() Since commit 4436df478860 ("batman-adv: Add flex array to struct
batadv_tvlv_tt_data"), the introduction of batadv_tvlv_tt_data's flex array member in batadv_tt_tvlv_ogm_handler_v1() put tt_changes at invalid offset. Those TT changes are supposed to be filled from the end of batadv_tvlv_tt_data structure (including vlan_data flexible array), but only the flex array size is taken into account missing completely the size of the fixed part of the structure itself. Fix the tt_change offset computation by using struct_size() instead of flex_array_size() so both flex array member and its container structure sizes are taken into account. Cc: stable@vger.kernel.org Fixes: 4436df478860 ("batman-adv: Add flex array to struct batadv_tvlv_tt_data") Signed-off-by: Remi Pommarel Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/translation-table.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 760d51fdbdf60..7d5de4cbb814f 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -3959,23 +3959,21 @@ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_change *tt_change; struct batadv_tvlv_tt_data *tt_data; u16 num_entries, num_vlan; - size_t flex_size; + size_t tt_data_sz; if (tvlv_value_len < sizeof(*tt_data)) return; tt_data = tvlv_value; - tvlv_value_len -= sizeof(*tt_data); - num_vlan = ntohs(tt_data->num_vlan); - flex_size = flex_array_size(tt_data, vlan_data, num_vlan); - if (tvlv_value_len < flex_size) + tt_data_sz = struct_size(tt_data, vlan_data, num_vlan); + if (tvlv_value_len < tt_data_sz) return; tt_change = (struct batadv_tvlv_tt_change *)((void *)tt_data - + flex_size); - tvlv_value_len -= flex_size; + + tt_data_sz); + tvlv_value_len -= tt_data_sz; num_entries = batadv_tt_entries(tvlv_value_len); -- GitLab From b9a49520679e98700d3d89689cc91c08a1c88c1d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 Jan 2025 00:55:32 +0100 Subject: [PATCH 058/989] rcuref: Plug slowpath race in rcuref_put() Kernel test robot reported an "imbalanced put" in the rcuref_put() slow path, which turned out to be a false positive. Consider the following race: ref = 0 (via rcuref_init(ref, 1)) T1 T2 rcuref_put(ref) -> atomic_add_negative_release(-1, ref) # ref -> 0xffffffff -> rcuref_put_slowpath(ref) rcuref_get(ref) -> atomic_add_negative_relaxed(1, &ref->refcnt) -> return true; # ref -> 0 rcuref_put(ref) -> atomic_add_negative_release(-1, ref) # ref -> 0xffffffff -> rcuref_put_slowpath() -> cnt = atomic_read(&ref->refcnt); # cnt -> 0xffffffff / RCUREF_NOREF -> atomic_try_cmpxchg_release(&ref->refcnt, &cnt, RCUREF_DEAD)) # ref -> 0xe0000000 / RCUREF_DEAD -> return true -> cnt = atomic_read(&ref->refcnt); # cnt -> 0xe0000000 / RCUREF_DEAD -> if (cnt > RCUREF_RELEASED) # 0xe0000000 > 0xc0000000 -> WARN_ONCE(cnt >= RCUREF_RELEASED, "rcuref - imbalanced put()") The problem is the additional read in the slow path (after it decremented to RCUREF_NOREF) which can happen after the counter has been marked RCUREF_DEAD. Prevent this by reusing the return value of the decrement. Now every "final" put uses RCUREF_NOREF in the slow path and attempts the final cmpxchg() to RCUREF_DEAD. 
[ bigeasy: Add changelog ] Fixes: ee1ee6db07795 ("atomics: Provide rcuref - scalable reference counting") Reported-by: kernel test robot Debugged-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Reviewed-by: Sebastian Andrzej Siewior Cc: stable@vger.kernel.org Closes: https://lore.kernel.org/oe-lkp/202412311453.9d7636a2-lkp@intel.com --- include/linux/rcuref.h | 9 ++++++--- lib/rcuref.c | 5 ++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h index 2c8bfd0f1b6b3..6322d8c1c6b42 100644 --- a/include/linux/rcuref.h +++ b/include/linux/rcuref.h @@ -71,27 +71,30 @@ static inline __must_check bool rcuref_get(rcuref_t *ref) return rcuref_get_slowpath(ref); } -extern __must_check bool rcuref_put_slowpath(rcuref_t *ref); +extern __must_check bool rcuref_put_slowpath(rcuref_t *ref, unsigned int cnt); /* * Internal helper. Do not invoke directly. */ static __always_inline __must_check bool __rcuref_put(rcuref_t *ref) { + int cnt; + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(), "suspicious rcuref_put_rcusafe() usage"); /* * Unconditionally decrease the reference count. The saturation and * dead zones provide enough tolerance for this. */ - if (likely(!atomic_add_negative_release(-1, &ref->refcnt))) + cnt = atomic_sub_return_release(1, &ref->refcnt); + if (likely(cnt >= 0)) return false; /* * Handle the last reference drop and cases inside the saturation * and dead zones. */ - return rcuref_put_slowpath(ref); + return rcuref_put_slowpath(ref, cnt); } /** diff --git a/lib/rcuref.c b/lib/rcuref.c index 97f300eca927c..5bd726b71e393 100644 --- a/lib/rcuref.c +++ b/lib/rcuref.c @@ -220,6 +220,7 @@ EXPORT_SYMBOL_GPL(rcuref_get_slowpath); /** * rcuref_put_slowpath - Slowpath of __rcuref_put() * @ref: Pointer to the reference count + * @cnt: The resulting value of the fastpath decrement * * Invoked when the reference count is outside of the valid zone. * @@ -233,10 +234,8 @@ EXPORT_SYMBOL_GPL(rcuref_get_slowpath); * with a concurrent get()/put() pair. Caller is not allowed to * deconstruct the protected object. */ -bool rcuref_put_slowpath(rcuref_t *ref) +bool rcuref_put_slowpath(rcuref_t *ref, unsigned int cnt) { - unsigned int cnt = atomic_read(&ref->refcnt); - /* Did this drop the last reference? */ if (likely(cnt == RCUREF_NOREF)) { /* -- GitLab From 7332537962956fab2c055b37e5e2e6a0d2a8d6bf Mon Sep 17 00:00:00 2001 From: Jared Kangas Date: Tue, 21 Jan 2025 06:25:04 -0800 Subject: [PATCH 059/989] bpf: Remove unnecessary BTF lookups in bpf_sk_storage_tracing_allowed When loading BPF programs, bpf_sk_storage_tracing_allowed() does a series of lookups to get a type name from the program's attach_btf_id, making the assumption that the type is present in the vmlinux BTF along the way. However, this results in btf_type_by_id() returning a null pointer if a non-vmlinux kernel BTF is attached to. 
Proof-of-concept on a kernel with CONFIG_IPV6=m: $ cat bpfcrash.c #include #include #include static int bpf(enum bpf_cmd cmd, union bpf_attr *attr) { return syscall(__NR_bpf, cmd, attr, sizeof(*attr)); } int main(void) { const int btf_fd = bpf(BPF_BTF_GET_FD_BY_ID, &(union bpf_attr) { .btf_id = BTF_ID, }); if (btf_fd < 0) return 1; const int bpf_sk_storage_get = 107; const struct bpf_insn insns[] = { { .code = BPF_JMP | BPF_CALL, .imm = bpf_sk_storage_get}, { .code = BPF_JMP | BPF_EXIT }, }; return bpf(BPF_PROG_LOAD, &(union bpf_attr) { .prog_type = BPF_PROG_TYPE_TRACING, .expected_attach_type = BPF_TRACE_FENTRY, .license = (unsigned long)"GPL", .insns = (unsigned long)&insns, .insn_cnt = sizeof(insns) / sizeof(insns[0]), .attach_btf_obj_fd = btf_fd, .attach_btf_id = TYPE_ID, }); } $ sudo bpftool btf list | grep ipv6 2: name [ipv6] size 928200B $ sudo bpftool btf dump id 2 | awk '$3 ~ /inet6_sock_destruct/' [130689] FUNC 'inet6_sock_destruct' type_id=130677 linkage=static $ gcc -D_DEFAULT_SOURCE -DBTF_ID=2 -DTYPE_ID=130689 \ bpfcrash.c -o bpfcrash $ sudo ./bpfcrash This causes a null pointer dereference: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 Call trace: bpf_sk_storage_tracing_allowed+0x8c/0xb0 P check_helper_call.isra.0+0xa8/0x1730 do_check+0xa18/0xb40 do_check_common+0x140/0x640 bpf_check+0xb74/0xcb8 bpf_prog_load+0x598/0x9a8 __sys_bpf+0x580/0x980 __arm64_sys_bpf+0x28/0x40 invoke_syscall.constprop.0+0x54/0xe8 do_el0_svc+0xb4/0xd0 el0_svc+0x44/0x1f8 el0t_64_sync_handler+0x13c/0x160 el0t_64_sync+0x184/0x188 Resolve this by using prog->aux->attach_func_name and removing the lookups. Fixes: 8e4597c627fb ("bpf: Allow using bpf_sk_storage in FENTRY/FEXIT/RAW_TP") Suggested-by: Martin KaFai Lau Signed-off-by: Jared Kangas Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250121142504.1369436-1-jkangas@redhat.com Signed-off-by: Alexei Starovoitov --- net/core/bpf_sk_storage.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 7d41cde1bcca6..2e538399757fe 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -355,11 +355,6 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) { - const struct btf *btf_vmlinux; - const struct btf_type *t; - const char *tname; - u32 btf_id; - if (prog->aux->dst_prog) return false; @@ -374,13 +369,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) return true; case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: - btf_vmlinux = bpf_get_btf_vmlinux(); - if (IS_ERR_OR_NULL(btf_vmlinux)) - return false; - btf_id = prog->aux->attach_btf_id; - t = btf_type_by_id(btf_vmlinux, btf_id); - tname = btf_name_by_offset(btf_vmlinux, t->name_off); - return !!strncmp(tname, "bpf_sk_storage", + return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage", strlen("bpf_sk_storage")); default: return false; -- GitLab From 6b3d638ca897e099fa99bd6d02189d3176f80a47 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Wed, 22 Jan 2025 00:06:42 +0900 Subject: [PATCH 060/989] bpf, test_run: Fix use-after-free issue in eth_skb_pkt_type() KMSAN reported a use-after-free issue in eth_skb_pkt_type()[1]. The cause of the issue was that eth_skb_pkt_type() accessed skb's data that didn't contain an Ethernet header. This occurs when bpf_prog_test_run_xdp() passes an invalid value as the user_data argument to bpf_test_init(). 
Fix this by returning an error when user_data is less than ETH_HLEN in bpf_test_init(). Additionally, remove the check for "if (user_size > size)" as it is unnecessary. [1] BUG: KMSAN: use-after-free in eth_skb_pkt_type include/linux/etherdevice.h:627 [inline] BUG: KMSAN: use-after-free in eth_type_trans+0x4ee/0x980 net/ethernet/eth.c:165 eth_skb_pkt_type include/linux/etherdevice.h:627 [inline] eth_type_trans+0x4ee/0x980 net/ethernet/eth.c:165 __xdp_build_skb_from_frame+0x5a8/0xa50 net/core/xdp.c:635 xdp_recv_frames net/bpf/test_run.c:272 [inline] xdp_test_run_batch net/bpf/test_run.c:361 [inline] bpf_test_run_xdp_live+0x2954/0x3330 net/bpf/test_run.c:390 bpf_prog_test_run_xdp+0x148e/0x1b10 net/bpf/test_run.c:1318 bpf_prog_test_run+0x5b7/0xa30 kernel/bpf/syscall.c:4371 __sys_bpf+0x6a6/0xe20 kernel/bpf/syscall.c:5777 __do_sys_bpf kernel/bpf/syscall.c:5866 [inline] __se_sys_bpf kernel/bpf/syscall.c:5864 [inline] __x64_sys_bpf+0xa4/0xf0 kernel/bpf/syscall.c:5864 x64_sys_call+0x2ea0/0x3d90 arch/x86/include/generated/asm/syscalls_64.h:322 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xd9/0x1d0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Uninit was created at: free_pages_prepare mm/page_alloc.c:1056 [inline] free_unref_page+0x156/0x1320 mm/page_alloc.c:2657 __free_pages+0xa3/0x1b0 mm/page_alloc.c:4838 bpf_ringbuf_free kernel/bpf/ringbuf.c:226 [inline] ringbuf_map_free+0xff/0x1e0 kernel/bpf/ringbuf.c:235 bpf_map_free kernel/bpf/syscall.c:838 [inline] bpf_map_free_deferred+0x17c/0x310 kernel/bpf/syscall.c:862 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0xa2b/0x1b60 kernel/workqueue.c:3310 worker_thread+0xedf/0x1550 kernel/workqueue.c:3391 kthread+0x535/0x6b0 kernel/kthread.c:389 ret_from_fork+0x6e/0x90 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 CPU: 1 UID: 0 PID: 17276 Comm: syz.1.16450 Not tainted 6.12.0-05490-g9bb88c659673 #8 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-3.fc41 04/01/2014 Fixes: be3d72a2896c ("bpf: move user_size out of bpf_test_init") Reported-by: syzkaller Suggested-by: Martin KaFai Lau Signed-off-by: Shigeru Yoshida Signed-off-by: Martin KaFai Lau Acked-by: Stanislav Fomichev Acked-by: Daniel Borkmann Link: https://patch.msgid.link/20250121150643.671650-1-syoshida@redhat.com Signed-off-by: Alexei Starovoitov --- net/bpf/test_run.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 8f6f7db48d4e4..7cb192cbd65f3 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -660,12 +660,9 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size, void __user *data_in = u64_to_user_ptr(kattr->test.data_in); void *data; - if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom) + if (user_size < ETH_HLEN || user_size > PAGE_SIZE - headroom - tailroom) return ERR_PTR(-EINVAL); - if (user_size > size) - return ERR_PTR(-EMSGSIZE); - size = SKB_DATA_ALIGN(size); data = kzalloc(size + headroom + tailroom, GFP_USER); if (!data) -- GitLab From c7f2188d68c114095660a950b7e880a1e5a71c8f Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Wed, 22 Jan 2025 00:06:43 +0900 Subject: [PATCH 061/989] selftests/bpf: Adjust data size to have ETH_HLEN The function bpf_test_init() now returns an error if user_size (.data_size_in) is less than ETH_HLEN, causing the tests to fail. Adjust the data size to ensure it meets the requirement of ETH_HLEN. 
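For reference, ETH_HLEN is 14 bytes (destination MAC, source MAC, ethertype), which is why the old 10-byte test buffers now fall below the minimum. The smallest setup the check accepts looks like the following sketch, mirroring the selftest hunks below:

    char data[ETH_HLEN] = {};               /* 14 bytes: dst MAC + src MAC + ethertype */
    DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
            .data_in = &data,
            .data_size_in = sizeof(data),   /* must be >= ETH_HLEN after the fix */
            .flags = BPF_F_TEST_XDP_LIVE_FRAMES,
            .repeat = 1,
    );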
Signed-off-by: Shigeru Yoshida Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250121150643.671650-2-syoshida@redhat.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c | 4 ++-- .../testing/selftests/bpf/prog_tests/xdp_devmap_attach.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c index c7f74f068e788..df27535995af8 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c @@ -52,10 +52,10 @@ static void test_xdp_with_cpumap_helpers(void) ASSERT_EQ(info.id, val.bpf_prog.id, "Match program id to cpumap entry prog_id"); /* send a packet to trigger any potential bugs in there */ - char data[10] = {}; + char data[ETH_HLEN] = {}; DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts, .data_in = &data, - .data_size_in = 10, + .data_size_in = sizeof(data), .flags = BPF_F_TEST_XDP_LIVE_FRAMES, .repeat = 1, ); diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c index 27ffed17d4be3..461ab18705d5c 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c @@ -23,7 +23,7 @@ static void test_xdp_with_devmap_helpers(void) __u32 len = sizeof(info); int err, dm_fd, dm_fd_redir, map_fd; struct nstoken *nstoken = NULL; - char data[10] = {}; + char data[ETH_HLEN] = {}; __u32 idx = 0; SYS(out_close, "ip netns add %s", TEST_NS); @@ -58,7 +58,7 @@ static void test_xdp_with_devmap_helpers(void) /* send a packet to trigger any potential bugs in there */ DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts, .data_in = &data, - .data_size_in = 10, + .data_size_in = sizeof(data), .flags = BPF_F_TEST_XDP_LIVE_FRAMES, .repeat = 1, ); @@ -158,7 +158,7 @@ static void test_xdp_with_devmap_helpers_veth(void) struct nstoken *nstoken = NULL; __u32 len = sizeof(info); int err, dm_fd, dm_fd_redir, map_fd, ifindex_dst; - char data[10] = {}; + char data[ETH_HLEN] = {}; __u32 idx = 0; SYS(out_close, "ip netns add %s", TEST_NS); @@ -208,7 +208,7 @@ static void test_xdp_with_devmap_helpers_veth(void) /* send a packet to trigger any potential bugs in there */ DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts, .data_in = &data, - .data_size_in = 10, + .data_size_in = sizeof(data), .flags = BPF_F_TEST_XDP_LIVE_FRAMES, .repeat = 1, ); -- GitLab From 98671a0fd1f14e4a518ee06b19037c20014900eb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Jan 2025 17:22:45 -0800 Subject: [PATCH 062/989] bpf: unify VM_WRITE vs VM_MAYWRITE use in BPF map mmaping logic For all BPF maps we ensure that VM_MAYWRITE is cleared when memory-mapping BPF map contents as initially read-only VMA. This is because in some cases BPF verifier relies on the underlying data to not be modified afterwards by user space, so once something is mapped read-only, it shouldn't be re-mmap'ed as read-write. As such, it's not necessary to check VM_MAYWRITE in bpf_map_mmap() and map->ops->map_mmap() callbacks: VM_WRITE should be consistently set for read-write mappings, and if VM_WRITE is not set, there is no way for user space to upgrade read-only mapping to read-write one. This patch cleans up this VM_WRITE vs VM_MAYWRITE handling within bpf_map_mmap(), which is an entry point for any BPF map mmap()-ing logic. 
We also drop unnecessary sanitization of VM_MAYWRITE in BPF ringbuf's map_mmap() callback implementation, as it is already performed by common code in bpf_map_mmap(). Note, though, that in bpf_map_mmap_{open,close}() callbacks we can't drop VM_MAYWRITE use, because it's possible (and is outside of subsystem's control) to have initially read-write memory mapping, which is subsequently dropped to read-only by user space through mprotect(). In such case, from BPF verifier POV it's read-write data throughout the lifetime of BPF map, and is counted as "active writer". But its VMAs will start out as VM_WRITE|VM_MAYWRITE, then mprotect() can change it to just VM_MAYWRITE (and no VM_WRITE), so when its finally munmap()'ed and bpf_map_mmap_close() is called, vm_flags will be just VM_MAYWRITE, but we still need to decrement active writer count with bpf_map_write_active_dec() as it's still considered to be a read-write mapping by the rest of BPF subsystem. Similar reasoning applies to bpf_map_mmap_open(), which is called whenever mmap(), munmap(), and/or mprotect() forces mm subsystem to split original VMA into multiple discontiguous VMAs. Memory-mapping handling is a bit tricky, yes. Cc: Jann Horn Cc: Suren Baghdasaryan Cc: Shakeel Butt Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250129012246.1515826-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/ringbuf.c | 4 ---- kernel/bpf/syscall.c | 10 ++++++++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index e1cfe890e0be6..1499d8caa9a35 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -268,8 +268,6 @@ static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma /* allow writable mapping for the consumer_pos only */ if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE) return -EPERM; - } else { - vm_flags_clear(vma, VM_MAYWRITE); } /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, @@ -289,8 +287,6 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma * position, and the ring buffer data itself. */ return -EPERM; - } else { - vm_flags_clear(vma, VM_MAYWRITE); } /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0daf098e32074..9bec3dce421f0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1065,15 +1065,21 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_ops = &bpf_map_default_vmops; vma->vm_private_data = map; vm_flags_clear(vma, VM_MAYEXEC); + /* If mapping is read-only, then disallow potentially re-mapping with + * PROT_WRITE by dropping VM_MAYWRITE flag. 
This VM_MAYWRITE clearing + * means that as far as BPF map's memory-mapped VMAs are concerned, + * VM_WRITE and VM_MAYWRITE and equivalent, if one of them is set, + * both should be set, so we can forget about VM_MAYWRITE and always + * check just VM_WRITE + */ if (!(vma->vm_flags & VM_WRITE)) - /* disallow re-mapping with PROT_WRITE */ vm_flags_clear(vma, VM_MAYWRITE); err = map->ops->map_mmap(map, vma); if (err) goto out; - if (vma->vm_flags & VM_MAYWRITE) + if (vma->vm_flags & VM_WRITE) bpf_map_write_active_inc(map); out: mutex_unlock(&map->freeze_mutex); -- GitLab From bc27c52eea189e8f7492d40739b7746d67b65beb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Jan 2025 17:22:46 -0800 Subject: [PATCH 063/989] bpf: avoid holding freeze_mutex during mmap operation We use map->freeze_mutex to prevent races between map_freeze() and memory mapping BPF map contents with writable permissions. The way we naively do this means we'll hold freeze_mutex for entire duration of all the mm and VMA manipulations, which is completely unnecessary. This can potentially also lead to deadlocks, as reported by syzbot in [0]. So, instead, hold freeze_mutex only during writeability checks, bump (proactively) "write active" count for the map, unlock the mutex and proceed with mmap logic. And only if something went wrong during mmap logic, then undo that "write active" counter increment. [0] https://lore.kernel.org/bpf/678dcbc9.050a0220.303755.0066.GAE@google.com/ Fixes: fc9702273e2e ("bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY") Reported-by: syzbot+4dc041c686b7c816a71e@syzkaller.appspotmail.com Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20250129012246.1515826-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9bec3dce421f0..14d6e99459d32 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1035,7 +1035,7 @@ static const struct vm_operations_struct bpf_map_default_vmops = { static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) { struct bpf_map *map = filp->private_data; - int err; + int err = 0; if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record)) return -ENOTSUPP; @@ -1059,7 +1059,12 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) err = -EACCES; goto out; } + bpf_map_write_active_inc(map); } +out: + mutex_unlock(&map->freeze_mutex); + if (err) + return err; /* set default open/close callbacks */ vma->vm_ops = &bpf_map_default_vmops; @@ -1076,13 +1081,11 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) vm_flags_clear(vma, VM_MAYWRITE); err = map->ops->map_mmap(map, vma); - if (err) - goto out; + if (err) { + if (vma->vm_flags & VM_WRITE) + bpf_map_write_active_dec(map); + } - if (vma->vm_flags & VM_WRITE) - bpf_map_write_active_inc(map); -out: - mutex_unlock(&map->freeze_mutex); return err; } -- GitLab From e1e17a1715982201034024863efbf238bee2bdf9 Mon Sep 17 00:00:00 2001 From: Prasad Pandit Date: Mon, 11 Mar 2024 16:21:22 +0530 Subject: [PATCH 064/989] firmware: iscsi_ibft: fix ISCSI_IBFT Kconfig entry Fix ISCSI_IBFT Kconfig entry, replace tab with a space character. 
Fixes: 138fe4e0697 ("Firmware: add iSCSI iBFT Support") Signed-off-by: Prasad Pandit Signed-off-by: Konrad Rzeszutek Wilk --- drivers/firmware/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig index 71d8b26c4103b..9f35f69e0f9e2 100644 --- a/drivers/firmware/Kconfig +++ b/drivers/firmware/Kconfig @@ -106,7 +106,7 @@ config ISCSI_IBFT select ISCSI_BOOT_SYSFS select ISCSI_IBFT_FIND if X86 depends on ACPI && SCSI && SCSI_LOWLEVEL - default n + default n help This option enables support for detection and exposing of iSCSI Boot Firmware Table (iBFT) via sysfs to userspace. If you wish to -- GitLab From 07e0d99a2f701123ad3104c0f1a1e66bce74d6e5 Mon Sep 17 00:00:00 2001 From: Chengen Du Date: Tue, 14 Jan 2025 12:12:34 +0800 Subject: [PATCH 065/989] iscsi_ibft: Fix UBSAN shift-out-of-bounds warning in ibft_attr_show_nic() When performing an iSCSI boot using IPv6, iscsistart still reads the /sys/firmware/ibft/ethernetX/subnet-mask entry. Since the IPv6 prefix length is 64, this causes the shift exponent to become negative, triggering a UBSAN warning. As the concept of a subnet mask does not apply to IPv6, the value is set to ~0 to suppress the warning message. Signed-off-by: Chengen Du Signed-off-by: Konrad Rzeszutek Wilk --- drivers/firmware/iscsi_ibft.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/iscsi_ibft.c b/drivers/firmware/iscsi_ibft.c index 6e9788324fea5..371f24569b3b2 100644 --- a/drivers/firmware/iscsi_ibft.c +++ b/drivers/firmware/iscsi_ibft.c @@ -310,7 +310,10 @@ static ssize_t ibft_attr_show_nic(void *data, int type, char *buf) str += sprintf_ipaddr(str, nic->ip_addr); break; case ISCSI_BOOT_ETH_SUBNET_MASK: - val = cpu_to_be32(~((1 << (32-nic->subnet_mask_prefix))-1)); + if (nic->subnet_mask_prefix > 32) + val = cpu_to_be32(~0); + else + val = cpu_to_be32(~((1 << (32-nic->subnet_mask_prefix))-1)); str += sprintf(str, "%pI4", &val); break; case ISCSI_BOOT_ETH_PREFIX_LEN: -- GitLab From 79fc672a092d93a7eac24fe20a571d4efd8fa5a4 Mon Sep 17 00:00:00 2001 From: Haoxiang Li Date: Thu, 19 Dec 2024 17:02:56 +0800 Subject: [PATCH 066/989] drm/komeda: Add check for komeda_get_layer_fourcc_list() Add check for the return value of komeda_get_layer_fourcc_list() to catch the potential exception. 
Fixes: 5d51f6c0da1b ("drm/komeda: Add writeback support") Cc: stable@vger.kernel.org Signed-off-by: Haoxiang Li Acked-by: Liviu Dudau Link: https://lore.kernel.org/r/20241219090256.146424-1-haoxiang_li2024@163.com Signed-off-by: Liviu Dudau --- drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c b/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c index ebccb74306a76..f30b3d5eeca5c 100644 --- a/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c +++ b/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c @@ -160,6 +160,10 @@ static int komeda_wb_connector_add(struct komeda_kms_dev *kms, formats = komeda_get_layer_fourcc_list(&mdev->fmt_tbl, kwb_conn->wb_layer->layer_type, &n_formats); + if (!formats) { + kfree(kwb_conn); + return -ENOMEM; + } err = drm_writeback_connector_init(&kms->base, wb_conn, &komeda_wb_connector_funcs, -- GitLab From 0532a79efd68a4d9686b0385e4993af4b130ff82 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 22 Jan 2025 18:09:13 +0800 Subject: [PATCH 067/989] strparser: Add read_sock callback Added a new read_sock handler, allowing users to customize read operations instead of relying on the native socket's read_sock. Signed-off-by: Jiayuan Chen Signed-off-by: Martin KaFai Lau Reviewed-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://patch.msgid.link/20250122100917.49845-2-mrpre@163.com --- Documentation/networking/strparser.rst | 9 ++++++++- include/net/strparser.h | 2 ++ net/strparser/strparser.c | 11 +++++++++-- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/strparser.rst b/Documentation/networking/strparser.rst index 6cab1f74ae05a..7f623d1db72aa 100644 --- a/Documentation/networking/strparser.rst +++ b/Documentation/networking/strparser.rst @@ -112,7 +112,7 @@ Functions Callbacks ========= -There are six callbacks: +There are seven callbacks: :: @@ -182,6 +182,13 @@ There are six callbacks: the length of the message. skb->len - offset may be greater then full_len since strparser does not trim the skb. + :: + + int (*read_sock)(struct strparser *strp, read_descriptor_t *desc, + sk_read_actor_t recv_actor); + + The read_sock callback is used by strparser instead of + sock->ops->read_sock, if provided. 
:: int (*read_sock_done)(struct strparser *strp, int err); diff --git a/include/net/strparser.h b/include/net/strparser.h index 41e2ce9e9e10f..0a83010b3a64a 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -43,6 +43,8 @@ struct strparser; struct strp_callbacks { int (*parse_msg)(struct strparser *strp, struct sk_buff *skb); void (*rcv_msg)(struct strparser *strp, struct sk_buff *skb); + int (*read_sock)(struct strparser *strp, read_descriptor_t *desc, + sk_read_actor_t recv_actor); int (*read_sock_done)(struct strparser *strp, int err); void (*abort_parser)(struct strparser *strp, int err); void (*lock)(struct strparser *strp); diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 8299ceb3e3739..95696f42647ec 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -347,7 +347,10 @@ static int strp_read_sock(struct strparser *strp) struct socket *sock = strp->sk->sk_socket; read_descriptor_t desc; - if (unlikely(!sock || !sock->ops || !sock->ops->read_sock)) + if (unlikely(!sock || !sock->ops)) + return -EBUSY; + + if (unlikely(!strp->cb.read_sock && !sock->ops->read_sock)) return -EBUSY; desc.arg.data = strp; @@ -355,7 +358,10 @@ static int strp_read_sock(struct strparser *strp) desc.count = 1; /* give more than one skb per call */ /* sk should be locked here, so okay to do read_sock */ - sock->ops->read_sock(strp->sk, &desc, strp_recv); + if (strp->cb.read_sock) + strp->cb.read_sock(strp, &desc, strp_recv); + else + sock->ops->read_sock(strp->sk, &desc, strp_recv); desc.error = strp->cb.read_sock_done(strp, desc.error); @@ -468,6 +474,7 @@ int strp_init(struct strparser *strp, struct sock *sk, strp->cb.unlock = cb->unlock ? : strp_sock_unlock; strp->cb.rcv_msg = cb->rcv_msg; strp->cb.parse_msg = cb->parse_msg; + strp->cb.read_sock = cb->read_sock; strp->cb.read_sock_done = cb->read_sock_done ? : default_read_sock_done; strp->cb.abort_parser = cb->abort_parser ? : strp_abort_strp; -- GitLab From 36b62df5683c315ba58c950f1a9c771c796c30ec Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 22 Jan 2025 18:09:14 +0800 Subject: [PATCH 068/989] bpf: Fix wrong copied_seq calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'sk->copied_seq' was updated in the tcp_eat_skb() function when the action of a BPF program was SK_REDIRECT. For other actions, like SK_PASS, the update logic for 'sk->copied_seq' was moved to tcp_bpf_recvmsg_parser() to ensure the accuracy of the 'fionread' feature. It works for a single stream_verdict scenario, as it also modified sk_data_ready->sk_psock_verdict_data_ready->tcp_read_skb to remove updating 'sk->copied_seq'. However, for programs where both stream_parser and stream_verdict are active (strparser purpose), tcp_read_sock() was used instead of tcp_read_skb() (sk_data_ready->strp_data_ready->tcp_read_sock). tcp_read_sock() now still updates 'sk->copied_seq', leading to duplicate updates. In summary, for strparser + SK_PASS, copied_seq is redundantly calculated in both tcp_read_sock() and tcp_bpf_recvmsg_parser(). The issue causes incorrect copied_seq calculations, which prevent correct data reads from the recv() interface in user-land. We do not want to add new proto_ops to implement a new version of tcp_read_sock, as this would introduce code complexity [1]. We could have added noack and copied_seq to desc, and then called ops->read_sock. However, unfortunately, other modules didn’t fully initialize desc to zero. 
So, for now, we are directly calling tcp_read_sock_noack() in tcp_bpf.c. [1]: https://lore.kernel.org/bpf/20241218053408.437295-1-mrpre@163.com Fixes: e5c6de5fa025 ("bpf, sockmap: Incorrectly handling copied_seq") Suggested-by: Jakub Sitnicki Signed-off-by: Jiayuan Chen Signed-off-by: Martin KaFai Lau Reviewed-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://patch.msgid.link/20250122100917.49845-3-mrpre@163.com --- include/linux/skmsg.h | 2 ++ include/net/tcp.h | 8 ++++++++ net/core/skmsg.c | 7 +++++++ net/ipv4/tcp.c | 29 ++++++++++++++++++++++++----- net/ipv4/tcp_bpf.c | 36 ++++++++++++++++++++++++++++++++++++ 5 files changed, 77 insertions(+), 5 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 2cbe0c22a32f3..0b9095a281b89 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -91,6 +91,8 @@ struct sk_psock { struct sk_psock_progs progs; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) struct strparser strp; + u32 copied_seq; + u32 ingress_bytes; #endif struct sk_buff_head ingress_skb; struct list_head ingress_msg; diff --git a/include/net/tcp.h b/include/net/tcp.h index 5b2b04835688f..9c044fb9ab26e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -729,6 +729,9 @@ void tcp_get_info(struct sock *, struct tcp_info *); /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); +int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor, bool noack, + u32 *copied_seq); int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor); struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off); void tcp_read_done(struct sock *sk, size_t len); @@ -2599,6 +2602,11 @@ struct sk_psock; #ifdef CONFIG_BPF_SYSCALL int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); +#ifdef CONFIG_BPF_STREAM_PARSER +struct strparser; +int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc, + sk_read_actor_t recv_actor); +#endif /* CONFIG_BPF_STREAM_PARSER */ #endif /* CONFIG_BPF_SYSCALL */ #ifdef CONFIG_INET diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 61f3f3d4e5285..0ddc4c7188332 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -549,6 +549,9 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, return num_sge; } +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) + psock->ingress_bytes += len; +#endif copied = len; msg->sg.start = 0; msg->sg.size = copied; @@ -1144,6 +1147,10 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) if (!ret) sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED); + if (sk_is_tcp(sk)) { + psock->strp.cb.read_sock = tcp_bpf_strp_read_sock; + psock->copied_seq = tcp_sk(sk)->copied_seq; + } return ret; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0d704bda6c416..285678d8ce077 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1565,12 +1565,13 @@ EXPORT_SYMBOL(tcp_recv_skb); * or for 'peeking' the socket using this routine * (although both would be easy to implement). 
*/ -int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor) +static int __tcp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor, bool noack, + u32 *copied_seq) { struct sk_buff *skb; struct tcp_sock *tp = tcp_sk(sk); - u32 seq = tp->copied_seq; + u32 seq = *copied_seq; u32 offset; int copied = 0; @@ -1624,9 +1625,12 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, tcp_eat_recv_skb(sk, skb); if (!desc->count) break; - WRITE_ONCE(tp->copied_seq, seq); + WRITE_ONCE(*copied_seq, seq); } - WRITE_ONCE(tp->copied_seq, seq); + WRITE_ONCE(*copied_seq, seq); + + if (noack) + goto out; tcp_rcv_space_adjust(sk); @@ -1635,10 +1639,25 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, tcp_recv_skb(sk, seq, &offset); tcp_cleanup_rbuf(sk, copied); } +out: return copied; } + +int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + return __tcp_read_sock(sk, desc, recv_actor, false, + &tcp_sk(sk)->copied_seq); +} EXPORT_SYMBOL(tcp_read_sock); +int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor, bool noack, + u32 *copied_seq) +{ + return __tcp_read_sock(sk, desc, recv_actor, noack, copied_seq); +} + int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct sk_buff *skb; diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 47f65b1b70ca2..ba581785adb4b 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -646,6 +646,42 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops) ops->sendmsg == tcp_sendmsg ? 0 : -ENOTSUPP; } +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) +int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + struct sock *sk = strp->sk; + struct sk_psock *psock; + struct tcp_sock *tp; + int copied = 0; + + tp = tcp_sk(sk); + rcu_read_lock(); + psock = sk_psock(sk); + if (WARN_ON_ONCE(!psock)) { + desc->error = -EINVAL; + goto out; + } + + psock->ingress_bytes = 0; + copied = tcp_read_sock_noack(sk, desc, recv_actor, true, + &psock->copied_seq); + if (copied < 0) + goto out; + /* recv_actor may redirect skb to another socket (SK_REDIRECT) or + * just put skb into ingress queue of current socket (SK_PASS). + * For SK_REDIRECT, we need to ack the frame immediately but for + * SK_PASS, we want to delay the ack until tcp_bpf_recvmsg_parser(). + */ + tp->copied_seq = psock->copied_seq - psock->ingress_bytes; + tcp_rcv_space_adjust(sk); + __tcp_cleanup_rbuf(sk, copied - psock->ingress_bytes); +out: + rcu_read_unlock(); + return copied; +} +#endif /* CONFIG_BPF_STREAM_PARSER */ + int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; -- GitLab From 5459cce6bf49e72ee29be21865869c2ac42419f5 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 22 Jan 2025 18:09:15 +0800 Subject: [PATCH 069/989] bpf: Disable non stream socket for strparser Currently, only TCP supports strparser, but sockmap doesn't intercept non-TCP connections to attach strparser. For example, with UDP, although the read/write handlers are replaced, strparser is not executed due to the lack of a read_sock operation. Furthermore, in udp_bpf_recvmsg(), it checks whether the psock has data, and if not, it falls back to the native UDP read interface, making UDP + strparser appear to read correctly. According to its commit history, this behavior is unexpected. 
Moreover, since UDP lacks the concept of streams, we intercept it directly. Fixes: 1fa1fe8ff161 ("bpf, sockmap: Test shutdown() correctly exits epoll and recv()=0") Signed-off-by: Jiayuan Chen Signed-off-by: Martin KaFai Lau Acked-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://patch.msgid.link/20250122100917.49845-4-mrpre@163.com --- net/core/sock_map.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index f1b9b3958792c..3b0f59d9b4db8 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -303,7 +303,10 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) write_lock_bh(&sk->sk_callback_lock); if (stream_parser && stream_verdict && !psock->saved_data_ready) { - ret = sk_psock_init_strp(sk, psock); + if (sk_is_tcp(sk)) + ret = sk_psock_init_strp(sk, psock); + else + ret = -EOPNOTSUPP; if (ret) { write_unlock_bh(&sk->sk_callback_lock); sk_psock_put(sk, psock); -- GitLab From a0c11149509aa905aeec10cf9998091443472b0b Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 22 Jan 2025 18:09:16 +0800 Subject: [PATCH 070/989] selftests/bpf: Fix invalid flag of recv() SOCK_NONBLOCK flag is only effective during socket creation, not during recv. Use MSG_DONTWAIT instead. Signed-off-by: Jiayuan Chen Signed-off-by: Martin KaFai Lau Acked-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://patch.msgid.link/20250122100917.49845-5-mrpre@163.com --- tools/testing/selftests/bpf/prog_tests/sockmap_basic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 884ad87783d59..0c51b7288978e 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -522,8 +522,8 @@ static void test_sockmap_skb_verdict_shutdown(void) if (!ASSERT_EQ(err, 1, "epoll_wait(fd)")) goto out_close; - n = recv(c1, &b, 1, SOCK_NONBLOCK); - ASSERT_EQ(n, 0, "recv_timeout(fin)"); + n = recv(c1, &b, 1, MSG_DONTWAIT); + ASSERT_EQ(n, 0, "recv(fin)"); out_close: close(c1); close(p1); @@ -628,7 +628,7 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog) ASSERT_EQ(avail, expected, "ioctl(FIONREAD)"); /* On DROP test there will be no data to read */ if (pass_prog) { - recvd = recv_timeout(c1, &buf, sizeof(buf), SOCK_NONBLOCK, IO_TIMEOUT_SEC); + recvd = recv_timeout(c1, &buf, sizeof(buf), MSG_DONTWAIT, IO_TIMEOUT_SEC); ASSERT_EQ(recvd, sizeof(buf), "recv_timeout(c0)"); } -- GitLab From 6fcfe96e0f6e9bebe1b185f1548a9a8cb1b68dea Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 22 Jan 2025 18:09:17 +0800 Subject: [PATCH 071/989] selftests/bpf: Add strparser test for bpf Add test cases for bpf + strparser and separated them from sockmap_basic, as strparser has more encapsulation and parsing capabilities compared to standard sockmap. 
Signed-off-by: Jiayuan Chen Signed-off-by: Martin KaFai Lau Acked-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://patch.msgid.link/20250122100917.49845-6-mrpre@163.com --- .../selftests/bpf/prog_tests/sockmap_basic.c | 53 -- .../selftests/bpf/prog_tests/sockmap_strp.c | 454 ++++++++++++++++++ .../selftests/bpf/progs/test_sockmap_strp.c | 53 ++ 3 files changed, 507 insertions(+), 53 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/sockmap_strp.c create mode 100644 tools/testing/selftests/bpf/progs/test_sockmap_strp.c diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 0c51b7288978e..f8953455db29f 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -531,57 +531,6 @@ static void test_sockmap_skb_verdict_shutdown(void) test_sockmap_pass_prog__destroy(skel); } -static void test_sockmap_stream_pass(void) -{ - int zero = 0, sent, recvd; - int verdict, parser; - int err, map; - int c = -1, p = -1; - struct test_sockmap_pass_prog *pass = NULL; - char snd[256] = "0123456789"; - char rcv[256] = "0"; - - pass = test_sockmap_pass_prog__open_and_load(); - verdict = bpf_program__fd(pass->progs.prog_skb_verdict); - parser = bpf_program__fd(pass->progs.prog_skb_parser); - map = bpf_map__fd(pass->maps.sock_map_rx); - - err = bpf_prog_attach(parser, map, BPF_SK_SKB_STREAM_PARSER, 0); - if (!ASSERT_OK(err, "bpf_prog_attach stream parser")) - goto out; - - err = bpf_prog_attach(verdict, map, BPF_SK_SKB_STREAM_VERDICT, 0); - if (!ASSERT_OK(err, "bpf_prog_attach stream verdict")) - goto out; - - err = create_pair(AF_INET, SOCK_STREAM, &c, &p); - if (err) - goto out; - - /* sk_data_ready of 'p' will be replaced by strparser handler */ - err = bpf_map_update_elem(map, &zero, &p, BPF_NOEXIST); - if (!ASSERT_OK(err, "bpf_map_update_elem(p)")) - goto out_close; - - /* - * as 'prog_skb_parser' return the original skb len and - * 'prog_skb_verdict' return SK_PASS, the kernel will just - * pass it through to original socket 'p' - */ - sent = xsend(c, snd, sizeof(snd), 0); - ASSERT_EQ(sent, sizeof(snd), "xsend(c)"); - - recvd = recv_timeout(p, rcv, sizeof(rcv), SOCK_NONBLOCK, - IO_TIMEOUT_SEC); - ASSERT_EQ(recvd, sizeof(rcv), "recv_timeout(p)"); - -out_close: - close(c); - close(p); - -out: - test_sockmap_pass_prog__destroy(pass); -} static void test_sockmap_skb_verdict_fionread(bool pass_prog) { @@ -1101,8 +1050,6 @@ void test_sockmap_basic(void) test_sockmap_progs_query(BPF_SK_SKB_VERDICT); if (test__start_subtest("sockmap skb_verdict shutdown")) test_sockmap_skb_verdict_shutdown(); - if (test__start_subtest("sockmap stream parser and verdict pass")) - test_sockmap_stream_pass(); if (test__start_subtest("sockmap skb_verdict fionread")) test_sockmap_skb_verdict_fionread(true); if (test__start_subtest("sockmap skb_verdict fionread on drop")) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_strp.c b/tools/testing/selftests/bpf/prog_tests/sockmap_strp.c new file mode 100644 index 0000000000000..621b3b71888ef --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_strp.c @@ -0,0 +1,454 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include "sockmap_helpers.h" +#include "test_skmsg_load_helpers.skel.h" +#include "test_sockmap_strp.skel.h" + +#define STRP_PKT_HEAD_LEN 4 +#define STRP_PKT_BODY_LEN 6 +#define STRP_PKT_FULL_LEN (STRP_PKT_HEAD_LEN + STRP_PKT_BODY_LEN) + +static const char 
packet[STRP_PKT_FULL_LEN] = "head+body\0"; +static const int test_packet_num = 100; + +/* Current implementation of tcp_bpf_recvmsg_parser() invokes data_ready + * with sk held if an skb exists in sk_receive_queue. Then for the + * data_ready implementation of strparser, it will delay the read + * operation if sk is held and EAGAIN is returned. + */ +static int sockmap_strp_consume_pre_data(int p) +{ + int recvd; + bool retried = false; + char rcv[10]; + +retry: + errno = 0; + recvd = recv_timeout(p, rcv, sizeof(rcv), 0, 1); + if (recvd < 0 && errno == EAGAIN && retried == false) { + /* On the first call, EAGAIN will certainly be returned. + * A 1-second wait is enough for the workqueue to finish. + */ + sleep(1); + retried = true; + goto retry; + } + + if (!ASSERT_EQ(recvd, STRP_PKT_FULL_LEN, "recv error or truncated data") || + !ASSERT_OK(memcmp(packet, rcv, STRP_PKT_FULL_LEN), + "data mismatch")) + return -1; + return 0; +} + +static struct test_sockmap_strp *sockmap_strp_init(int *out_map, bool pass, + bool need_parser) +{ + struct test_sockmap_strp *strp = NULL; + int verdict, parser; + int err; + + strp = test_sockmap_strp__open_and_load(); + *out_map = bpf_map__fd(strp->maps.sock_map); + + if (need_parser) + parser = bpf_program__fd(strp->progs.prog_skb_parser_partial); + else + parser = bpf_program__fd(strp->progs.prog_skb_parser); + + if (pass) + verdict = bpf_program__fd(strp->progs.prog_skb_verdict_pass); + else + verdict = bpf_program__fd(strp->progs.prog_skb_verdict); + + err = bpf_prog_attach(parser, *out_map, BPF_SK_SKB_STREAM_PARSER, 0); + if (!ASSERT_OK(err, "bpf_prog_attach stream parser")) + goto err; + + err = bpf_prog_attach(verdict, *out_map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach stream verdict")) + goto err; + + return strp; +err: + test_sockmap_strp__destroy(strp); + return NULL; +} + +/* Dispatch packets to different socket by packet size: + * + * ------ ------ + * | pkt4 || pkt1 |... > remote socket + * ------ ------ / ------ ------ + * | pkt8 | pkt7 |... + * ------ ------ \ ------ ------ + * | pkt3 || pkt2 |... 
> local socket + * ------ ------ + */ +static void test_sockmap_strp_dispatch_pkt(int family, int sotype) +{ + int i, j, zero = 0, one = 1, recvd; + int err, map; + int c0 = -1, p0 = -1, c1 = -1, p1 = -1; + struct test_sockmap_strp *strp = NULL; + int test_cnt = 6; + char rcv[10]; + struct { + char data[7]; + int data_len; + int send_cnt; + int *receiver; + } send_dir[2] = { + /* data expected to deliver to local */ + {"llllll", 6, 0, &p0}, + /* data expected to deliver to remote */ + {"rrrrr", 5, 0, &c1} + }; + + strp = sockmap_strp_init(&map, false, false); + if (!ASSERT_TRUE(strp, "sockmap_strp_init")) + return; + + err = create_socket_pairs(family, sotype, &c0, &c1, &p0, &p1); + if (!ASSERT_OK(err, "create_socket_pairs()")) + goto out; + + err = bpf_map_update_elem(map, &zero, &p0, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem(p0)")) + goto out_close; + + err = bpf_map_update_elem(map, &one, &p1, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem(p1)")) + goto out_close; + + err = setsockopt(c1, IPPROTO_TCP, TCP_NODELAY, &zero, sizeof(zero)); + if (!ASSERT_OK(err, "setsockopt(TCP_NODELAY)")) + goto out_close; + + /* deliver data with data size greater than 5 to local */ + strp->data->verdict_max_size = 5; + + for (i = 0; i < test_cnt; i++) { + int d = i % 2; + + xsend(c0, send_dir[d].data, send_dir[d].data_len, 0); + send_dir[d].send_cnt++; + } + + for (i = 0; i < 2; i++) { + for (j = 0; j < send_dir[i].send_cnt; j++) { + int expected = send_dir[i].data_len; + + recvd = recv_timeout(*send_dir[i].receiver, rcv, + expected, MSG_DONTWAIT, + IO_TIMEOUT_SEC); + if (!ASSERT_EQ(recvd, expected, "recv_timeout()")) + goto out_close; + if (!ASSERT_OK(memcmp(send_dir[i].data, rcv, recvd), + "data mismatch")) + goto out_close; + } + } +out_close: + close(c0); + close(c1); + close(p0); + close(p1); +out: + test_sockmap_strp__destroy(strp); +} + +/* We have multiple packets in one skb + * ------------ ------------ ------------ + * | packet1 | packet2 | ... 
+ * ------------ ------------ ------------ + */ +static void test_sockmap_strp_multiple_pkt(int family, int sotype) +{ + int i, zero = 0; + int sent, recvd, total; + int err, map; + int c = -1, p = -1; + struct test_sockmap_strp *strp = NULL; + char *snd = NULL, *rcv = NULL; + + strp = sockmap_strp_init(&map, true, true); + if (!ASSERT_TRUE(strp, "sockmap_strp_init")) + return; + + err = create_pair(family, sotype, &c, &p); + if (err) + goto out; + + err = bpf_map_update_elem(map, &zero, &p, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem(zero, p)")) + goto out_close; + + /* construct multiple packets in one buffer */ + total = test_packet_num * STRP_PKT_FULL_LEN; + snd = malloc(total); + rcv = malloc(total + 1); + if (!ASSERT_TRUE(snd, "malloc(snd)") || + !ASSERT_TRUE(rcv, "malloc(rcv)")) + goto out_close; + + for (i = 0; i < test_packet_num; i++) { + memcpy(snd + i * STRP_PKT_FULL_LEN, + packet, STRP_PKT_FULL_LEN); + } + + sent = xsend(c, snd, total, 0); + if (!ASSERT_EQ(sent, total, "xsend(c)")) + goto out_close; + + /* try to recv one more byte to avoid truncation check */ + recvd = recv_timeout(p, rcv, total + 1, MSG_DONTWAIT, IO_TIMEOUT_SEC); + if (!ASSERT_EQ(recvd, total, "recv(rcv)")) + goto out_close; + + /* we sent TCP segment with multiple encapsulation + * then check whether packets are handled correctly + */ + if (!ASSERT_OK(memcmp(snd, rcv, total), "data mismatch")) + goto out_close; + +out_close: + close(c); + close(p); + if (snd) + free(snd); + if (rcv) + free(rcv); +out: + test_sockmap_strp__destroy(strp); +} + +/* Test strparser with partial read */ +static void test_sockmap_strp_partial_read(int family, int sotype) +{ + int zero = 0, recvd, off; + int err, map; + int c = -1, p = -1; + struct test_sockmap_strp *strp = NULL; + char rcv[STRP_PKT_FULL_LEN + 1] = "0"; + + strp = sockmap_strp_init(&map, true, true); + if (!ASSERT_TRUE(strp, "sockmap_strp_init")) + return; + + err = create_pair(family, sotype, &c, &p); + if (err) + goto out; + + /* sk_data_ready of 'p' will be replaced by strparser handler */ + err = bpf_map_update_elem(map, &zero, &p, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem(zero, p)")) + goto out_close; + + /* 1.1 send partial head, 1 byte header left */ + off = STRP_PKT_HEAD_LEN - 1; + xsend(c, packet, off, 0); + recvd = recv_timeout(p, rcv, sizeof(rcv), MSG_DONTWAIT, 1); + if (!ASSERT_EQ(-1, recvd, "partial head sent, expected no data")) + goto out_close; + + /* 1.2 send remaining head and body */ + xsend(c, packet + off, STRP_PKT_FULL_LEN - off, 0); + recvd = recv_timeout(p, rcv, sizeof(rcv), MSG_DONTWAIT, IO_TIMEOUT_SEC); + if (!ASSERT_EQ(recvd, STRP_PKT_FULL_LEN, "expected full data")) + goto out_close; + + /* 2.1 send partial head, 1 byte header left */ + off = STRP_PKT_HEAD_LEN - 1; + xsend(c, packet, off, 0); + + /* 2.2 send remaining head and partial body, 1 byte body left */ + xsend(c, packet + off, STRP_PKT_FULL_LEN - off - 1, 0); + off = STRP_PKT_FULL_LEN - 1; + recvd = recv_timeout(p, rcv, sizeof(rcv), MSG_DONTWAIT, 1); + if (!ASSERT_EQ(-1, recvd, "partial body sent, expected no data")) + goto out_close; + + /* 2.3 send remaining body */ + xsend(c, packet + off, STRP_PKT_FULL_LEN - off, 0); + recvd = recv_timeout(p, rcv, sizeof(rcv), MSG_DONTWAIT, IO_TIMEOUT_SEC); + if (!ASSERT_EQ(recvd, STRP_PKT_FULL_LEN, "expected full data")) + goto out_close; + +out_close: + close(c); + close(p); + +out: + test_sockmap_strp__destroy(strp); +} + +/* Test simple socket read/write with strparser + FIONREAD */ +static void 
test_sockmap_strp_pass(int family, int sotype, bool fionread) +{ + int zero = 0, pkt_size = STRP_PKT_FULL_LEN, sent, recvd, avail; + int err, map; + int c = -1, p = -1; + int test_cnt = 10, i; + struct test_sockmap_strp *strp = NULL; + char rcv[STRP_PKT_FULL_LEN + 1] = "0"; + + strp = sockmap_strp_init(&map, true, true); + if (!ASSERT_TRUE(strp, "sockmap_strp_init")) + return; + + err = create_pair(family, sotype, &c, &p); + if (err) + goto out; + + /* inject some data before bpf process, it should be read + * correctly because we check sk_receive_queue in + * tcp_bpf_recvmsg_parser(). + */ + sent = xsend(c, packet, pkt_size, 0); + if (!ASSERT_EQ(sent, pkt_size, "xsend(pre-data)")) + goto out_close; + + /* sk_data_ready of 'p' will be replaced by strparser handler */ + err = bpf_map_update_elem(map, &zero, &p, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem(p)")) + goto out_close; + + /* consume previous data we injected */ + if (sockmap_strp_consume_pre_data(p)) + goto out_close; + + /* Previously, we encountered issues such as deadlocks and + * sequence errors that resulted in the inability to read + * continuously. Therefore, we perform multiple iterations + * of testing here. + */ + for (i = 0; i < test_cnt; i++) { + sent = xsend(c, packet, pkt_size, 0); + if (!ASSERT_EQ(sent, pkt_size, "xsend(c)")) + goto out_close; + + recvd = recv_timeout(p, rcv, sizeof(rcv), MSG_DONTWAIT, + IO_TIMEOUT_SEC); + if (!ASSERT_EQ(recvd, pkt_size, "recv_timeout(p)") || + !ASSERT_OK(memcmp(packet, rcv, pkt_size), + "memcmp, data mismatch")) + goto out_close; + } + + if (fionread) { + sent = xsend(c, packet, pkt_size, 0); + if (!ASSERT_EQ(sent, pkt_size, "second xsend(c)")) + goto out_close; + + err = ioctl(p, FIONREAD, &avail); + if (!ASSERT_OK(err, "ioctl(FIONREAD) error") || + !ASSERT_EQ(avail, pkt_size, "ioctl(FIONREAD)")) + goto out_close; + + recvd = recv_timeout(p, rcv, sizeof(rcv), MSG_DONTWAIT, + IO_TIMEOUT_SEC); + if (!ASSERT_EQ(recvd, pkt_size, "second recv_timeout(p)") || + !ASSERT_OK(memcmp(packet, rcv, pkt_size), + "second memcmp, data mismatch")) + goto out_close; + } + +out_close: + close(c); + close(p); + +out: + test_sockmap_strp__destroy(strp); +} + +/* Test strparser with verdict mode */ +static void test_sockmap_strp_verdict(int family, int sotype) +{ + int zero = 0, one = 1, sent, recvd, off; + int err, map; + int c0 = -1, p0 = -1, c1 = -1, p1 = -1; + struct test_sockmap_strp *strp = NULL; + char rcv[STRP_PKT_FULL_LEN + 1] = "0"; + + strp = sockmap_strp_init(&map, false, true); + if (!ASSERT_TRUE(strp, "sockmap_strp_init")) + return; + + /* We simulate a reverse proxy server. + * When p0 receives data from c0, we forward it to c1. + * From c1's perspective, it will consider this data + * as being sent by p1. 
+ */ + err = create_socket_pairs(family, sotype, &c0, &c1, &p0, &p1); + if (!ASSERT_OK(err, "create_socket_pairs()")) + goto out; + + err = bpf_map_update_elem(map, &zero, &p0, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem(p0)")) + goto out_close; + + err = bpf_map_update_elem(map, &one, &p1, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem(p1)")) + goto out_close; + + sent = xsend(c0, packet, STRP_PKT_FULL_LEN, 0); + if (!ASSERT_EQ(sent, STRP_PKT_FULL_LEN, "xsend(c0)")) + goto out_close; + + recvd = recv_timeout(c1, rcv, sizeof(rcv), MSG_DONTWAIT, + IO_TIMEOUT_SEC); + if (!ASSERT_EQ(recvd, STRP_PKT_FULL_LEN, "recv_timeout(c1)") || + !ASSERT_OK(memcmp(packet, rcv, STRP_PKT_FULL_LEN), + "received data does not match the sent data")) + goto out_close; + + /* send again to ensure the stream is functioning correctly. */ + sent = xsend(c0, packet, STRP_PKT_FULL_LEN, 0); + if (!ASSERT_EQ(sent, STRP_PKT_FULL_LEN, "second xsend(c0)")) + goto out_close; + + /* partial read */ + off = STRP_PKT_FULL_LEN / 2; + recvd = recv_timeout(c1, rcv, off, MSG_DONTWAIT, + IO_TIMEOUT_SEC); + recvd += recv_timeout(c1, rcv + off, sizeof(rcv) - off, MSG_DONTWAIT, + IO_TIMEOUT_SEC); + + if (!ASSERT_EQ(recvd, STRP_PKT_FULL_LEN, "partial recv_timeout(c1)") || + !ASSERT_OK(memcmp(packet, rcv, STRP_PKT_FULL_LEN), + "partial received data does not match the sent data")) + goto out_close; + +out_close: + close(c0); + close(c1); + close(p0); + close(p1); +out: + test_sockmap_strp__destroy(strp); +} + +void test_sockmap_strp(void) +{ + if (test__start_subtest("sockmap strp tcp pass")) + test_sockmap_strp_pass(AF_INET, SOCK_STREAM, false); + if (test__start_subtest("sockmap strp tcp v6 pass")) + test_sockmap_strp_pass(AF_INET6, SOCK_STREAM, false); + if (test__start_subtest("sockmap strp tcp pass fionread")) + test_sockmap_strp_pass(AF_INET, SOCK_STREAM, true); + if (test__start_subtest("sockmap strp tcp v6 pass fionread")) + test_sockmap_strp_pass(AF_INET6, SOCK_STREAM, true); + if (test__start_subtest("sockmap strp tcp verdict")) + test_sockmap_strp_verdict(AF_INET, SOCK_STREAM); + if (test__start_subtest("sockmap strp tcp v6 verdict")) + test_sockmap_strp_verdict(AF_INET6, SOCK_STREAM); + if (test__start_subtest("sockmap strp tcp partial read")) + test_sockmap_strp_partial_read(AF_INET, SOCK_STREAM); + if (test__start_subtest("sockmap strp tcp multiple packets")) + test_sockmap_strp_multiple_pkt(AF_INET, SOCK_STREAM); + if (test__start_subtest("sockmap strp tcp dispatch")) + test_sockmap_strp_dispatch_pkt(AF_INET, SOCK_STREAM); +} diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_strp.c b/tools/testing/selftests/bpf/progs/test_sockmap_strp.c new file mode 100644 index 0000000000000..dde3d5bec5154 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sockmap_strp.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +int verdict_max_size = 10000; +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 20); + __type(key, int); + __type(value, int); +} sock_map SEC(".maps"); + +SEC("sk_skb/stream_verdict") +int prog_skb_verdict(struct __sk_buff *skb) +{ + __u32 one = 1; + + if (skb->len > verdict_max_size) + return SK_PASS; + + return bpf_sk_redirect_map(skb, &sock_map, one, 0); +} + +SEC("sk_skb/stream_verdict") +int prog_skb_verdict_pass(struct __sk_buff *skb) +{ + return SK_PASS; +} + +SEC("sk_skb/stream_parser") +int prog_skb_parser(struct __sk_buff *skb) +{ + return skb->len; +} + +SEC("sk_skb/stream_parser") +int 
prog_skb_parser_partial(struct __sk_buff *skb) +{ + /* agreement with the test program on a 4-byte size header + * and 6-byte body. + */ + if (skb->len < 4) { + /* need more header to determine full length */ + return 0; + } + /* return full length decoded from header. + * the return value may be larger than skb->len which + * means framework must wait body coming. + */ + return 10; +} + +char _license[] SEC("license") = "GPL"; -- GitLab From c78f4afbd962f43a3989f45f3ca04300252b19b5 Mon Sep 17 00:00:00 2001 From: Abel Wu Date: Sat, 21 Dec 2024 14:10:16 +0800 Subject: [PATCH 072/989] bpf: Fix deadlock when freeing cgroup storage The following commit bc235cdb423a ("bpf: Prevent deadlock from recursive bpf_task_storage_[get|delete]") first introduced deadlock prevention for fentry/fexit programs attaching on bpf_task_storage helpers. That commit also employed the logic in map free path in its v6 version. Later bpf_cgrp_storage was first introduced in c4bcfb38a95e ("bpf: Implement cgroup storage available to non-cgroup-attached bpf progs") which faces the same issue as bpf_task_storage, instead of its busy counter, NULL was passed to bpf_local_storage_map_free() which opened a window to cause deadlock: (acquiring local_storage->lock) _raw_spin_lock_irqsave+0x3d/0x50 bpf_local_storage_update+0xd1/0x460 bpf_cgrp_storage_get+0x109/0x130 bpf_prog_a4d4a370ba857314_cgrp_ptr+0x139/0x170 ? __bpf_prog_enter_recur+0x16/0x80 bpf_trampoline_6442485186+0x43/0xa4 cgroup_storage_ptr+0x9/0x20 (holding local_storage->lock) bpf_selem_unlink_storage_nolock.constprop.0+0x135/0x160 bpf_selem_unlink_storage+0x6f/0x110 bpf_local_storage_map_free+0xa2/0x110 bpf_map_free_deferred+0x5b/0x90 process_one_work+0x17c/0x390 worker_thread+0x251/0x360 kthread+0xd2/0x100 ret_from_fork+0x34/0x50 ret_from_fork_asm+0x1a/0x30 Progs: - A: SEC("fentry/cgroup_storage_ptr") - cgid (BPF_MAP_TYPE_HASH) Record the id of the cgroup the current task belonging to in this hash map, using the address of the cgroup as the map key. - cgrpa (BPF_MAP_TYPE_CGRP_STORAGE) If current task is a kworker, lookup the above hash map using function parameter @owner as the key to get its corresponding cgroup id which is then used to get a trusted pointer to the cgroup through bpf_cgroup_from_id(). This trusted pointer can then be passed to bpf_cgrp_storage_get() to finally trigger the deadlock issue. - B: SEC("tp_btf/sys_enter") - cgrpb (BPF_MAP_TYPE_CGRP_STORAGE) The only purpose of this prog is to fill Prog A's hash map by calling bpf_cgrp_storage_get() for as many userspace tasks as possible. Steps to reproduce: - Run A; - while (true) { Run B; Destroy B; } Fix this issue by passing its busy counter to the free procedure so it can be properly incremented before storage/smap locking. 
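The guard itself follows the task-storage precedent cited above (commit bc235cdb423a): a per-CPU busy counter that the free path raises before taking local_storage->lock, and that the storage helpers test so they can bail out instead of recursing into the held lock. A rough sketch of that shape (only the bpf_cgrp_storage_busy name comes from the hunk below; the rest is an illustrative simplification, not the exact kernel code):

    static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy);

    /* Helper side (bpf_cgrp_storage_get/delete): refuse to enter the locked
     * region if this CPU is already inside it. The matching unlock would
     * decrement the counter and re-enable migration.
     */
    static bool cgrp_storage_trylock(void)
    {
            migrate_disable();
            if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) {
                    this_cpu_dec(bpf_cgrp_storage_busy);
                    migrate_enable();
                    return false;   /* someone on this CPU holds local_storage->lock: bail out */
            }
            return true;
    }

    /* Free side: raise the counter *before* taking local_storage->lock so a
     * fentry program firing underneath us cannot re-enter and deadlock.
     */
    static void cgrp_storage_free_one(struct bpf_local_storage_map *smap)
    {
            migrate_disable();
            this_cpu_inc(bpf_cgrp_storage_busy);
            /* ... take local_storage->lock and unlink the stored elements ... */
            this_cpu_dec(bpf_cgrp_storage_busy);
            migrate_enable();
    }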
Fixes: c4bcfb38a95e ("bpf: Implement cgroup storage available to non-cgroup-attached bpf progs") Signed-off-by: Abel Wu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20241221061018.37717-1-wuyun.abel@bytedance.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_cgrp_storage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index d5dc65bb17550..54ff2a85d4c02 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -153,7 +153,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) static void cgroup_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &cgroup_cache, NULL); + bpf_local_storage_map_free(map, &cgroup_cache, &bpf_cgrp_storage_busy); } /* *gfp_flags* is a hidden argument provided by the verifier */ -- GitLab From bdc35f164b0f60480b2f5e098bb8f3c0cea05cd2 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 31 Jan 2025 00:27:44 +0900 Subject: [PATCH 073/989] tomoyo: use better patterns for procfs in learning mode Commit 08ae2487b202 ("tomoyo: automatically use patterns for several situations in learning mode") replaced only $PID part of procfs pathname with \$ pattern. But it turned out that we need to also replace $TID part and $FD part to make this functionality useful for e.g. /bin/lsof . Signed-off-by: Tetsuo Handa --- security/tomoyo/common.c | 145 ++++++++++++++++++++++++++++++--------- 1 file changed, 112 insertions(+), 33 deletions(-) diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c index d9fa696321475..0f78898bce09b 100644 --- a/security/tomoyo/common.c +++ b/security/tomoyo/common.c @@ -1980,6 +1980,114 @@ static int tomoyo_truncate(char *str) return strlen(start) + 1; } +/** + * tomoyo_numscan - sscanf() which stores the length of a decimal integer value. + * + * @str: String to scan. + * @head: Leading string that must start with. + * @width: Pointer to "int" for storing length of a decimal integer value after @head. + * @tail: Optional character that must match after a decimal integer value. + * + * Returns whether @str starts with @head and a decimal value follows @head. + */ +static bool tomoyo_numscan(const char *str, const char *head, int *width, const char tail) +{ + const char *cp; + const int n = strlen(head); + + if (!strncmp(str, head, n)) { + cp = str + n; + while (*cp && *cp >= '0' && *cp <= '9') + cp++; + if (*cp == tail || !tail) { + *width = cp - (str + n); + return *width != 0; + } + } + *width = 0; + return 0; +} + +/** + * tomoyo_patternize_path - Make patterns for file path. Used by learning mode. + * + * @buffer: Destination buffer. + * @len: Size of @buffer. + * @entry: Original line. + * + * Returns nothing. + */ +static void tomoyo_patternize_path(char *buffer, const int len, char *entry) +{ + int width; + char *cp = entry; + + /* Nothing to do if this line is not for "file" related entry. */ + if (strncmp(entry, "file ", 5)) + goto flush; + /* + * Nothing to do if there is no colon in this line, for this rewriting + * applies to only filesystems where numeric values in the path are volatile. + */ + cp = strchr(entry + 5, ':'); + if (!cp) { + cp = entry; + goto flush; + } + /* Flush e.g. "file ioctl" part. */ + while (*cp != ' ') + cp--; + *cp++ = '\0'; + tomoyo_addprintf(buffer, len, "%s ", entry); + /* e.g. 
file ioctl pipe:[$INO] $CMD */ + if (tomoyo_numscan(cp, "pipe:[", &width, ']')) { + cp += width + 7; + tomoyo_addprintf(buffer, len, "pipe:[\\$]"); + goto flush; + } + /* e.g. file ioctl socket:[$INO] $CMD */ + if (tomoyo_numscan(cp, "socket:[", &width, ']')) { + cp += width + 9; + tomoyo_addprintf(buffer, len, "socket:[\\$]"); + goto flush; + } + if (!strncmp(cp, "proc:/self", 10)) { + /* e.g. file read proc:/self/task/$TID/fdinfo/$FD */ + cp += 10; + tomoyo_addprintf(buffer, len, "proc:/self"); + } else if (tomoyo_numscan(cp, "proc:/", &width, 0)) { + /* e.g. file read proc:/$PID/task/$TID/fdinfo/$FD */ + /* + * Don't patternize $PID part if $PID == 1, for several + * programs access only files in /proc/1/ directory. + */ + cp += width + 6; + if (width == 1 && *(cp - 1) == '1') + tomoyo_addprintf(buffer, len, "proc:/1"); + else + tomoyo_addprintf(buffer, len, "proc:/\\$"); + } else { + goto flush; + } + /* Patternize $TID part if "/task/" follows. */ + if (tomoyo_numscan(cp, "/task/", &width, 0)) { + cp += width + 6; + tomoyo_addprintf(buffer, len, "/task/\\$"); + } + /* Patternize $FD part if "/fd/" or "/fdinfo/" follows. */ + if (tomoyo_numscan(cp, "/fd/", &width, 0)) { + cp += width + 4; + tomoyo_addprintf(buffer, len, "/fd/\\$"); + } else if (tomoyo_numscan(cp, "/fdinfo/", &width, 0)) { + cp += width + 8; + tomoyo_addprintf(buffer, len, "/fdinfo/\\$"); + } +flush: + /* Flush remaining part if any. */ + if (*cp) + tomoyo_addprintf(buffer, len, "%s", cp); +} + /** * tomoyo_add_entry - Add an ACL to current thread's domain. Used by learning mode. * @@ -2003,7 +2111,8 @@ static void tomoyo_add_entry(struct tomoyo_domain_info *domain, char *header) if (!cp) return; *cp++ = '\0'; - len = strlen(cp) + 1; + /* Reserve some space for potentially using patterns. */ + len = strlen(cp) + 16; /* strstr() will return NULL if ordering is wrong. */ if (*cp == 'f') { argv0 = strstr(header, " argv[]={ \""); @@ -2020,40 +2129,10 @@ static void tomoyo_add_entry(struct tomoyo_domain_info *domain, char *header) if (symlink) len += tomoyo_truncate(symlink + 1) + 1; } - buffer = kmalloc(len, GFP_NOFS); + buffer = kmalloc(len, GFP_NOFS | __GFP_ZERO); if (!buffer) return; - snprintf(buffer, len - 1, "%s", cp); - if (*cp == 'f' && strchr(buffer, ':')) { - /* Automatically replace 2 or more digits with \$ pattern. */ - char *cp2; - - /* e.g. file read proc:/$PID/stat */ - cp = strstr(buffer, " proc:/"); - if (cp && simple_strtoul(cp + 7, &cp2, 10) >= 10 && *cp2 == '/') { - *(cp + 7) = '\\'; - *(cp + 8) = '$'; - memmove(cp + 9, cp2, strlen(cp2) + 1); - goto ok; - } - /* e.g. file ioctl pipe:[$INO] $CMD */ - cp = strstr(buffer, " pipe:["); - if (cp && simple_strtoul(cp + 7, &cp2, 10) >= 10 && *cp2 == ']') { - *(cp + 7) = '\\'; - *(cp + 8) = '$'; - memmove(cp + 9, cp2, strlen(cp2) + 1); - goto ok; - } - /* e.g. 
file ioctl socket:[$INO] $CMD */ - cp = strstr(buffer, " socket:["); - if (cp && simple_strtoul(cp + 9, &cp2, 10) >= 10 && *cp2 == ']') { - *(cp + 9) = '\\'; - *(cp + 10) = '$'; - memmove(cp + 11, cp2, strlen(cp2) + 1); - goto ok; - } - } -ok: + tomoyo_patternize_path(buffer, len, cp); if (realpath) tomoyo_addprintf(buffer, len, " exec.%s", realpath); if (argv0) -- GitLab From ee2ab467bddfb2d7f68d996dbab94d7b88f8eaf7 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 21 Jan 2025 18:11:33 -0700 Subject: [PATCH 074/989] x86/boot: Use '-std=gnu11' to fix build with GCC 15 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC 15 changed the default C standard version to C23, which should not have impacted the kernel because it requests the gnu11 standard via '-std=' in the main Makefile. However, the x86 compressed boot Makefile uses its own set of KBUILD_CFLAGS without a '-std=' value (i.e., using the default), resulting in errors from the kernel's definitions of bool, true, and false in stddef.h, which are reserved keywords under C23. ./include/linux/stddef.h:11:9: error: expected identifier before ‘false’ 11 | false = 0, ./include/linux/types.h:35:33: error: two or more data types in declaration specifiers 35 | typedef _Bool bool; Set '-std=gnu11' in the x86 compressed boot Makefile to resolve the error and consistently use the same C standard version for the entire kernel. Closes: https://lore.kernel.org/4OAhbllK7x4QJGpZjkYjtBYNLd_2whHx9oFiuZcGwtVR4hIzvduultkgfAIRZI3vQpZylu7Gl929HaYFRGeMEalWCpeMzCIIhLxxRhq4U-Y=@protonmail.com/ Closes: https://lore.kernel.org/Z4467umXR2PZ0M1H@tucnak/ Reported-by: Kostadin Shishmanov Reported-by: Jakub Jelinek Signed-off-by: Nathan Chancellor Signed-off-by: Dave Hansen Reviewed-by: Ard Biesheuvel Cc:stable@vger.kernel.org Link: https://lore.kernel.org/all/20250121-x86-use-std-consistently-gcc-15-v1-1-8ab0acf645cb%40kernel.org --- arch/x86/boot/compressed/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index f2051644de943..606c74f274593 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -25,6 +25,7 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ # avoid errors with '-march=i386', and future flags may depend on the target to # be valid. KBUILD_CFLAGS := -m$(BITS) -O2 $(CLANG_FLAGS) +KBUILD_CFLAGS += -std=gnu11 KBUILD_CFLAGS += -fno-strict-aliasing -fPIE KBUILD_CFLAGS += -Wundef KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING -- GitLab From bb2784d9ab49587ba4fbff37a319fff2924db289 Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Thu, 30 Jan 2025 19:26:58 +0000 Subject: [PATCH 075/989] jiffies: Cast to unsigned long in secs_to_jiffies() conversion While converting users of msecs_to_jiffies(), lkp reported that some range checks would always be true because of the mismatch between the implied int value of secs_to_jiffies() vs the unsigned long return value of the msecs_to_jiffies() calls it was replacing. Fix this by casting the secs_to_jiffies() input value to unsigned long. 
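To make the type mismatch concrete, consider a made-up call site (an
illustration in the spirit of the robot report, not one of the affected
drivers):

        if (secs_to_jiffies(user_secs) < MAX_SCHEDULE_TIMEOUT)
                schedule_timeout_interruptible(secs_to_jiffies(user_secs));

Without the cast, secs_to_jiffies() has the type of "(_secs) * HZ", i.e.
plain int for ordinary integer arguments, and on 64-bit builds an int can
never reach MAX_SCHEDULE_TIMEOUT (LONG_MAX), so the compiler flags the
comparison as always true. msecs_to_jiffies() returns unsigned long, which
is why the original checks were meaningful; casting the whole expression
back to unsigned long (the hunk below) restores that.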
Fixes: b35108a51cf7ba ("jiffies: Define secs_to_jiffies()") Reported-by: kernel test robot Signed-off-by: Easwar Hariharan Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20250130192701.99626-1-eahariha@linux.microsoft.com Closes: https://lore.kernel.org/oe-kbuild-all/202501301334.NB6NszQR-lkp@intel.com/ --- include/linux/jiffies.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index ed945f42e064a..0ea8c9887429f 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -537,7 +537,7 @@ static __always_inline unsigned long msecs_to_jiffies(const unsigned int m) * * Return: jiffies value */ -#define secs_to_jiffies(_secs) ((_secs) * HZ) +#define secs_to_jiffies(_secs) (unsigned long)((_secs) * HZ) extern unsigned long __usecs_to_jiffies(const unsigned int u); #if !(USEC_PER_SEC % HZ) -- GitLab From 9065ce69754dece78606c8bbb3821449272e56bf Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Wed, 29 Jan 2025 17:59:44 +0000 Subject: [PATCH 076/989] sched/debug: Provide slice length for fair tasks Since commit: 857b158dc5e8 ("sched/eevdf: Use sched_attr::sched_runtime to set request/slice suggestion") ... we have the userspace per-task tunable slice length, which is a key parameter that is otherwise difficult to obtain, so provide it in /proc/$PID/sched. [ mingo: Clarified the changelog. ] Signed-off-by: Christian Loehle Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Link: https://lore.kernel.org/r/453349b1-1637-42f5-a7b2-2385392b5956@arm.com --- kernel/sched/debug.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index a1be00a988bf6..5b32d3cc393bf 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1265,6 +1265,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, if (task_has_dl_policy(p)) { P(dl.runtime); P(dl.deadline); + } else if (fair_policy(p->policy)) { + P(se.slice); } #ifdef CONFIG_SCHED_CLASS_EXT __PS("ext.enabled", task_on_scx(p)); -- GitLab From 5f230f41fdd9e799f43a699348dc572bca7159aa Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Jan 2025 16:43:41 +0100 Subject: [PATCH 077/989] KVM: s390: vsie: fix some corner-cases when grabbing vsie pages We try to reuse the same vsie page when re-executing the vsie with a given SCB address. The result is that we use the same shadow SCB -- residing in the vsie page -- and can avoid flushing the TLB when re-running the vsie on a CPU. So, when we allocate a fresh vsie page, or when we reuse a vsie page for a different SCB address -- reusing the shadow SCB in different context -- we set ihcpu=0xffff to trigger the flush. However, after we looked up the SCB address in the radix tree, but before we grabbed the vsie page by raising the refcount to 2, someone could reuse the vsie page for a different SCB address, adjusting page->index and the radix tree. In that case, we would be reusing the vsie page with a wrong page->index. Another corner case is that we might set the SCB address for a vsie page, but fail the insertion into the radix tree. Whoever would reuse that page would remove the corresponding radix tree entry -- which might now be a valid entry pointing at another page, resulting in the wrong vsie page getting removed from the radix tree. 
Let's handle such races better, by validating that the SCB address of a vsie page didn't change after we grabbed it (not reuse for a different SCB; the alternative would be performing another tree lookup), and by setting the SCB address to invalid until the insertion in the tree succeeded (SCB addresses are aligned to 512, so ULONG_MAX is invalid). These scenarios are rare, the effects a bit unclear, and these issues were only found by code inspection. Let's CC stable to be safe. Fixes: a3508fbe9dc6 ("KVM: s390: vsie: initial support for nested virtualization") Cc: stable@vger.kernel.org Signed-off-by: David Hildenbrand Reviewed-by: Claudio Imbrenda Reviewed-by: Christoph Schlameuss Tested-by: Christoph Schlameuss Message-ID: <20250107154344.1003072-2-david@redhat.com> Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/vsie.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index a687695d8f68e..513e608567ccc 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -1362,8 +1362,14 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9); rcu_read_unlock(); if (page) { - if (page_ref_inc_return(page) == 2) - return page_to_virt(page); + if (page_ref_inc_return(page) == 2) { + if (page->index == addr) + return page_to_virt(page); + /* + * We raced with someone reusing + putting this vsie + * page before we grabbed it. + */ + } page_ref_dec(page); } @@ -1393,15 +1399,20 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) kvm->arch.vsie.next++; kvm->arch.vsie.next %= nr_vcpus; } - radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9); + if (page->index != ULONG_MAX) + radix_tree_delete(&kvm->arch.vsie.addr_to_page, + page->index >> 9); } - page->index = addr; - /* double use of the same address */ + /* Mark it as invalid until it resides in the tree. */ + page->index = ULONG_MAX; + + /* Double use of the same address or allocation failure. */ if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) { page_ref_dec(page); mutex_unlock(&kvm->arch.vsie.mutex); return NULL; } + page->index = addr; mutex_unlock(&kvm->arch.vsie.mutex); vsie_page = page_to_virt(page); @@ -1496,7 +1507,9 @@ void kvm_s390_vsie_destroy(struct kvm *kvm) vsie_page = page_to_virt(page); release_gmap_shadow(vsie_page); /* free the radix tree entry */ - radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9); + if (page->index != ULONG_MAX) + radix_tree_delete(&kvm->arch.vsie.addr_to_page, + page->index >> 9); __free_page(page); } kvm->arch.vsie.page_count = 0; -- GitLab From c5f64c98a1f7e4ca0e55b441620473389b8c7a72 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Jan 2025 16:43:42 +0100 Subject: [PATCH 078/989] KVM: s390: vsie: stop using page->index Let's stop using page->index, and instead use a field inside "struct vsie_page" to hold that value. We have plenty of space left in there. This is one part of stopping using "struct page" when working with vsie pages. We place the "page_to_virt(page)" strategically, so the next cleanups requires less churn. 
Signed-off-by: David Hildenbrand Reviewed-by: Claudio Imbrenda Reviewed-by: Christoph Schlameuss Tested-by: Christoph Schlameuss Message-ID: <20250107154344.1003072-3-david@redhat.com> Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/vsie.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 513e608567ccc..3874a1b49dd54 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -46,7 +46,13 @@ struct vsie_page { gpa_t gvrd_gpa; /* 0x0240 */ gpa_t riccbd_gpa; /* 0x0248 */ gpa_t sdnx_gpa; /* 0x0250 */ - __u8 reserved[0x0700 - 0x0258]; /* 0x0258 */ + /* + * guest address of the original SCB. Remains set for free vsie + * pages, so we can properly look them up in our addr_to_page + * radix tree. + */ + gpa_t scb_gpa; /* 0x0258 */ + __u8 reserved[0x0700 - 0x0260]; /* 0x0260 */ struct kvm_s390_crypto_cb crycb; /* 0x0700 */ __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ }; @@ -1362,9 +1368,10 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9); rcu_read_unlock(); if (page) { + vsie_page = page_to_virt(page); if (page_ref_inc_return(page) == 2) { - if (page->index == addr) - return page_to_virt(page); + if (vsie_page->scb_gpa == addr) + return vsie_page; /* * We raced with someone reusing + putting this vsie * page before we grabbed it. @@ -1386,6 +1393,7 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) mutex_unlock(&kvm->arch.vsie.mutex); return ERR_PTR(-ENOMEM); } + vsie_page = page_to_virt(page); page_ref_inc(page); kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page; kvm->arch.vsie.page_count++; @@ -1393,18 +1401,19 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) /* reuse an existing entry that belongs to nobody */ while (true) { page = kvm->arch.vsie.pages[kvm->arch.vsie.next]; + vsie_page = page_to_virt(page); if (page_ref_inc_return(page) == 2) break; page_ref_dec(page); kvm->arch.vsie.next++; kvm->arch.vsie.next %= nr_vcpus; } - if (page->index != ULONG_MAX) + if (vsie_page->scb_gpa != ULONG_MAX) radix_tree_delete(&kvm->arch.vsie.addr_to_page, - page->index >> 9); + vsie_page->scb_gpa >> 9); } /* Mark it as invalid until it resides in the tree. */ - page->index = ULONG_MAX; + vsie_page->scb_gpa = ULONG_MAX; /* Double use of the same address or allocation failure. 
*/ if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) { @@ -1412,10 +1421,9 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) mutex_unlock(&kvm->arch.vsie.mutex); return NULL; } - page->index = addr; + vsie_page->scb_gpa = addr; mutex_unlock(&kvm->arch.vsie.mutex); - vsie_page = page_to_virt(page); memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block)); release_gmap_shadow(vsie_page); vsie_page->fault_addr = 0; @@ -1507,9 +1515,9 @@ void kvm_s390_vsie_destroy(struct kvm *kvm) vsie_page = page_to_virt(page); release_gmap_shadow(vsie_page); /* free the radix tree entry */ - if (page->index != ULONG_MAX) + if (vsie_page->scb_gpa != ULONG_MAX) radix_tree_delete(&kvm->arch.vsie.addr_to_page, - page->index >> 9); + vsie_page->scb_gpa >> 9); __free_page(page); } kvm->arch.vsie.page_count = 0; -- GitLab From 905f5ce0835c938c501237d9371cdbc91d8f7e02 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Jan 2025 16:43:43 +0100 Subject: [PATCH 079/989] KVM: s390: vsie: stop messing with page refcount Let's stop messing with the page refcount, and use a flag that is set / cleared atomically to remember whether a vsie page is currently in use. Note that we could use a page flag, or a lower bit of the scb_gpa. Let's keep it simple for now, we have sufficient space. While at it, stop passing "struct kvm *" to put_vsie_page(), it's unused. Signed-off-by: David Hildenbrand Reviewed-by: Claudio Imbrenda Reviewed-by: Christoph Schlameuss Tested-by: Christoph Schlameuss Message-ID: <20250107154344.1003072-4-david@redhat.com> Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/vsie.c | 46 +++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 3874a1b49dd54..424f80f5f6b2d 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -23,6 +23,10 @@ #include "kvm-s390.h" #include "gaccess.h" +enum vsie_page_flags { + VSIE_PAGE_IN_USE = 0, +}; + struct vsie_page { struct kvm_s390_sie_block scb_s; /* 0x0000 */ /* @@ -52,7 +56,12 @@ struct vsie_page { * radix tree. */ gpa_t scb_gpa; /* 0x0258 */ - __u8 reserved[0x0700 - 0x0260]; /* 0x0260 */ + /* + * Flags: must be set/cleared atomically after the vsie page can be + * looked up by other CPUs. + */ + unsigned long flags; /* 0x0260 */ + __u8 reserved[0x0700 - 0x0268]; /* 0x0268 */ struct kvm_s390_crypto_cb crycb; /* 0x0700 */ __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ }; @@ -1351,6 +1360,20 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) return rc; } +/* Try getting a given vsie page, returning "true" on success. */ +static inline bool try_get_vsie_page(struct vsie_page *vsie_page) +{ + if (test_bit(VSIE_PAGE_IN_USE, &vsie_page->flags)) + return false; + return !test_and_set_bit(VSIE_PAGE_IN_USE, &vsie_page->flags); +} + +/* Put a vsie page acquired through get_vsie_page / try_get_vsie_page. */ +static void put_vsie_page(struct vsie_page *vsie_page) +{ + clear_bit(VSIE_PAGE_IN_USE, &vsie_page->flags); +} + /* * Get or create a vsie page for a scb address. * @@ -1369,15 +1392,15 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) rcu_read_unlock(); if (page) { vsie_page = page_to_virt(page); - if (page_ref_inc_return(page) == 2) { + if (try_get_vsie_page(vsie_page)) { if (vsie_page->scb_gpa == addr) return vsie_page; /* * We raced with someone reusing + putting this vsie * page before we grabbed it. 
*/ + put_vsie_page(vsie_page); } - page_ref_dec(page); } /* @@ -1394,7 +1417,7 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) return ERR_PTR(-ENOMEM); } vsie_page = page_to_virt(page); - page_ref_inc(page); + __set_bit(VSIE_PAGE_IN_USE, &vsie_page->flags); kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page; kvm->arch.vsie.page_count++; } else { @@ -1402,9 +1425,8 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) while (true) { page = kvm->arch.vsie.pages[kvm->arch.vsie.next]; vsie_page = page_to_virt(page); - if (page_ref_inc_return(page) == 2) + if (try_get_vsie_page(vsie_page)) break; - page_ref_dec(page); kvm->arch.vsie.next++; kvm->arch.vsie.next %= nr_vcpus; } @@ -1417,7 +1439,7 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) /* Double use of the same address or allocation failure. */ if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) { - page_ref_dec(page); + put_vsie_page(vsie_page); mutex_unlock(&kvm->arch.vsie.mutex); return NULL; } @@ -1431,14 +1453,6 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) return vsie_page; } -/* put a vsie page acquired via get_vsie_page */ -static void put_vsie_page(struct kvm *kvm, struct vsie_page *vsie_page) -{ - struct page *page = pfn_to_page(__pa(vsie_page) >> PAGE_SHIFT); - - page_ref_dec(page); -} - int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) { struct vsie_page *vsie_page; @@ -1489,7 +1503,7 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) out_unpin_scb: unpin_scb(vcpu, vsie_page, scb_addr); out_put: - put_vsie_page(vcpu->kvm, vsie_page); + put_vsie_page(vsie_page); return rc < 0 ? rc : 0; } -- GitLab From 4514eda4c1dbd0b7062e06c769d2ceafd25c9284 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Jan 2025 16:43:44 +0100 Subject: [PATCH 080/989] KVM: s390: vsie: stop using "struct page" for vsie page Now that we no longer use page->index and the page refcount explicitly, let's avoid messing with "struct page" completely. Signed-off-by: David Hildenbrand Reviewed-by: Claudio Imbrenda Reviewed-by: Christoph Schlameuss Tested-by: Christoph Schlameuss Message-ID: <20250107154344.1003072-5-david@redhat.com> Signed-off-by: Claudio Imbrenda --- arch/s390/include/asm/kvm_host.h | 4 +++- arch/s390/kvm/vsie.c | 31 ++++++++++++------------------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 97c7c81275434..4581388411b71 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -931,12 +931,14 @@ struct sie_page2 { u8 reserved928[0x1000 - 0x928]; /* 0x0928 */ }; +struct vsie_page; + struct kvm_s390_vsie { struct mutex mutex; struct radix_tree_root addr_to_page; int page_count; int next; - struct page *pages[KVM_MAX_VCPUS]; + struct vsie_page *pages[KVM_MAX_VCPUS]; }; struct kvm_s390_gisa_iam { diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 424f80f5f6b2d..a0398ff85d00b 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -599,7 +599,6 @@ void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, struct kvm *kvm = gmap->private; struct vsie_page *cur; unsigned long prefix; - struct page *page; int i; if (!gmap_is_shadow(gmap)) @@ -609,10 +608,9 @@ void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, * therefore we can safely reference them all the time. 
*/ for (i = 0; i < kvm->arch.vsie.page_count; i++) { - page = READ_ONCE(kvm->arch.vsie.pages[i]); - if (!page) + cur = READ_ONCE(kvm->arch.vsie.pages[i]); + if (!cur) continue; - cur = page_to_virt(page); if (READ_ONCE(cur->gmap) != gmap) continue; prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT; @@ -1384,14 +1382,12 @@ static void put_vsie_page(struct vsie_page *vsie_page) static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) { struct vsie_page *vsie_page; - struct page *page; int nr_vcpus; rcu_read_lock(); - page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9); + vsie_page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9); rcu_read_unlock(); - if (page) { - vsie_page = page_to_virt(page); + if (vsie_page) { if (try_get_vsie_page(vsie_page)) { if (vsie_page->scb_gpa == addr) return vsie_page; @@ -1411,20 +1407,18 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) mutex_lock(&kvm->arch.vsie.mutex); if (kvm->arch.vsie.page_count < nr_vcpus) { - page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | GFP_DMA); - if (!page) { + vsie_page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | GFP_DMA); + if (!vsie_page) { mutex_unlock(&kvm->arch.vsie.mutex); return ERR_PTR(-ENOMEM); } - vsie_page = page_to_virt(page); __set_bit(VSIE_PAGE_IN_USE, &vsie_page->flags); - kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page; + kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = vsie_page; kvm->arch.vsie.page_count++; } else { /* reuse an existing entry that belongs to nobody */ while (true) { - page = kvm->arch.vsie.pages[kvm->arch.vsie.next]; - vsie_page = page_to_virt(page); + vsie_page = kvm->arch.vsie.pages[kvm->arch.vsie.next]; if (try_get_vsie_page(vsie_page)) break; kvm->arch.vsie.next++; @@ -1438,7 +1432,8 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) vsie_page->scb_gpa = ULONG_MAX; /* Double use of the same address or allocation failure. */ - if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) { + if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, + vsie_page)) { put_vsie_page(vsie_page); mutex_unlock(&kvm->arch.vsie.mutex); return NULL; @@ -1519,20 +1514,18 @@ void kvm_s390_vsie_init(struct kvm *kvm) void kvm_s390_vsie_destroy(struct kvm *kvm) { struct vsie_page *vsie_page; - struct page *page; int i; mutex_lock(&kvm->arch.vsie.mutex); for (i = 0; i < kvm->arch.vsie.page_count; i++) { - page = kvm->arch.vsie.pages[i]; + vsie_page = kvm->arch.vsie.pages[i]; kvm->arch.vsie.pages[i] = NULL; - vsie_page = page_to_virt(page); release_gmap_shadow(vsie_page); /* free the radix tree entry */ if (vsie_page->scb_gpa != ULONG_MAX) radix_tree_delete(&kvm->arch.vsie.addr_to_page, vsie_page->scb_gpa >> 9); - __free_page(page); + free_page((unsigned long)vsie_page); } kvm->arch.vsie.page_count = 0; mutex_unlock(&kvm->arch.vsie.mutex); -- GitLab From 66119f8ce135de664cb2fb88d9aaa322d7451a1f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 23 Jan 2025 15:46:13 +0100 Subject: [PATCH 081/989] KVM: Do not restrict the size of KVM-internal memory regions Exempt KVM-internal memslots from the KVM_MEM_MAX_NR_PAGES restriction, as the limit on the number of pages exists purely to play nice with dirty bitmap operations, which use 32-bit values to index the bitmaps, and dirty logging isn't supported for KVM-internal memslots. 
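For scale, the limit being relaxed here is (definition as of this writing
in include/linux/kvm_host.h -- double-check against your tree):

        #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)

i.e. a userspace slot is capped at just under 2^31 4 KiB pages (a little
under 8 TiB), which keeps dirty-bitmap offsets within the range of an
"unsigned int". The s390 ucontrol fake memslot introduced later in this
series spans the whole TASK_SIZE range and would not fit under that cap,
hence the new id < KVM_USER_MEM_SLOTS guard in the hunk below.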
Link: https://lore.kernel.org/all/20240802205003.353672-6-seanjc@google.com Signed-off-by: Sean Christopherson Reviewed-by: Christoph Schlameuss Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20250123144627.312456-2-imbrenda@linux.ibm.com Signed-off-by: Claudio Imbrenda Message-ID: <20250123144627.312456-2-imbrenda@linux.ibm.com> --- virt/kvm/kvm_main.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index faf10671eed2a..3f04cd5e3a8cf 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1971,7 +1971,15 @@ static int kvm_set_memory_region(struct kvm *kvm, return -EINVAL; if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) return -EINVAL; - if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES) + + /* + * The size of userspace-defined memory regions is restricted in order + * to play nice with dirty bitmap operations, which are indexed with an + * "unsigned int". KVM's internal memory regions don't support dirty + * logging, and so are exempt. + */ + if (id < KVM_USER_MEM_SLOTS && + (mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES) return -EINVAL; slots = __kvm_memslots(kvm, as_id); -- GitLab From decff09adbeba4b75a1982b1dc3991761914e2df Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Thu, 23 Jan 2025 15:46:14 +0100 Subject: [PATCH 082/989] KVM: s390: wrapper for KVM_BUG Wrap the call to KVM_BUG; this reduces code duplication and improves readability. Reviewed-by: Christian Borntraeger Reviewed-by: Christoph Schlameuss Reviewed-by: Steffen Eiden Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20250123144627.312456-3-imbrenda@linux.ibm.com Signed-off-by: Claudio Imbrenda Message-ID: <20250123144627.312456-3-imbrenda@linux.ibm.com> --- arch/s390/kvm/kvm-s390.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d8080c27d45bd..ecbdd7d41230a 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -4766,6 +4766,13 @@ static int vcpu_post_run_addressing_exception(struct kvm_vcpu *vcpu) return kvm_s390_inject_prog_irq(vcpu, &pgm_info); } +static void kvm_s390_assert_primary_as(struct kvm_vcpu *vcpu) +{ + KVM_BUG(current->thread.gmap_teid.as != PSW_BITS_AS_PRIMARY, vcpu->kvm, + "Unexpected program interrupt 0x%x, TEID 0x%016lx", + current->thread.gmap_int_code, current->thread.gmap_teid.val); +} + static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) { unsigned int flags = 0; @@ -4781,9 +4788,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) vcpu->stat.exit_null++; break; case PGM_NON_SECURE_STORAGE_ACCESS: - KVM_BUG(current->thread.gmap_teid.as != PSW_BITS_AS_PRIMARY, vcpu->kvm, - "Unexpected program interrupt 0x%x, TEID 0x%016lx", - current->thread.gmap_int_code, current->thread.gmap_teid.val); + kvm_s390_assert_primary_as(vcpu); /* * This is normal operation; a page belonging to a protected * guest has not been imported yet. 
Try to import the page into @@ -4794,9 +4799,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) break; case PGM_SECURE_STORAGE_ACCESS: case PGM_SECURE_STORAGE_VIOLATION: - KVM_BUG(current->thread.gmap_teid.as != PSW_BITS_AS_PRIMARY, vcpu->kvm, - "Unexpected program interrupt 0x%x, TEID 0x%016lx", - current->thread.gmap_int_code, current->thread.gmap_teid.val); + kvm_s390_assert_primary_as(vcpu); /* * This can happen after a reboot with asynchronous teardown; * the new guest (normal or protected) will run on top of the @@ -4825,9 +4828,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) case PGM_REGION_FIRST_TRANS: case PGM_REGION_SECOND_TRANS: case PGM_REGION_THIRD_TRANS: - KVM_BUG(current->thread.gmap_teid.as != PSW_BITS_AS_PRIMARY, vcpu->kvm, - "Unexpected program interrupt 0x%x, TEID 0x%016lx", - current->thread.gmap_int_code, current->thread.gmap_teid.val); + kvm_s390_assert_primary_as(vcpu); if (vcpu->arch.gmap->pfault_enabled) { rc = gmap_fault(vcpu->arch.gmap, gaddr, flags | FAULT_FLAG_RETRY_NOWAIT); if (rc == -EFAULT) -- GitLab From 413c98f24c63b3b8aff202fce6f01e8950730511 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Thu, 23 Jan 2025 15:46:15 +0100 Subject: [PATCH 083/989] KVM: s390: fake memslot for ucontrol VMs Create a fake memslot for ucontrol VMs. The fake memslot identity-maps userspace. Now memslots will always be present, and ucontrol is not a special case anymore. Suggested-by: Sean Christopherson Reviewed-by: David Hildenbrand Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20250123144627.312456-4-imbrenda@linux.ibm.com Signed-off-by: Claudio Imbrenda Message-ID: <20250123144627.312456-4-imbrenda@linux.ibm.com> --- Documentation/virt/kvm/api.rst | 2 +- arch/s390/include/asm/kvm_host.h | 2 ++ arch/s390/kvm/kvm-s390.c | 17 ++++++++++++++++- arch/s390/kvm/kvm-s390.h | 2 ++ 4 files changed, 21 insertions(+), 2 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 0d1c3a820ce6e..2b52eb77e29cb 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -1419,7 +1419,7 @@ fetch) is injected in the guest. S390: ^^^^^ -Returns -EINVAL if the VM has the KVM_VM_S390_UCONTROL flag set. +Returns -EINVAL or -EEXIST if the VM has the KVM_VM_S390_UCONTROL flag set. Returns -EINVAL if called on a protected VM. 4.36 KVM_SET_TSS_ADDR diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 4581388411b71..9a367866cab0e 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -30,6 +30,8 @@ #define KVM_S390_ESCA_CPU_SLOTS 248 #define KVM_MAX_VCPUS 255 +#define KVM_INTERNAL_MEM_SLOTS 1 + /* * These seem to be used for allocating ->chip in the routing table, which we * don't use. 1 is as small as we can get to reduce the needed memory. 
If we diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ecbdd7d41230a..fc44002a7b04c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3428,8 +3428,20 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) VM_EVENT(kvm, 3, "vm created with type %lu", type); if (type & KVM_VM_S390_UCONTROL) { + struct kvm_userspace_memory_region2 fake_memslot = { + .slot = KVM_S390_UCONTROL_MEMSLOT, + .guest_phys_addr = 0, + .userspace_addr = 0, + .memory_size = ALIGN_DOWN(TASK_SIZE, _SEGMENT_SIZE), + .flags = 0, + }; + kvm->arch.gmap = NULL; kvm->arch.mem_limit = KVM_S390_NO_MEM_LIMIT; + /* one flat fake memslot covering the whole address-space */ + mutex_lock(&kvm->slots_lock); + KVM_BUG_ON(kvm_set_internal_memslot(kvm, &fake_memslot), kvm); + mutex_unlock(&kvm->slots_lock); } else { if (sclp.hamax == U64_MAX) kvm->arch.mem_limit = TASK_SIZE_MAX; @@ -5854,7 +5866,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, { gpa_t size; - if (kvm_is_ucontrol(kvm)) + if (kvm_is_ucontrol(kvm) && new->id < KVM_USER_MEM_SLOTS) return -EINVAL; /* When we are protected, we should not change the memory slots */ @@ -5906,6 +5918,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, { int rc = 0; + if (kvm_is_ucontrol(kvm)) + return; + switch (change) { case KVM_MR_DELETE: rc = gmap_unmap_segment(kvm->arch.gmap, old->base_gfn * PAGE_SIZE, diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 597d7a71deebe..30736ac16f848 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -20,6 +20,8 @@ #include #include +#define KVM_S390_UCONTROL_MEMSLOT (KVM_USER_MEM_SLOTS + 0) + static inline void kvm_s390_fpu_store(struct kvm_run *run) { fpu_stfpc(&run->s.regs.fpc); -- GitLab From 63e71519891024b622d00c486c4d0348c44ca911 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Thu, 23 Jan 2025 15:46:16 +0100 Subject: [PATCH 084/989] KVM: s390: selftests: fix ucontrol memory region test With the latest patch, attempting to create a memslot from userspace will result in an EEXIST error for UCONTROL VMs, instead of EINVAL, since the new memslot will collide with the internal memslot. There is no simple way to bring back the previous behaviour. This is not a problem, but the test needs to be fixed accordingly. 
Reviewed-by: Christoph Schlameuss Link: https://lore.kernel.org/r/20250123144627.312456-5-imbrenda@linux.ibm.com Signed-off-by: Claudio Imbrenda Message-ID: <20250123144627.312456-5-imbrenda@linux.ibm.com> --- tools/testing/selftests/kvm/s390/ucontrol_test.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/s390/ucontrol_test.c b/tools/testing/selftests/kvm/s390/ucontrol_test.c index 135ee22856cf1..22ce9219620ce 100644 --- a/tools/testing/selftests/kvm/s390/ucontrol_test.c +++ b/tools/testing/selftests/kvm/s390/ucontrol_test.c @@ -459,10 +459,14 @@ TEST_F(uc_kvm, uc_no_user_region) }; ASSERT_EQ(-1, ioctl(self->vm_fd, KVM_SET_USER_MEMORY_REGION, ®ion)); - ASSERT_EQ(EINVAL, errno); + ASSERT_TRUE(errno == EEXIST || errno == EINVAL) + TH_LOG("errno %s (%i) not expected for ioctl KVM_SET_USER_MEMORY_REGION", + strerror(errno), errno); ASSERT_EQ(-1, ioctl(self->vm_fd, KVM_SET_USER_MEMORY_REGION2, ®ion2)); - ASSERT_EQ(EINVAL, errno); + ASSERT_TRUE(errno == EEXIST || errno == EINVAL) + TH_LOG("errno %s (%i) not expected for ioctl KVM_SET_USER_MEMORY_REGION2", + strerror(errno), errno); } TEST_F(uc_kvm, uc_map_unmap) -- GitLab From 5cbe24350b7d8ef6d466a37d56b07ae643c622ca Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Thu, 23 Jan 2025 15:46:17 +0100 Subject: [PATCH 085/989] KVM: s390: move pv gmap functions into kvm Move gmap related functions from kernel/uv into kvm. Create a new file to collect gmap-related functions. Reviewed-by: Janosch Frank Reviewed-by: Christoph Schlameuss [fixed unpack_one(), thanks mhartmay@linux.ibm.com] Link: https://lore.kernel.org/r/20250123144627.312456-6-imbrenda@linux.ibm.com Signed-off-by: Claudio Imbrenda Message-ID: <20250123144627.312456-6-imbrenda@linux.ibm.com> --- arch/s390/include/asm/gmap.h | 1 + arch/s390/include/asm/uv.h | 6 +- arch/s390/kernel/uv.c | 292 ++++------------------------------- arch/s390/kvm/Makefile | 2 +- arch/s390/kvm/gmap.c | 212 +++++++++++++++++++++++++ arch/s390/kvm/gmap.h | 17 ++ arch/s390/kvm/intercept.c | 3 +- arch/s390/kvm/kvm-s390.c | 1 + arch/s390/kvm/pv.c | 21 +++ arch/s390/mm/gmap.c | 28 ++++ 10 files changed, 315 insertions(+), 268 deletions(-) create mode 100644 arch/s390/kvm/gmap.c create mode 100644 arch/s390/kvm/gmap.h diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 13f51a6a5bb1b..3e66f53fe3ccc 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -149,6 +149,7 @@ int s390_replace_asce(struct gmap *gmap); void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns); int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, unsigned long end, bool interruptible); +int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio, bool split); /** * s390_uv_destroy_range - Destroy a range of pages in the given mm. 
diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index dc332609f2c3f..b11f5b6d0bd14 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -628,12 +628,12 @@ static inline int is_prot_virt_host(void) } int uv_pin_shared(unsigned long paddr); -int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb); -int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr); int uv_destroy_folio(struct folio *folio); int uv_destroy_pte(pte_t pte); int uv_convert_from_secure_pte(pte_t pte); -int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr); +int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb); +int uv_convert_from_secure(unsigned long paddr); +int uv_convert_from_secure_folio(struct folio *folio); void setup_uv(void); diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index 6f9654a191ad9..9f05df2da2f73 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -19,19 +19,6 @@ #include #include -#if !IS_ENABLED(CONFIG_KVM) -unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) -{ - return 0; -} - -int gmap_fault(struct gmap *gmap, unsigned long gaddr, - unsigned int fault_flags) -{ - return 0; -} -#endif - /* the bootdata_preserved fields come from ones in arch/s390/boot/uv.c */ int __bootdata_preserved(prot_virt_guest); EXPORT_SYMBOL(prot_virt_guest); @@ -159,6 +146,7 @@ int uv_destroy_folio(struct folio *folio) folio_put(folio); return rc; } +EXPORT_SYMBOL(uv_destroy_folio); /* * The present PTE still indirectly holds a folio reference through the mapping. @@ -175,7 +163,7 @@ int uv_destroy_pte(pte_t pte) * * @paddr: Absolute host address of page to be exported */ -static int uv_convert_from_secure(unsigned long paddr) +int uv_convert_from_secure(unsigned long paddr) { struct uv_cb_cfs uvcb = { .header.cmd = UVC_CMD_CONV_FROM_SEC_STOR, @@ -187,11 +175,12 @@ static int uv_convert_from_secure(unsigned long paddr) return -EINVAL; return 0; } +EXPORT_SYMBOL_GPL(uv_convert_from_secure); /* * The caller must already hold a reference to the folio. */ -static int uv_convert_from_secure_folio(struct folio *folio) +int uv_convert_from_secure_folio(struct folio *folio) { int rc; @@ -206,6 +195,7 @@ static int uv_convert_from_secure_folio(struct folio *folio) folio_put(folio); return rc; } +EXPORT_SYMBOL_GPL(uv_convert_from_secure_folio); /* * The present PTE still indirectly holds a folio reference through the mapping. @@ -237,13 +227,33 @@ static int expected_folio_refs(struct folio *folio) return res; } -static int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) +/** + * make_folio_secure() - make a folio secure + * @folio: the folio to make secure + * @uvcb: the uvcb that describes the UVC to be used + * + * The folio @folio will be made secure if possible, @uvcb will be passed + * as-is to the UVC. + * + * Return: 0 on success; + * -EBUSY if the folio is in writeback or has too many references; + * -E2BIG if the folio is large; + * -EAGAIN if the UVC needs to be attempted again; + * -ENXIO if the address is not mapped; + * -EINVAL if the UVC failed for other reasons. 
+ * + * Context: The caller must hold exactly one extra reference on the folio + * (it's the same logic as split_folio()) + */ +int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) { int expected, cc = 0; + if (folio_test_large(folio)) + return -E2BIG; if (folio_test_writeback(folio)) - return -EAGAIN; - expected = expected_folio_refs(folio); + return -EBUSY; + expected = expected_folio_refs(folio) + 1; if (!folio_ref_freeze(folio, expected)) return -EBUSY; set_bit(PG_arch_1, &folio->flags); @@ -267,251 +277,7 @@ static int make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) return -EAGAIN; return uvcb->rc == 0x10a ? -ENXIO : -EINVAL; } - -/** - * should_export_before_import - Determine whether an export is needed - * before an import-like operation - * @uvcb: the Ultravisor control block of the UVC to be performed - * @mm: the mm of the process - * - * Returns whether an export is needed before every import-like operation. - * This is needed for shared pages, which don't trigger a secure storage - * exception when accessed from a different guest. - * - * Although considered as one, the Unpin Page UVC is not an actual import, - * so it is not affected. - * - * No export is needed also when there is only one protected VM, because the - * page cannot belong to the wrong VM in that case (there is no "other VM" - * it can belong to). - * - * Return: true if an export is needed before every import, otherwise false. - */ -static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm) -{ - /* - * The misc feature indicates, among other things, that importing a - * shared page from a different protected VM will automatically also - * transfer its ownership. - */ - if (uv_has_feature(BIT_UV_FEAT_MISC)) - return false; - if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED) - return false; - return atomic_read(&mm->context.protected_count) > 1; -} - -/* - * Drain LRU caches: the local one on first invocation and the ones of all - * CPUs on successive invocations. Returns "true" on the first invocation. - */ -static bool drain_lru(bool *drain_lru_called) -{ - /* - * If we have tried a local drain and the folio refcount - * still does not match our expected safe value, try with a - * system wide drain. This is needed if the pagevecs holding - * the page are on a different CPU. - */ - if (*drain_lru_called) { - lru_add_drain_all(); - /* We give up here, don't retry immediately. */ - return false; - } - /* - * We are here if the folio refcount does not match the - * expected safe value. The main culprits are usually - * pagevecs. With lru_add_drain() we drain the pagevecs - * on the local CPU so that hopefully the refcount will - * reach the expected safe value. - */ - lru_add_drain(); - *drain_lru_called = true; - /* The caller should try again immediately */ - return true; -} - -/* - * Requests the Ultravisor to make a page accessible to a guest. - * If it's brought in the first time, it will be cleared. If - * it has been exported before, it will be decrypted and integrity - * checked. 
- */ -int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb) -{ - struct vm_area_struct *vma; - bool drain_lru_called = false; - spinlock_t *ptelock; - unsigned long uaddr; - struct folio *folio; - pte_t *ptep; - int rc; - -again: - rc = -EFAULT; - mmap_read_lock(gmap->mm); - - uaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(uaddr)) - goto out; - vma = vma_lookup(gmap->mm, uaddr); - if (!vma) - goto out; - /* - * Secure pages cannot be huge and userspace should not combine both. - * In case userspace does it anyway this will result in an -EFAULT for - * the unpack. The guest is thus never reaching secure mode. If - * userspace is playing dirty tricky with mapping huge pages later - * on this will result in a segmentation fault. - */ - if (is_vm_hugetlb_page(vma)) - goto out; - - rc = -ENXIO; - ptep = get_locked_pte(gmap->mm, uaddr, &ptelock); - if (!ptep) - goto out; - if (pte_present(*ptep) && !(pte_val(*ptep) & _PAGE_INVALID) && pte_write(*ptep)) { - folio = page_folio(pte_page(*ptep)); - rc = -EAGAIN; - if (folio_test_large(folio)) { - rc = -E2BIG; - } else if (folio_trylock(folio)) { - if (should_export_before_import(uvcb, gmap->mm)) - uv_convert_from_secure(PFN_PHYS(folio_pfn(folio))); - rc = make_folio_secure(folio, uvcb); - folio_unlock(folio); - } - - /* - * Once we drop the PTL, the folio may get unmapped and - * freed immediately. We need a temporary reference. - */ - if (rc == -EAGAIN || rc == -E2BIG) - folio_get(folio); - } - pte_unmap_unlock(ptep, ptelock); -out: - mmap_read_unlock(gmap->mm); - - switch (rc) { - case -E2BIG: - folio_lock(folio); - rc = split_folio(folio); - folio_unlock(folio); - folio_put(folio); - - switch (rc) { - case 0: - /* Splitting succeeded, try again immediately. */ - goto again; - case -EAGAIN: - /* Additional folio references. */ - if (drain_lru(&drain_lru_called)) - goto again; - return -EAGAIN; - case -EBUSY: - /* Unexpected race. */ - return -EAGAIN; - } - WARN_ON_ONCE(1); - return -ENXIO; - case -EAGAIN: - /* - * If we are here because the UVC returned busy or partial - * completion, this is just a useless check, but it is safe. - */ - folio_wait_writeback(folio); - folio_put(folio); - return -EAGAIN; - case -EBUSY: - /* Additional folio references. */ - if (drain_lru(&drain_lru_called)) - goto again; - return -EAGAIN; - case -ENXIO: - if (gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE)) - return -EFAULT; - return -EAGAIN; - } - return rc; -} -EXPORT_SYMBOL_GPL(gmap_make_secure); - -int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr) -{ - struct uv_cb_cts uvcb = { - .header.cmd = UVC_CMD_CONV_TO_SEC_STOR, - .header.len = sizeof(uvcb), - .guest_handle = gmap->guest_handle, - .gaddr = gaddr, - }; - - return gmap_make_secure(gmap, gaddr, &uvcb); -} -EXPORT_SYMBOL_GPL(gmap_convert_to_secure); - -/** - * gmap_destroy_page - Destroy a guest page. - * @gmap: the gmap of the guest - * @gaddr: the guest address to destroy - * - * An attempt will be made to destroy the given guest page. If the attempt - * fails, an attempt is made to export the page. If both attempts fail, an - * appropriate error is returned. 
- */ -int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr) -{ - struct vm_area_struct *vma; - struct folio_walk fw; - unsigned long uaddr; - struct folio *folio; - int rc; - - rc = -EFAULT; - mmap_read_lock(gmap->mm); - - uaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(uaddr)) - goto out; - vma = vma_lookup(gmap->mm, uaddr); - if (!vma) - goto out; - /* - * Huge pages should not be able to become secure - */ - if (is_vm_hugetlb_page(vma)) - goto out; - - rc = 0; - folio = folio_walk_start(&fw, vma, uaddr, 0); - if (!folio) - goto out; - /* - * See gmap_make_secure(): large folios cannot be secure. Small - * folio implies FW_LEVEL_PTE. - */ - if (folio_test_large(folio) || !pte_write(fw.pte)) - goto out_walk_end; - rc = uv_destroy_folio(folio); - /* - * Fault handlers can race; it is possible that two CPUs will fault - * on the same secure page. One CPU can destroy the page, reboot, - * re-enter secure mode and import it, while the second CPU was - * stuck at the beginning of the handler. At some point the second - * CPU will be able to progress, and it will not be able to destroy - * the page. In that case we do not want to terminate the process, - * we instead try to export the page. - */ - if (rc) - rc = uv_convert_from_secure_folio(folio); -out_walk_end: - folio_walk_end(&fw, vma); -out: - mmap_read_unlock(gmap->mm); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_destroy_page); +EXPORT_SYMBOL_GPL(make_folio_secure); /* * To be called with the folio locked or with an extra reference! This will diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 02217fb4ae10f..d972dea657fd1 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -8,7 +8,7 @@ include $(srctree)/virt/kvm/Makefile.kvm ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o -kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o +kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap.o kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c new file mode 100644 index 0000000000000..02adf151d4de4 --- /dev/null +++ b/arch/s390/kvm/gmap.c @@ -0,0 +1,212 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Guest memory management for KVM/s390 + * + * Copyright IBM Corp. 2008, 2020, 2024 + * + * Author(s): Claudio Imbrenda + * Martin Schwidefsky + * David Hildenbrand + * Janosch Frank + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "gmap.h" + +/** + * should_export_before_import - Determine whether an export is needed + * before an import-like operation + * @uvcb: the Ultravisor control block of the UVC to be performed + * @mm: the mm of the process + * + * Returns whether an export is needed before every import-like operation. + * This is needed for shared pages, which don't trigger a secure storage + * exception when accessed from a different guest. + * + * Although considered as one, the Unpin Page UVC is not an actual import, + * so it is not affected. + * + * No export is needed also when there is only one protected VM, because the + * page cannot belong to the wrong VM in that case (there is no "other VM" + * it can belong to). + * + * Return: true if an export is needed before every import, otherwise false. 
+ */ +static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm) +{ + /* + * The misc feature indicates, among other things, that importing a + * shared page from a different protected VM will automatically also + * transfer its ownership. + */ + if (uv_has_feature(BIT_UV_FEAT_MISC)) + return false; + if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED) + return false; + return atomic_read(&mm->context.protected_count) > 1; +} + +static int __gmap_make_secure(struct gmap *gmap, struct page *page, void *uvcb) +{ + struct folio *folio = page_folio(page); + int rc; + + /* + * Secure pages cannot be huge and userspace should not combine both. + * In case userspace does it anyway this will result in an -EFAULT for + * the unpack. The guest is thus never reaching secure mode. + * If userspace plays dirty tricks and decides to map huge pages at a + * later point in time, it will receive a segmentation fault or + * KVM_RUN will return -EFAULT. + */ + if (folio_test_hugetlb(folio)) + return -EFAULT; + if (folio_test_large(folio)) { + mmap_read_unlock(gmap->mm); + rc = kvm_s390_wiggle_split_folio(gmap->mm, folio, true); + mmap_read_lock(gmap->mm); + if (rc) + return rc; + folio = page_folio(page); + } + + if (!folio_trylock(folio)) + return -EAGAIN; + if (should_export_before_import(uvcb, gmap->mm)) + uv_convert_from_secure(folio_to_phys(folio)); + rc = make_folio_secure(folio, uvcb); + folio_unlock(folio); + + /* + * In theory a race is possible and the folio might have become + * large again before the folio_trylock() above. In that case, no + * action is performed and -EAGAIN is returned; the callers will + * have to try again later. + * In most cases this implies running the VM again, getting the same + * exception again, and make another attempt in this function. + * This is expected to happen extremely rarely. + */ + if (rc == -E2BIG) + return -EAGAIN; + /* The folio has too many references, try to shake some off */ + if (rc == -EBUSY) { + mmap_read_unlock(gmap->mm); + kvm_s390_wiggle_split_folio(gmap->mm, folio, false); + mmap_read_lock(gmap->mm); + return -EAGAIN; + } + + return rc; +} + +/** + * gmap_make_secure() - make one guest page secure + * @gmap: the guest gmap + * @gaddr: the guest address that needs to be made secure + * @uvcb: the UVCB specifying which operation needs to be performed + * + * Context: needs to be called with kvm->srcu held. + * Return: 0 on success, < 0 in case of error (see __gmap_make_secure()). + */ +int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb) +{ + struct kvm *kvm = gmap->private; + struct page *page; + int rc = 0; + + lockdep_assert_held(&kvm->srcu); + + page = gfn_to_page(kvm, gpa_to_gfn(gaddr)); + mmap_read_lock(gmap->mm); + if (page) + rc = __gmap_make_secure(gmap, page, uvcb); + kvm_release_page_clean(page); + mmap_read_unlock(gmap->mm); + + return rc; +} + +int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr) +{ + struct uv_cb_cts uvcb = { + .header.cmd = UVC_CMD_CONV_TO_SEC_STOR, + .header.len = sizeof(uvcb), + .guest_handle = gmap->guest_handle, + .gaddr = gaddr, + }; + + return gmap_make_secure(gmap, gaddr, &uvcb); +} + +/** + * __gmap_destroy_page() - Destroy a guest page. + * @gmap: the gmap of the guest + * @page: the page to destroy + * + * An attempt will be made to destroy the given guest page. If the attempt + * fails, an attempt is made to export the page. If both attempts fail, an + * appropriate error is returned. 
+ * + * Context: must be called holding the mm lock for gmap->mm + */ +static int __gmap_destroy_page(struct gmap *gmap, struct page *page) +{ + struct folio *folio = page_folio(page); + int rc; + + /* + * See gmap_make_secure(): large folios cannot be secure. Small + * folio implies FW_LEVEL_PTE. + */ + if (folio_test_large(folio)) + return -EFAULT; + + rc = uv_destroy_folio(folio); + /* + * Fault handlers can race; it is possible that two CPUs will fault + * on the same secure page. One CPU can destroy the page, reboot, + * re-enter secure mode and import it, while the second CPU was + * stuck at the beginning of the handler. At some point the second + * CPU will be able to progress, and it will not be able to destroy + * the page. In that case we do not want to terminate the process, + * we instead try to export the page. + */ + if (rc) + rc = uv_convert_from_secure_folio(folio); + + return rc; +} + +/** + * gmap_destroy_page() - Destroy a guest page. + * @gmap: the gmap of the guest + * @gaddr: the guest address to destroy + * + * An attempt will be made to destroy the given guest page. If the attempt + * fails, an attempt is made to export the page. If both attempts fail, an + * appropriate error is returned. + * + * Context: may sleep. + */ +int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr) +{ + struct page *page; + int rc = 0; + + mmap_read_lock(gmap->mm); + page = gfn_to_page(gmap->private, gpa_to_gfn(gaddr)); + if (page) + rc = __gmap_destroy_page(gmap, page); + kvm_release_page_clean(page); + mmap_read_unlock(gmap->mm); + return rc; +} diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h new file mode 100644 index 0000000000000..f2b52ce29be3d --- /dev/null +++ b/arch/s390/kvm/gmap.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KVM guest address space mapping code + * + * Copyright IBM Corp. 2007, 2016, 2025 + * Author(s): Martin Schwidefsky + * Claudio Imbrenda + */ + +#ifndef ARCH_KVM_S390_GMAP_H +#define ARCH_KVM_S390_GMAP_H + +int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb); +int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr); +int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr); + +#endif diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 5bbaadf75dc64..acf10aefd08f0 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -21,6 +21,7 @@ #include "gaccess.h" #include "trace.h" #include "trace-s390.h" +#include "gmap.h" u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu) { @@ -549,7 +550,7 @@ static int handle_pv_uvc(struct kvm_vcpu *vcpu) * If the unpin did not succeed, the guest will exit again for the UVC * and we will retry the unpin. */ - if (rc == -EINVAL) + if (rc == -EINVAL || rc == -ENXIO) return 0; /* * If we got -EAGAIN here, we simply return it. 
It will eventually diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index fc44002a7b04c..a25ca440760f1 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -50,6 +50,7 @@ #include "kvm-s390.h" #include "gaccess.h" #include "pci.h" +#include "gmap.h" #define CREATE_TRACE_POINTS #include "trace.h" diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index 75e81ba26d047..22c012aa5206b 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -17,6 +17,7 @@ #include #include #include "kvm-s390.h" +#include "gmap.h" bool kvm_s390_pv_is_protected(struct kvm *kvm) { @@ -638,10 +639,28 @@ static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak, .tweak[1] = offset, }; int ret = gmap_make_secure(kvm->arch.gmap, addr, &uvcb); + unsigned long vmaddr; + bool unlocked; *rc = uvcb.header.rc; *rrc = uvcb.header.rrc; + if (ret == -ENXIO) { + mmap_read_lock(kvm->mm); + vmaddr = gfn_to_hva(kvm, gpa_to_gfn(addr)); + if (kvm_is_error_hva(vmaddr)) { + ret = -EFAULT; + } else { + ret = fixup_user_fault(kvm->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); + if (!ret) + ret = __gmap_link(kvm->arch.gmap, addr, vmaddr); + } + mmap_read_unlock(kvm->mm); + if (!ret) + return -EAGAIN; + return ret; + } + if (ret && ret != -EAGAIN) KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: failed addr %llx with rc %x rrc %x", uvcb.gaddr, *rc, *rrc); @@ -660,6 +679,8 @@ int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size, KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: start addr %lx size %lx", addr, size); + guard(srcu)(&kvm->srcu); + while (offset < size) { ret = unpack_one(kvm, addr, tweak, offset, rc, rrc); if (ret == -EAGAIN) { diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 16b8a36c56de1..3e6e25119a964 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -3035,3 +3035,31 @@ int s390_replace_asce(struct gmap *gmap) return 0; } EXPORT_SYMBOL_GPL(s390_replace_asce); + +/** + * kvm_s390_wiggle_split_folio() - try to drain extra references to a folio and optionally split + * @mm: the mm containing the folio to work on + * @folio: the folio + * @split: whether to split a large folio + * + * Context: Must be called while holding an extra reference to the folio; + * the mm lock should not be held. + */ +int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio, bool split) +{ + int rc; + + lockdep_assert_not_held(&mm->mmap_lock); + folio_wait_writeback(folio); + lru_add_drain_all(); + if (split) { + folio_lock(folio); + rc = split_folio(folio); + folio_unlock(folio); + + if (rc != -EBUSY) + return rc; + } + return -EAGAIN; +} +EXPORT_SYMBOL_GPL(kvm_s390_wiggle_split_folio); -- GitLab From 3762e905ec2e498c96464e094b7d46be98151d3b Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Thu, 23 Jan 2025 15:46:18 +0100 Subject: [PATCH 086/989] KVM: s390: use __kvm_faultin_pfn() Refactor the existing page fault handling code to use __kvm_faultin_pfn(). This possible now that memslots are always present. 
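
[Editorial illustration, not part of the patch: a minimal sketch of the calling pattern the refactored fault path follows, condensed from the hunk below. The helper name example_fault_in() is hypothetical; the generic KVM helpers (__kvm_faultin_pfn(), kvm_release_faultin_page(), kvm_vcpu_gfn_to_memslot()) and error pfn values are taken from the diff itself, and error handling is abridged.]

	/*
	 * Sketch only: resolve a guest page fault via the generic
	 * fault-in helpers, then drop the page reference again.
	 */
	static int example_fault_in(struct kvm_vcpu *vcpu, gfn_t gfn, unsigned int foll)
	{
		struct kvm_memory_slot *slot;
		struct page *page;
		bool writable;
		kvm_pfn_t pfn;

		slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
		if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
			return -EFAULT;		/* no memslot backs this gfn */

		pfn = __kvm_faultin_pfn(slot, gfn, foll, &writable, &page);
		if (pfn == KVM_PFN_ERR_SIGPENDING)
			return -EAGAIN;		/* signal pending: retry later */
		if (pfn == KVM_PFN_ERR_NEEDS_IO)
			return -EAGAIN;		/* only with FOLL_NOWAIT: would have to wait for I/O */
		if (is_noslot_pfn(pfn) || is_error_pfn(pfn))
			return -EFAULT;

		/* ... arch-specific mapping of pfn into the guest address space ... */

		scoped_guard(spinlock, &vcpu->kvm->mmu_lock)
			kvm_release_faultin_page(vcpu->kvm, page, false, writable);
		return 0;
	}
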
Acked-by: Janosch Frank Reviewed-by: Christoph Schlameuss Link: https://lore.kernel.org/r/20250123144627.312456-7-imbrenda@linux.ibm.com Signed-off-by: Claudio Imbrenda Message-ID: <20250123144627.312456-7-imbrenda@linux.ibm.com> --- arch/s390/kvm/kvm-s390.c | 126 ++++++++++++++++++++++++++++++--------- arch/s390/kvm/kvm-s390.h | 6 ++ arch/s390/mm/gmap.c | 1 + 3 files changed, 106 insertions(+), 27 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index a25ca440760f1..70c98bf127541 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -4786,11 +4786,104 @@ static void kvm_s390_assert_primary_as(struct kvm_vcpu *vcpu) current->thread.gmap_int_code, current->thread.gmap_teid.val); } +/* + * __kvm_s390_handle_dat_fault() - handle a dat fault for the gmap of a vcpu + * @vcpu: the vCPU whose gmap is to be fixed up + * @gfn: the guest frame number used for memslots (including fake memslots) + * @gaddr: the gmap address, does not have to match @gfn for ucontrol gmaps + * @flags: FOLL_* flags + * + * Return: 0 on success, < 0 in case of error. + * Context: The mm lock must not be held before calling. May sleep. + */ +int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags) +{ + struct kvm_memory_slot *slot; + unsigned int fault_flags; + bool writable, unlocked; + unsigned long vmaddr; + struct page *page; + kvm_pfn_t pfn; + int rc; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + return vcpu_post_run_addressing_exception(vcpu); + + fault_flags = flags & FOLL_WRITE ? FAULT_FLAG_WRITE : 0; + if (vcpu->arch.gmap->pfault_enabled) + flags |= FOLL_NOWAIT; + vmaddr = __gfn_to_hva_memslot(slot, gfn); + +try_again: + pfn = __kvm_faultin_pfn(slot, gfn, flags, &writable, &page); + + /* Access outside memory, inject addressing exception */ + if (is_noslot_pfn(pfn)) + return vcpu_post_run_addressing_exception(vcpu); + /* Signal pending: try again */ + if (pfn == KVM_PFN_ERR_SIGPENDING) + return -EAGAIN; + + /* Needs I/O, try to setup async pfault (only possible with FOLL_NOWAIT) */ + if (pfn == KVM_PFN_ERR_NEEDS_IO) { + trace_kvm_s390_major_guest_pfault(vcpu); + if (kvm_arch_setup_async_pf(vcpu)) + return 0; + vcpu->stat.pfault_sync++; + /* Could not setup async pfault, try again synchronously */ + flags &= ~FOLL_NOWAIT; + goto try_again; + } + /* Any other error */ + if (is_error_pfn(pfn)) + return -EFAULT; + + /* Success */ + mmap_read_lock(vcpu->arch.gmap->mm); + /* Mark the userspace PTEs as young and/or dirty, to avoid page fault loops */ + rc = fixup_user_fault(vcpu->arch.gmap->mm, vmaddr, fault_flags, &unlocked); + if (!rc) + rc = __gmap_link(vcpu->arch.gmap, gaddr, vmaddr); + scoped_guard(spinlock, &vcpu->kvm->mmu_lock) { + kvm_release_faultin_page(vcpu->kvm, page, false, writable); + } + mmap_read_unlock(vcpu->arch.gmap->mm); + return rc; +} + +static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int flags) +{ + unsigned long gaddr_tmp; + gfn_t gfn; + + gfn = gpa_to_gfn(gaddr); + if (kvm_is_ucontrol(vcpu->kvm)) { + /* + * This translates the per-vCPU guest address into a + * fake guest address, which can then be used with the + * fake memslots that are identity mapping userspace. + * This allows ucontrol VMs to use the normal fault + * resolution path, like normal VMs. 
+ */ + mmap_read_lock(vcpu->arch.gmap->mm); + gaddr_tmp = __gmap_translate(vcpu->arch.gmap, gaddr); + mmap_read_unlock(vcpu->arch.gmap->mm); + if (gaddr_tmp == -EFAULT) { + vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL; + vcpu->run->s390_ucontrol.trans_exc_code = gaddr; + vcpu->run->s390_ucontrol.pgm_code = PGM_SEGMENT_TRANSLATION; + return -EREMOTE; + } + gfn = gpa_to_gfn(gaddr_tmp); + } + return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, flags); +} + static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) { unsigned int flags = 0; unsigned long gaddr; - int rc = 0; gaddr = current->thread.gmap_teid.addr * PAGE_SIZE; if (kvm_s390_cur_gmap_fault_is_write()) @@ -4842,37 +4935,14 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) case PGM_REGION_SECOND_TRANS: case PGM_REGION_THIRD_TRANS: kvm_s390_assert_primary_as(vcpu); - if (vcpu->arch.gmap->pfault_enabled) { - rc = gmap_fault(vcpu->arch.gmap, gaddr, flags | FAULT_FLAG_RETRY_NOWAIT); - if (rc == -EFAULT) - return vcpu_post_run_addressing_exception(vcpu); - if (rc == -EAGAIN) { - trace_kvm_s390_major_guest_pfault(vcpu); - if (kvm_arch_setup_async_pf(vcpu)) - return 0; - vcpu->stat.pfault_sync++; - } else { - return rc; - } - } - rc = gmap_fault(vcpu->arch.gmap, gaddr, flags); - if (rc == -EFAULT) { - if (kvm_is_ucontrol(vcpu->kvm)) { - vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL; - vcpu->run->s390_ucontrol.trans_exc_code = gaddr; - vcpu->run->s390_ucontrol.pgm_code = 0x10; - return -EREMOTE; - } - return vcpu_post_run_addressing_exception(vcpu); - } - break; + return vcpu_dat_fault_handler(vcpu, gaddr, flags); default: KVM_BUG(1, vcpu->kvm, "Unexpected program interrupt 0x%x, TEID 0x%016lx", current->thread.gmap_int_code, current->thread.gmap_teid.val); send_sig(SIGSEGV, current, 0); break; } - return rc; + return 0; } static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) @@ -5751,7 +5821,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } #endif case KVM_S390_VCPU_FAULT: { - r = gmap_fault(vcpu->arch.gmap, arg, 0); + idx = srcu_read_lock(&vcpu->kvm->srcu); + r = vcpu_dat_fault_handler(vcpu, arg, 0); + srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } case KVM_ENABLE_CAP: diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 30736ac16f848..3be5291723c8e 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -410,6 +410,12 @@ void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu); void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm); __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu); int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc); +int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags); + +static inline int kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gpa_t gaddr, unsigned int flags) +{ + return __kvm_s390_handle_dat_fault(vcpu, gpa_to_gfn(gaddr), gaddr, flags); +} /* implemented in diag.c */ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 3e6e25119a964..bfaba77333067 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -605,6 +605,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) radix_tree_preload_end(); return rc; } +EXPORT_SYMBOL(__gmap_link); /** * fixup_user_fault_nowait - manually resolve a user page fault without waiting -- GitLab From 6eb84e130075b9ea35a946dcf9a2476ac2c749a0 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Thu, 23 Jan 2025 15:46:19 +0100 Subject: [PATCH 087/989] 
KVM: s390: get rid of gmap_fault() All gmap page faults are already handled in kvm by the function kvm_s390_handle_dat_fault(); only few users of gmap_fault remained, all within kvm. Convert those calls to use kvm_s390_handle_dat_fault() instead. Remove gmap_fault() entirely since it has no more users. Acked-by: Janosch Frank Reviewed-by: Christoph Schlameuss Link: https://lore.kernel.org/r/20250123144627.312456-8-imbrenda@linux.ibm.com Signed-off-by: Claudio Imbrenda Message-ID: <20250123144627.312456-8-imbrenda@linux.ibm.com> --- arch/s390/include/asm/gmap.h | 1 - arch/s390/kvm/intercept.c | 4 +- arch/s390/mm/gmap.c | 124 ----------------------------------- 3 files changed, 2 insertions(+), 127 deletions(-) diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 3e66f53fe3ccc..d4572729269f4 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -113,7 +113,6 @@ int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len); unsigned long __gmap_translate(struct gmap *, unsigned long gaddr); unsigned long gmap_translate(struct gmap *, unsigned long gaddr); int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr); -int gmap_fault(struct gmap *, unsigned long gaddr, unsigned int fault_flags); void gmap_discard(struct gmap *, unsigned long from, unsigned long to); void __gmap_zap(struct gmap *, unsigned long gaddr); void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr); diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index acf10aefd08f0..610dd44a948b2 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -368,7 +368,7 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu) reg2, &srcaddr, GACC_FETCH, 0); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); - rc = gmap_fault(vcpu->arch.gmap, srcaddr, 0); + rc = kvm_s390_handle_dat_fault(vcpu, srcaddr, 0); if (rc != 0) return rc; @@ -377,7 +377,7 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu) reg1, &dstaddr, GACC_STORE, 0); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); - rc = gmap_fault(vcpu->arch.gmap, dstaddr, FAULT_FLAG_WRITE); + rc = kvm_s390_handle_dat_fault(vcpu, dstaddr, FOLL_WRITE); if (rc != 0) return rc; diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index bfaba77333067..e124fca147377 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -607,130 +607,6 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) } EXPORT_SYMBOL(__gmap_link); -/** - * fixup_user_fault_nowait - manually resolve a user page fault without waiting - * @mm: mm_struct of target mm - * @address: user address - * @fault_flags:flags to pass down to handle_mm_fault() - * @unlocked: did we unlock the mmap_lock while retrying - * - * This function behaves similarly to fixup_user_fault(), but it guarantees - * that the fault will be resolved without waiting. The function might drop - * and re-acquire the mm lock, in which case @unlocked will be set to true. - * - * The guarantee is that the fault is handled without waiting, but the - * function itself might sleep, due to the lock. - * - * Context: Needs to be called with mm->mmap_lock held in read mode, and will - * return with the lock held in read mode; @unlocked will indicate whether - * the lock has been dropped and re-acquired. This is the same behaviour as - * fixup_user_fault(). 
- * - * Return: 0 on success, -EAGAIN if the fault cannot be resolved without - * waiting, -EFAULT if the fault cannot be resolved, -ENOMEM if out of - * memory. - */ -static int fixup_user_fault_nowait(struct mm_struct *mm, unsigned long address, - unsigned int fault_flags, bool *unlocked) -{ - struct vm_area_struct *vma; - unsigned int test_flags; - vm_fault_t fault; - int rc; - - fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; - test_flags = fault_flags & FAULT_FLAG_WRITE ? VM_WRITE : VM_READ; - - vma = find_vma(mm, address); - if (unlikely(!vma || address < vma->vm_start)) - return -EFAULT; - if (unlikely(!(vma->vm_flags & test_flags))) - return -EFAULT; - - fault = handle_mm_fault(vma, address, fault_flags, NULL); - /* the mm lock has been dropped, take it again */ - if (fault & VM_FAULT_COMPLETED) { - *unlocked = true; - mmap_read_lock(mm); - return 0; - } - /* the mm lock has not been dropped */ - if (fault & VM_FAULT_ERROR) { - rc = vm_fault_to_errno(fault, 0); - BUG_ON(!rc); - return rc; - } - /* the mm lock has not been dropped because of FAULT_FLAG_RETRY_NOWAIT */ - if (fault & VM_FAULT_RETRY) - return -EAGAIN; - /* nothing needed to be done and the mm lock has not been dropped */ - return 0; -} - -/** - * __gmap_fault - resolve a fault on a guest address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * @fault_flags: flags to pass down to handle_mm_fault() - * - * Context: Needs to be called with mm->mmap_lock held in read mode. Might - * drop and re-acquire the lock. Will always return with the lock held. - */ -static int __gmap_fault(struct gmap *gmap, unsigned long gaddr, unsigned int fault_flags) -{ - unsigned long vmaddr; - bool unlocked; - int rc = 0; - -retry: - unlocked = false; - - vmaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(vmaddr)) - return vmaddr; - - if (fault_flags & FAULT_FLAG_RETRY_NOWAIT) - rc = fixup_user_fault_nowait(gmap->mm, vmaddr, fault_flags, &unlocked); - else - rc = fixup_user_fault(gmap->mm, vmaddr, fault_flags, &unlocked); - if (rc) - return rc; - /* - * In the case that fixup_user_fault unlocked the mmap_lock during - * fault-in, redo __gmap_translate() to avoid racing with a - * map/unmap_segment. - * In particular, __gmap_translate(), fixup_user_fault{,_nowait}(), - * and __gmap_link() must all be called atomically in one go; if the - * lock had been dropped in between, a retry is needed. - */ - if (unlocked) - goto retry; - - return __gmap_link(gmap, gaddr, vmaddr); -} - -/** - * gmap_fault - resolve a fault on a guest address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * @fault_flags: flags to pass down to handle_mm_fault() - * - * Returns 0 on success, -ENOMEM for out of memory conditions, -EFAULT if the - * vm address is already mapped to a different guest segment, and -EAGAIN if - * FAULT_FLAG_RETRY_NOWAIT was specified and the fault could not be processed - * immediately. 
- */ -int gmap_fault(struct gmap *gmap, unsigned long gaddr, unsigned int fault_flags) -{ - int rc; - - mmap_read_lock(gmap->mm); - rc = __gmap_fault(gmap, gaddr, fault_flags); - mmap_read_unlock(gmap->mm); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_fault); - /* * this function is assumed to be called with mmap_lock held */ -- GitLab From d41993f71385ce7e9661c203e02a588a93a59b24 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Thu, 23 Jan 2025 15:46:20 +0100 Subject: [PATCH 088/989] KVM: s390: get rid of gmap_translate() Add gpa_to_hva(), which uses memslots, and use it to replace all uses of gmap_translate(). Reviewed-by: Janosch Frank Reviewed-by: Christoph Schlameuss Link: https://lore.kernel.org/r/20250123144627.312456-9-imbrenda@linux.ibm.com Signed-off-by: Claudio Imbrenda Message-ID: <20250123144627.312456-9-imbrenda@linux.ibm.com> --- arch/s390/include/asm/gmap.h | 1 - arch/s390/kvm/interrupt.c | 19 +++++++++++-------- arch/s390/kvm/kvm-s390.h | 9 +++++++++ arch/s390/mm/gmap.c | 20 -------------------- 4 files changed, 20 insertions(+), 29 deletions(-) diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index d4572729269f4..74b48f2e608a8 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -111,7 +111,6 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from, unsigned long to, unsigned long len); int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len); unsigned long __gmap_translate(struct gmap *, unsigned long gaddr); -unsigned long gmap_translate(struct gmap *, unsigned long gaddr); int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr); void gmap_discard(struct gmap *, unsigned long from, unsigned long to); void __gmap_zap(struct gmap *, unsigned long gaddr); diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index d4f031e086fc3..07ff0e10cb7f5 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2893,7 +2893,8 @@ int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { - u64 uaddr; + u64 uaddr_s, uaddr_i; + int idx; switch (ue->type) { /* we store the userspace addresses instead of the guest addresses */ @@ -2901,14 +2902,16 @@ int kvm_set_routing_entry(struct kvm *kvm, if (kvm_is_ucontrol(kvm)) return -EINVAL; e->set = set_adapter_int; - uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.summary_addr); - if (uaddr == -EFAULT) - return -EFAULT; - e->adapter.summary_addr = uaddr; - uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.ind_addr); - if (uaddr == -EFAULT) + + idx = srcu_read_lock(&kvm->srcu); + uaddr_s = gpa_to_hva(kvm, ue->u.adapter.summary_addr); + uaddr_i = gpa_to_hva(kvm, ue->u.adapter.ind_addr); + srcu_read_unlock(&kvm->srcu, idx); + + if (kvm_is_error_hva(uaddr_s) || kvm_is_error_hva(uaddr_i)) return -EFAULT; - e->adapter.ind_addr = uaddr; + e->adapter.summary_addr = uaddr_s; + e->adapter.ind_addr = uaddr_i; e->adapter.summary_offset = ue->u.adapter.summary_offset; e->adapter.ind_offset = ue->u.adapter.ind_offset; e->adapter.adapter_id = ue->u.adapter.adapter_id; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 3be5291723c8e..61e8544924b34 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -281,6 +281,15 @@ static inline u32 kvm_s390_get_gisa_desc(struct kvm *kvm) return gd; } +static inline hva_t gpa_to_hva(struct kvm *kvm, gpa_t gpa) +{ + hva_t hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); + + if 
(!kvm_is_error_hva(hva)) + hva |= offset_in_page(gpa); + return hva; +} + /* implemented in pv.c */ int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index e124fca147377..7fd298732d1e7 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -463,26 +463,6 @@ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) } EXPORT_SYMBOL_GPL(__gmap_translate); -/** - * gmap_translate - translate a guest address to a user space address - * @gmap: pointer to guest mapping meta data structure - * @gaddr: guest address - * - * Returns user space address which corresponds to the guest address or - * -EFAULT if no such mapping exists. - * This function does not establish potentially missing page table entries. - */ -unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr) -{ - unsigned long rc; - - mmap_read_lock(gmap->mm); - rc = __gmap_translate(gmap, gaddr); - mmap_read_unlock(gmap->mm); - return rc; -} -EXPORT_SYMBOL_GPL(gmap_translate); - /** * gmap_unlink - disconnect a page table from the gmap shadow tables * @mm: pointer to the parent mm_struct -- GitLab From c9f721ed8ec6942dad951d2d8c4fca291170165e Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Thu, 23 Jan 2025 15:46:21 +0100 Subject: [PATCH 089/989] KVM: s390: move some gmap shadowing functions away from mm/gmap.c Move some gmap shadowing functions from mm/gmap.c to kvm/kvm-s390.c and the newly created kvm/gmap-vsie.c This is a step toward removing gmap from mm. Reviewed-by: Janosch Frank Reviewed-by: Christoph Schlameuss Link: https://lore.kernel.org/r/20250123144627.312456-10-imbrenda@linux.ibm.com Signed-off-by: Claudio Imbrenda Message-ID: <20250123144627.312456-10-imbrenda@linux.ibm.com> --- arch/s390/include/asm/gmap.h | 9 +- arch/s390/kvm/Makefile | 2 +- arch/s390/kvm/gmap-vsie.c | 142 +++++++++++++++++++++ arch/s390/kvm/gmap.h | 20 +++ arch/s390/kvm/kvm-s390.c | 74 ++++++++++- arch/s390/kvm/kvm-s390.h | 2 + arch/s390/kvm/vsie.c | 2 + arch/s390/mm/gmap.c | 238 +++++------------------------------ 8 files changed, 271 insertions(+), 218 deletions(-) create mode 100644 arch/s390/kvm/gmap-vsie.c diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 74b48f2e608a8..dbf2329281d20 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -106,6 +106,8 @@ struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit); void gmap_remove(struct gmap *gmap); struct gmap *gmap_get(struct gmap *gmap); void gmap_put(struct gmap *gmap); +void gmap_free(struct gmap *gmap); +struct gmap *gmap_alloc(unsigned long limit); int gmap_map_segment(struct gmap *gmap, unsigned long from, unsigned long to, unsigned long len); @@ -118,9 +120,7 @@ void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr) int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val); -struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, - int edat_level); -int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level); +void gmap_unshadow(struct gmap *sg); int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, int fake); int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, @@ -136,8 +136,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte); void gmap_register_pte_notifier(struct gmap_notifier *); void 
gmap_unregister_pte_notifier(struct gmap_notifier *); -int gmap_mprotect_notify(struct gmap *, unsigned long start, - unsigned long len, int prot); +int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits); void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4], unsigned long gaddr, unsigned long vmaddr); diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index d972dea657fd1..f0ffe874adc21 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -8,7 +8,7 @@ include $(srctree)/virt/kvm/Makefile.kvm ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o -kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap.o +kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap.o gmap-vsie.o kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/gmap-vsie.c b/arch/s390/kvm/gmap-vsie.c new file mode 100644 index 0000000000000..a6d1dbb04c970 --- /dev/null +++ b/arch/s390/kvm/gmap-vsie.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Guest memory management for KVM/s390 nested VMs. + * + * Copyright IBM Corp. 2008, 2020, 2024 + * + * Author(s): Claudio Imbrenda + * Martin Schwidefsky + * David Hildenbrand + * Janosch Frank + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "kvm-s390.h" +#include "gmap.h" + +/** + * gmap_find_shadow - find a specific asce in the list of shadow tables + * @parent: pointer to the parent gmap + * @asce: ASCE for which the shadow table is created + * @edat_level: edat level to be used for the shadow translation + * + * Returns the pointer to a gmap if a shadow table with the given asce is + * already available, ERR_PTR(-EAGAIN) if another one is just being created, + * otherwise NULL + * + * Context: Called with parent->shadow_lock held + */ +static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, int edat_level) +{ + struct gmap *sg; + + lockdep_assert_held(&parent->shadow_lock); + list_for_each_entry(sg, &parent->children, list) { + if (!gmap_shadow_valid(sg, asce, edat_level)) + continue; + if (!sg->initialized) + return ERR_PTR(-EAGAIN); + refcount_inc(&sg->ref_count); + return sg; + } + return NULL; +} + +/** + * gmap_shadow - create/find a shadow guest address space + * @parent: pointer to the parent gmap + * @asce: ASCE for which the shadow table is created + * @edat_level: edat level to be used for the shadow translation + * + * The pages of the top level page table referred by the asce parameter + * will be set to read-only and marked in the PGSTEs of the kvm process. + * The shadow table will be removed automatically on any change to the + * PTE mapping for the source table. + * + * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, + * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the + * parent gmap table could not be protected. 
+ */ +struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level) +{ + struct gmap *sg, *new; + unsigned long limit; + int rc; + + if (KVM_BUG_ON(parent->mm->context.allow_gmap_hpage_1m, (struct kvm *)parent->private) || + KVM_BUG_ON(gmap_is_shadow(parent), (struct kvm *)parent->private)) + return ERR_PTR(-EFAULT); + spin_lock(&parent->shadow_lock); + sg = gmap_find_shadow(parent, asce, edat_level); + spin_unlock(&parent->shadow_lock); + if (sg) + return sg; + /* Create a new shadow gmap */ + limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); + if (asce & _ASCE_REAL_SPACE) + limit = -1UL; + new = gmap_alloc(limit); + if (!new) + return ERR_PTR(-ENOMEM); + new->mm = parent->mm; + new->parent = gmap_get(parent); + new->private = parent->private; + new->orig_asce = asce; + new->edat_level = edat_level; + new->initialized = false; + spin_lock(&parent->shadow_lock); + /* Recheck if another CPU created the same shadow */ + sg = gmap_find_shadow(parent, asce, edat_level); + if (sg) { + spin_unlock(&parent->shadow_lock); + gmap_free(new); + return sg; + } + if (asce & _ASCE_REAL_SPACE) { + /* only allow one real-space gmap shadow */ + list_for_each_entry(sg, &parent->children, list) { + if (sg->orig_asce & _ASCE_REAL_SPACE) { + spin_lock(&sg->guest_table_lock); + gmap_unshadow(sg); + spin_unlock(&sg->guest_table_lock); + list_del(&sg->list); + gmap_put(sg); + break; + } + } + } + refcount_set(&new->ref_count, 2); + list_add(&new->list, &parent->children); + if (asce & _ASCE_REAL_SPACE) { + /* nothing to protect, return right away */ + new->initialized = true; + spin_unlock(&parent->shadow_lock); + return new; + } + spin_unlock(&parent->shadow_lock); + /* protect after insertion, so it will get properly invalidated */ + mmap_read_lock(parent->mm); + rc = __kvm_s390_mprotect_many(parent, asce & _ASCE_ORIGIN, + ((asce & _ASCE_TABLE_LENGTH) + 1), + PROT_READ, GMAP_NOTIFY_SHADOW); + mmap_read_unlock(parent->mm); + spin_lock(&parent->shadow_lock); + new->initialized = true; + if (rc) { + list_del(&new->list); + gmap_free(new); + new = ERR_PTR(rc); + } + spin_unlock(&parent->shadow_lock); + return new; +} diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h index f2b52ce29be3d..978f541059f02 100644 --- a/arch/s390/kvm/gmap.h +++ b/arch/s390/kvm/gmap.h @@ -13,5 +13,25 @@ int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb); int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr); int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr); +struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level); + +/** + * gmap_shadow_valid - check if a shadow guest address space matches the + * given properties and is still valid + * @sg: pointer to the shadow guest address space structure + * @asce: ASCE for which the shadow table is requested + * @edat_level: edat level to be used for the shadow translation + * + * Returns 1 if the gmap shadow is still valid and matches the given + * properties, the caller can continue using it. Returns 0 otherwise, the + * caller has to request a new shadow gmap in this case. 
+ * + */ +static inline int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) +{ + if (sg->removed) + return 0; + return sg->orig_asce == asce && sg->edat_level == edat_level; +} #endif diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 70c98bf127541..ebecb96bacce7 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -4511,6 +4511,75 @@ static bool ibs_enabled(struct kvm_vcpu *vcpu) return kvm_s390_test_cpuflags(vcpu, CPUSTAT_IBS); } +static int __kvm_s390_fixup_fault_sync(struct gmap *gmap, gpa_t gaddr, unsigned int flags) +{ + struct kvm *kvm = gmap->private; + gfn_t gfn = gpa_to_gfn(gaddr); + bool unlocked; + hva_t vmaddr; + gpa_t tmp; + int rc; + + if (kvm_is_ucontrol(kvm)) { + tmp = __gmap_translate(gmap, gaddr); + gfn = gpa_to_gfn(tmp); + } + + vmaddr = gfn_to_hva(kvm, gfn); + rc = fixup_user_fault(gmap->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); + if (!rc) + rc = __gmap_link(gmap, gaddr, vmaddr); + return rc; +} + +/** + * __kvm_s390_mprotect_many() - Apply specified protection to guest pages + * @gmap: the gmap of the guest + * @gpa: the starting guest address + * @npages: how many pages to protect + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: pgste notification bits to set + * + * Returns: 0 in case of success, < 0 in case of error - see gmap_protect_one() + * + * Context: kvm->srcu and gmap->mm need to be held in read mode + */ +int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot, + unsigned long bits) +{ + unsigned int fault_flag = (prot & PROT_WRITE) ? FAULT_FLAG_WRITE : 0; + gpa_t end = gpa + npages * PAGE_SIZE; + int rc; + + for (; gpa < end; gpa = ALIGN(gpa + 1, rc)) { + rc = gmap_protect_one(gmap, gpa, prot, bits); + if (rc == -EAGAIN) { + __kvm_s390_fixup_fault_sync(gmap, gpa, fault_flag); + rc = gmap_protect_one(gmap, gpa, prot, bits); + } + if (rc < 0) + return rc; + } + + return 0; +} + +static int kvm_s390_mprotect_notify_prefix(struct kvm_vcpu *vcpu) +{ + gpa_t gaddr = kvm_s390_get_prefix(vcpu); + int idx, rc; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + mmap_read_lock(vcpu->arch.gmap->mm); + + rc = __kvm_s390_mprotect_many(vcpu->arch.gmap, gaddr, 2, PROT_WRITE, GMAP_NOTIFY_MPROT); + + mmap_read_unlock(vcpu->arch.gmap->mm); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + return rc; +} + static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) { retry: @@ -4526,9 +4595,8 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) */ if (kvm_check_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu)) { int rc; - rc = gmap_mprotect_notify(vcpu->arch.gmap, - kvm_s390_get_prefix(vcpu), - PAGE_SIZE * 2, PROT_WRITE); + + rc = kvm_s390_mprotect_notify_prefix(vcpu); if (rc) { kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); return rc; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 61e8544924b34..8d3bbb2dd8d27 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -420,6 +420,8 @@ void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm); __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu); int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc); int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags); +int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot, + unsigned long bits); static inline int kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gpa_t gaddr, unsigned int flags) { diff --git a/arch/s390/kvm/vsie.c 
b/arch/s390/kvm/vsie.c index a0398ff85d00b..a78df3a4f3530 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -22,6 +23,7 @@ #include #include "kvm-s390.h" #include "gaccess.h" +#include "gmap.h" enum vsie_page_flags { VSIE_PAGE_IN_USE = 0, diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 7fd298732d1e7..ae71b401312bf 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -43,7 +43,7 @@ static struct page *gmap_alloc_crst(void) * * Returns a guest address space structure. */ -static struct gmap *gmap_alloc(unsigned long limit) +struct gmap *gmap_alloc(unsigned long limit) { struct gmap *gmap; struct page *page; @@ -97,6 +97,7 @@ static struct gmap *gmap_alloc(unsigned long limit) out: return NULL; } +EXPORT_SYMBOL_GPL(gmap_alloc); /** * gmap_create - create a guest address space @@ -191,7 +192,7 @@ static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) * * No locks required. There are no references to this gmap anymore. */ -static void gmap_free(struct gmap *gmap) +void gmap_free(struct gmap *gmap) { struct page *page, *next; @@ -218,6 +219,7 @@ static void gmap_free(struct gmap *gmap) kfree(gmap); } +EXPORT_SYMBOL_GPL(gmap_free); /** * gmap_get - increase reference counter for guest address space @@ -958,86 +960,40 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE * @bits: pgste notification bits to set * - * Returns 0 if successfully protected, -ENOMEM if out of memory and - * -EFAULT if gaddr is invalid (or mapping for shadows is missing). + * Returns: + * PAGE_SIZE if a small page was successfully protected; + * HPAGE_SIZE if a large page was successfully protected; + * -ENOMEM if out of memory; + * -EFAULT if gaddr is invalid (or mapping for shadows is missing); + * -EAGAIN if the guest mapping is missing and should be fixed by the caller. * - * Called with sg->mm->mmap_lock in read. + * Context: Called with sg->mm->mmap_lock in read. */ -static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, - unsigned long len, int prot, unsigned long bits) +int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits) { - unsigned long vmaddr, dist; pmd_t *pmdp; - int rc; + int rc = 0; BUG_ON(gmap_is_shadow(gmap)); - while (len) { - rc = -EAGAIN; - pmdp = gmap_pmd_op_walk(gmap, gaddr); - if (pmdp) { - if (!pmd_leaf(*pmdp)) { - rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, - bits); - if (!rc) { - len -= PAGE_SIZE; - gaddr += PAGE_SIZE; - } - } else { - rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, - bits); - if (!rc) { - dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK); - len = len < dist ? 
0 : len - dist; - gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE; - } - } - gmap_pmd_op_end(gmap, pmdp); - } - if (rc) { - if (rc == -EINVAL) - return rc; - /* -EAGAIN, fixup of userspace mm and gmap */ - vmaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(vmaddr)) - return vmaddr; - rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot); - if (rc) - return rc; - } - } - return 0; -} + pmdp = gmap_pmd_op_walk(gmap, gaddr); + if (!pmdp) + return -EAGAIN; -/** - * gmap_mprotect_notify - change access rights for a range of ptes and - * call the notifier if any pte changes again - * @gmap: pointer to guest mapping meta data structure - * @gaddr: virtual address in the guest address space - * @len: size of area - * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE - * - * Returns 0 if for each page in the given range a gmap mapping exists, - * the new access rights could be set and the notifier could be armed. - * If the gmap mapping is missing for one or more pages -EFAULT is - * returned. If no memory could be allocated -ENOMEM is returned. - * This function establishes missing page table entries. - */ -int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr, - unsigned long len, int prot) -{ - int rc; + if (!pmd_leaf(*pmdp)) { + rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits); + if (!rc) + rc = PAGE_SIZE; + } else { + rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, bits); + if (!rc) + rc = HPAGE_SIZE; + } + gmap_pmd_op_end(gmap, pmdp); - if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap)) - return -EINVAL; - if (!MACHINE_HAS_ESOP && prot == PROT_READ) - return -EINVAL; - mmap_read_lock(gmap->mm); - rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT); - mmap_read_unlock(gmap->mm); return rc; } -EXPORT_SYMBOL_GPL(gmap_mprotect_notify); +EXPORT_SYMBOL_GPL(gmap_protect_one); /** * gmap_read_table - get an unsigned long value from a guest page table using @@ -1488,7 +1444,7 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, * * Called with sg->guest_table_lock */ -static void gmap_unshadow(struct gmap *sg) +void gmap_unshadow(struct gmap *sg) { unsigned long *table; @@ -1514,143 +1470,7 @@ static void gmap_unshadow(struct gmap *sg) break; } } - -/** - * gmap_find_shadow - find a specific asce in the list of shadow tables - * @parent: pointer to the parent gmap - * @asce: ASCE for which the shadow table is created - * @edat_level: edat level to be used for the shadow translation - * - * Returns the pointer to a gmap if a shadow table with the given asce is - * already available, ERR_PTR(-EAGAIN) if another one is just being created, - * otherwise NULL - */ -static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, - int edat_level) -{ - struct gmap *sg; - - list_for_each_entry(sg, &parent->children, list) { - if (sg->orig_asce != asce || sg->edat_level != edat_level || - sg->removed) - continue; - if (!sg->initialized) - return ERR_PTR(-EAGAIN); - refcount_inc(&sg->ref_count); - return sg; - } - return NULL; -} - -/** - * gmap_shadow_valid - check if a shadow guest address space matches the - * given properties and is still valid - * @sg: pointer to the shadow guest address space structure - * @asce: ASCE for which the shadow table is requested - * @edat_level: edat level to be used for the shadow translation - * - * Returns 1 if the gmap shadow is still valid and matches the given - * properties, the caller can continue using it. 
Returns 0 otherwise, the - * caller has to request a new shadow gmap in this case. - * - */ -int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) -{ - if (sg->removed) - return 0; - return sg->orig_asce == asce && sg->edat_level == edat_level; -} -EXPORT_SYMBOL_GPL(gmap_shadow_valid); - -/** - * gmap_shadow - create/find a shadow guest address space - * @parent: pointer to the parent gmap - * @asce: ASCE for which the shadow table is created - * @edat_level: edat level to be used for the shadow translation - * - * The pages of the top level page table referred by the asce parameter - * will be set to read-only and marked in the PGSTEs of the kvm process. - * The shadow table will be removed automatically on any change to the - * PTE mapping for the source table. - * - * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, - * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the - * parent gmap table could not be protected. - */ -struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, - int edat_level) -{ - struct gmap *sg, *new; - unsigned long limit; - int rc; - - BUG_ON(parent->mm->context.allow_gmap_hpage_1m); - BUG_ON(gmap_is_shadow(parent)); - spin_lock(&parent->shadow_lock); - sg = gmap_find_shadow(parent, asce, edat_level); - spin_unlock(&parent->shadow_lock); - if (sg) - return sg; - /* Create a new shadow gmap */ - limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); - if (asce & _ASCE_REAL_SPACE) - limit = -1UL; - new = gmap_alloc(limit); - if (!new) - return ERR_PTR(-ENOMEM); - new->mm = parent->mm; - new->parent = gmap_get(parent); - new->private = parent->private; - new->orig_asce = asce; - new->edat_level = edat_level; - new->initialized = false; - spin_lock(&parent->shadow_lock); - /* Recheck if another CPU created the same shadow */ - sg = gmap_find_shadow(parent, asce, edat_level); - if (sg) { - spin_unlock(&parent->shadow_lock); - gmap_free(new); - return sg; - } - if (asce & _ASCE_REAL_SPACE) { - /* only allow one real-space gmap shadow */ - list_for_each_entry(sg, &parent->children, list) { - if (sg->orig_asce & _ASCE_REAL_SPACE) { - spin_lock(&sg->guest_table_lock); - gmap_unshadow(sg); - spin_unlock(&sg->guest_table_lock); - list_del(&sg->list); - gmap_put(sg); - break; - } - } - } - refcount_set(&new->ref_count, 2); - list_add(&new->list, &parent->children); - if (asce & _ASCE_REAL_SPACE) { - /* nothing to protect, return right away */ - new->initialized = true; - spin_unlock(&parent->shadow_lock); - return new; - } - spin_unlock(&parent->shadow_lock); - /* protect after insertion, so it will get properly invalidated */ - mmap_read_lock(parent->mm); - rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, - ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE, - PROT_READ, GMAP_NOTIFY_SHADOW); - mmap_read_unlock(parent->mm); - spin_lock(&parent->shadow_lock); - new->initialized = true; - if (rc) { - list_del(&new->list); - gmap_free(new); - new = ERR_PTR(rc); - } - spin_unlock(&parent->shadow_lock); - return new; -} -EXPORT_SYMBOL_GPL(gmap_shadow); +EXPORT_SYMBOL(gmap_unshadow); /** * gmap_shadow_r2t - create an empty shadow region 2 table -- GitLab From 37d1b5d8d588a9761e47d9941005e2da7def8310 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Thu, 23 Jan 2025 15:46:22 +0100 Subject: [PATCH 090/989] KVM: s390: stop using page->index for non-shadow gmaps The host_to_guest radix tree will now map userspace addresses to guest addresses, instead of userspace addresses to segment tables. 
When segment tables and page tables are needed, they are found using an additional gmap_table_walk(). This gets rid of all usage of page->index for non-shadow gmaps. Reviewed-by: Janosch Frank Reviewed-by: Christoph Schlameuss Link: https://lore.kernel.org/r/20250123144627.312456-11-imbrenda@linux.ibm.com Signed-off-by: Claudio Imbrenda Message-ID: <20250123144627.312456-11-imbrenda@linux.ibm.com> --- arch/s390/mm/gmap.c | 105 +++++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 51 deletions(-) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index ae71b401312bf..1f83262a5a552 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -24,8 +24,20 @@ #include #include +/* + * The address is saved in a radix tree directly; NULL would be ambiguous, + * since 0 is a valid address, and NULL is returned when nothing was found. + * The lower bits are ignored by all users of the macro, so it can be used + * to distinguish a valid address 0 from a NULL. + */ +#define VALID_GADDR_FLAG 1 +#define IS_GADDR_VALID(gaddr) ((gaddr) & VALID_GADDR_FLAG) +#define MAKE_VALID_GADDR(gaddr) (((gaddr) & HPAGE_MASK) | VALID_GADDR_FLAG) + #define GMAP_SHADOW_FAKE_TABLE 1ULL +static inline unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level); + static struct page *gmap_alloc_crst(void) { struct page *page; @@ -82,7 +94,6 @@ struct gmap *gmap_alloc(unsigned long limit) page = gmap_alloc_crst(); if (!page) goto out_free; - page->index = 0; list_add(&page->lru, &gmap->crst_list); table = page_to_virt(page); crst_table_init(table, etype); @@ -303,7 +314,6 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, list_add(&page->lru, &gmap->crst_list); *table = __pa(new) | _REGION_ENTRY_LENGTH | (*table & _REGION_ENTRY_TYPE_MASK); - page->index = gaddr; page = NULL; } spin_unlock(&gmap->guest_table_lock); @@ -312,21 +322,23 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, return 0; } -/** - * __gmap_segment_gaddr - find virtual address from segment pointer - * @entry: pointer to a segment table entry in the guest address space - * - * Returns the virtual address in the guest address space for the segment - */ -static unsigned long __gmap_segment_gaddr(unsigned long *entry) +static unsigned long host_to_guest_lookup(struct gmap *gmap, unsigned long vmaddr) { - struct page *page; - unsigned long offset; + return (unsigned long)radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); +} - offset = (unsigned long) entry / sizeof(unsigned long); - offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE; - page = pmd_pgtable_page((pmd_t *) entry); - return page->index + offset; +static unsigned long host_to_guest_delete(struct gmap *gmap, unsigned long vmaddr) +{ + return (unsigned long)radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); +} + +static pmd_t *host_to_guest_pmd_delete(struct gmap *gmap, unsigned long vmaddr, + unsigned long *gaddr) +{ + *gaddr = host_to_guest_delete(gmap, vmaddr); + if (IS_GADDR_VALID(*gaddr)) + return (pmd_t *)gmap_table_walk(gmap, *gaddr, 1); + return NULL; } /** @@ -338,16 +350,19 @@ static unsigned long __gmap_segment_gaddr(unsigned long *entry) */ static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) { - unsigned long *entry; + unsigned long gaddr; int flush = 0; + pmd_t *pmdp; BUG_ON(gmap_is_shadow(gmap)); spin_lock(&gmap->guest_table_lock); - entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); - if (entry) { - flush = (*entry != 
_SEGMENT_ENTRY_EMPTY); - *entry = _SEGMENT_ENTRY_EMPTY; + + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { + flush = (pmd_val(*pmdp) != _SEGMENT_ENTRY_EMPTY); + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); } + spin_unlock(&gmap->guest_table_lock); return flush; } @@ -564,7 +579,8 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) spin_lock(&gmap->guest_table_lock); if (*table == _SEGMENT_ENTRY_EMPTY) { rc = radix_tree_insert(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT, table); + vmaddr >> PMD_SHIFT, + (void *)MAKE_VALID_GADDR(gaddr)); if (!rc) { if (pmd_leaf(*pmd)) { *table = (pmd_val(*pmd) & @@ -1995,7 +2011,6 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte, unsigned long bits) { unsigned long offset, gaddr = 0; - unsigned long *table; struct gmap *gmap, *sg, *next; offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); @@ -2003,12 +2018,9 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); - table = radix_tree_lookup(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT); - if (table) - gaddr = __gmap_segment_gaddr(table) + offset; + gaddr = host_to_guest_lookup(gmap, vmaddr) + offset; spin_unlock(&gmap->guest_table_lock); - if (!table) + if (!IS_GADDR_VALID(gaddr)) continue; if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) { @@ -2068,10 +2080,8 @@ static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); - pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT); + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); if (pmdp) { - gaddr = __gmap_segment_gaddr((unsigned long *)pmdp); pmdp_notify_gmap(gmap, pmdp, gaddr); WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | _SEGMENT_ENTRY_GMAP_UC | @@ -2115,28 +2125,25 @@ EXPORT_SYMBOL_GPL(gmap_pmdp_csp); */ void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) { - unsigned long *entry, gaddr; + unsigned long gaddr; struct gmap *gmap; pmd_t *pmdp; rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); - entry = radix_tree_delete(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT); - if (entry) { - pmdp = (pmd_t *)entry; - gaddr = __gmap_segment_gaddr(entry); + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC | - _SEGMENT_ENTRY)); + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); if (MACHINE_HAS_TLB_GUEST) __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_LOCAL); else if (MACHINE_HAS_IDTE) __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL); - *entry = _SEGMENT_ENTRY_EMPTY; + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); } spin_unlock(&gmap->guest_table_lock); } @@ -2151,22 +2158,19 @@ EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local); */ void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) { - unsigned long *entry, gaddr; + unsigned long gaddr; struct gmap *gmap; pmd_t *pmdp; rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); - entry = radix_tree_delete(&gmap->host_to_guest, - vmaddr >> PMD_SHIFT); - if (entry) { - pmdp = (pmd_t *)entry; - gaddr = 
__gmap_segment_gaddr(entry); + pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr); + if (pmdp) { pmdp_notify_gmap(gmap, pmdp, gaddr); - WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC | - _SEGMENT_ENTRY)); + WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); if (MACHINE_HAS_TLB_GUEST) __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_GLOBAL); @@ -2174,7 +2178,7 @@ void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL); else __pmdp_csp(pmdp); - *entry = _SEGMENT_ENTRY_EMPTY; + *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY); } spin_unlock(&gmap->guest_table_lock); } @@ -2690,7 +2694,6 @@ int s390_replace_asce(struct gmap *gmap) page = gmap_alloc_crst(); if (!page) return -ENOMEM; - page->index = 0; table = page_to_virt(page); memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); -- GitLab From ef0c8ef8485d9629c6d042cea8f2082f159b467e Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Thu, 23 Jan 2025 15:46:23 +0100 Subject: [PATCH 091/989] KVM: s390: stop using lists to keep track of used dat tables Until now, every dat table allocated to map a guest was put in a linked list. The page->lru field of struct page was used to keep track of which pages were being used, and when the gmap is torn down, the list was walked and all pages freed. This patch gets rid of the usage of page->lru. Page tables are now freed by recursively walking the dat table tree. Since s390_unlist_old_asce() becomes useless now, remove it. Acked-by: Steffen Eiden Reviewed-by: Janosch Frank Reviewed-by: Christoph Schlameuss Link: https://lore.kernel.org/r/20250123144627.312456-12-imbrenda@linux.ibm.com Signed-off-by: Claudio Imbrenda Message-ID: <20250123144627.312456-12-imbrenda@linux.ibm.com> --- arch/s390/include/asm/gmap.h | 5 -- arch/s390/mm/gmap.c | 102 ++++++++--------------------------- arch/s390/mm/pgalloc.c | 2 - 3 files changed, 23 insertions(+), 86 deletions(-) diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index dbf2329281d20..b489c45896187 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -23,7 +23,6 @@ /** * struct gmap_struct - guest address space * @list: list head for the mm->context gmap list - * @crst_list: list of all crst tables used in the guest address space * @mm: pointer to the parent mm_struct * @guest_to_host: radix tree with guest to host address translation * @host_to_guest: radix tree with pointer to segment table entries @@ -35,7 +34,6 @@ * @guest_handle: protected virtual machine handle for the ultravisor * @host_to_rmap: radix tree with gmap_rmap lists * @children: list of shadow gmap structures - * @pt_list: list of all page tables used in the shadow guest address space * @shadow_lock: spinlock to protect the shadow gmap list * @parent: pointer to the parent gmap for shadow guest address spaces * @orig_asce: ASCE for which the shadow page table has been created @@ -45,7 +43,6 @@ */ struct gmap { struct list_head list; - struct list_head crst_list; struct mm_struct *mm; struct radix_tree_root guest_to_host; struct radix_tree_root host_to_guest; @@ -61,7 +58,6 @@ struct gmap { /* Additional data for shadow guest address spaces */ struct radix_tree_root host_to_rmap; struct list_head children; - struct list_head pt_list; spinlock_t shadow_lock; struct gmap *parent; unsigned long orig_asce; @@ -141,7 +137,6 @@ int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, 
int prot, unsigned void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4], unsigned long gaddr, unsigned long vmaddr); int s390_disable_cow_sharing(void); -void s390_unlist_old_asce(struct gmap *gmap); int s390_replace_asce(struct gmap *gmap); void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns); int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 1f83262a5a552..07df1a7b5ebe5 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -82,9 +82,7 @@ struct gmap *gmap_alloc(unsigned long limit) gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT); if (!gmap) goto out; - INIT_LIST_HEAD(&gmap->crst_list); INIT_LIST_HEAD(&gmap->children); - INIT_LIST_HEAD(&gmap->pt_list); INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT); INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT); INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT); @@ -94,7 +92,6 @@ struct gmap *gmap_alloc(unsigned long limit) page = gmap_alloc_crst(); if (!page) goto out_free; - list_add(&page->lru, &gmap->crst_list); table = page_to_virt(page); crst_table_init(table, etype); gmap->table = table; @@ -197,6 +194,27 @@ static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) } while (nr > 0); } +static void gmap_free_crst(unsigned long *table, bool free_ptes) +{ + bool is_segment = (table[0] & _SEGMENT_ENTRY_TYPE_MASK) == 0; + int i; + + if (is_segment) { + if (!free_ptes) + goto out; + for (i = 0; i < _CRST_ENTRIES; i++) + if (!(table[i] & _SEGMENT_ENTRY_INVALID)) + page_table_free_pgste(page_ptdesc(phys_to_page(table[i]))); + } else { + for (i = 0; i < _CRST_ENTRIES; i++) + if (!(table[i] & _REGION_ENTRY_INVALID)) + gmap_free_crst(__va(table[i] & PAGE_MASK), free_ptes); + } + +out: + free_pages((unsigned long)table, CRST_ALLOC_ORDER); +} + /** * gmap_free - free a guest address space * @gmap: pointer to the guest address space structure @@ -205,24 +223,17 @@ static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) */ void gmap_free(struct gmap *gmap) { - struct page *page, *next; - /* Flush tlb of all gmaps (if not already done for shadows) */ if (!(gmap_is_shadow(gmap) && gmap->removed)) gmap_flush_tlb(gmap); /* Free all segment & region tables. */ - list_for_each_entry_safe(page, next, &gmap->crst_list, lru) - __free_pages(page, CRST_ALLOC_ORDER); + gmap_free_crst(gmap->table, gmap_is_shadow(gmap)); + gmap_radix_tree_free(&gmap->guest_to_host); gmap_radix_tree_free(&gmap->host_to_guest); /* Free additional data for a shadow gmap */ if (gmap_is_shadow(gmap)) { - struct ptdesc *ptdesc, *n; - - /* Free all page tables. 
*/ - list_for_each_entry_safe(ptdesc, n, &gmap->pt_list, pt_list) - page_table_free_pgste(ptdesc); gmap_rmap_radix_tree_free(&gmap->host_to_rmap); /* Release reference to the parent */ gmap_put(gmap->parent); @@ -311,7 +322,6 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, crst_table_init(new, init); spin_lock(&gmap->guest_table_lock); if (*table & _REGION_ENTRY_INVALID) { - list_add(&page->lru, &gmap->crst_list); *table = __pa(new) | _REGION_ENTRY_LENGTH | (*table & _REGION_ENTRY_TYPE_MASK); page = NULL; @@ -1243,7 +1253,6 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) __gmap_unshadow_pgt(sg, raddr, __va(pgt)); /* Free page table */ ptdesc = page_ptdesc(phys_to_page(pgt)); - list_del(&ptdesc->pt_list); page_table_free_pgste(ptdesc); } @@ -1271,7 +1280,6 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, __gmap_unshadow_pgt(sg, raddr, __va(pgt)); /* Free page table */ ptdesc = page_ptdesc(phys_to_page(pgt)); - list_del(&ptdesc->pt_list); page_table_free_pgste(ptdesc); } } @@ -1301,7 +1309,6 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) __gmap_unshadow_sgt(sg, raddr, __va(sgt)); /* Free segment table */ page = phys_to_page(sgt); - list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1329,7 +1336,6 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, __gmap_unshadow_sgt(sg, raddr, __va(sgt)); /* Free segment table */ page = phys_to_page(sgt); - list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1359,7 +1365,6 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) __gmap_unshadow_r3t(sg, raddr, __va(r3t)); /* Free region 3 table */ page = phys_to_page(r3t); - list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1387,7 +1392,6 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, __gmap_unshadow_r3t(sg, raddr, __va(r3t)); /* Free region 3 table */ page = phys_to_page(r3t); - list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1417,7 +1421,6 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) __gmap_unshadow_r2t(sg, raddr, __va(r2t)); /* Free region 2 table */ page = phys_to_page(r2t); - list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1449,7 +1452,6 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, r1t[i] = _REGION1_ENTRY_EMPTY; /* Free region 2 table */ page = phys_to_page(r2t); - list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1544,7 +1546,6 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= (r2t & _REGION_ENTRY_PROTECT); - list_add(&page->lru, &sg->crst_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_REGION_ENTRY_INVALID; @@ -1628,7 +1629,6 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= (r3t & _REGION_ENTRY_PROTECT); - list_add(&page->lru, &sg->crst_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_REGION_ENTRY_INVALID; @@ -1712,7 +1712,6 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= sgt & _REGION_ENTRY_PROTECT; - list_add(&page->lru, &sg->crst_list); if (fake) { /* nothing to protect for fake tables */ *table &= 
~_REGION_ENTRY_INVALID; @@ -1833,7 +1832,6 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, /* mark as invalid as long as the parent table is not protected */ *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID; - list_add(&ptdesc->pt_list, &sg->pt_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_SEGMENT_ENTRY_INVALID; @@ -2623,49 +2621,6 @@ int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, } EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); -/** - * s390_unlist_old_asce - Remove the topmost level of page tables from the - * list of page tables of the gmap. - * @gmap: the gmap whose table is to be removed - * - * On s390x, KVM keeps a list of all pages containing the page tables of the - * gmap (the CRST list). This list is used at tear down time to free all - * pages that are now not needed anymore. - * - * This function removes the topmost page of the tree (the one pointed to by - * the ASCE) from the CRST list. - * - * This means that it will not be freed when the VM is torn down, and needs - * to be handled separately by the caller, unless a leak is actually - * intended. Notice that this function will only remove the page from the - * list, the page will still be used as a top level page table (and ASCE). - */ -void s390_unlist_old_asce(struct gmap *gmap) -{ - struct page *old; - - old = virt_to_page(gmap->table); - spin_lock(&gmap->guest_table_lock); - list_del(&old->lru); - /* - * Sometimes the topmost page might need to be "removed" multiple - * times, for example if the VM is rebooted into secure mode several - * times concurrently, or if s390_replace_asce fails after calling - * s390_remove_old_asce and is attempted again later. In that case - * the old asce has been removed from the list, and therefore it - * will not be freed when the VM terminates, but the ASCE is still - * in use and still pointed to. - * A subsequent call to replace_asce will follow the pointer and try - * to remove the same page from the list again. - * Therefore it's necessary that the page of the ASCE has valid - * pointers, so list_del can work (and do nothing) without - * dereferencing stale or invalid pointers. - */ - INIT_LIST_HEAD(&old->lru); - spin_unlock(&gmap->guest_table_lock); -} -EXPORT_SYMBOL_GPL(s390_unlist_old_asce); - /** * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy * @gmap: the gmap whose ASCE needs to be replaced @@ -2685,8 +2640,6 @@ int s390_replace_asce(struct gmap *gmap) struct page *page; void *table; - s390_unlist_old_asce(gmap); - /* Replacing segment type ASCEs would cause serious issues */ if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) return -EINVAL; @@ -2697,15 +2650,6 @@ int s390_replace_asce(struct gmap *gmap) table = page_to_virt(page); memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); - /* - * The caller has to deal with the old ASCE, but here we make sure - * the new one is properly added to the CRST list, so that - * it will be freed when the VM is torn down. 
- */ - spin_lock(&gmap->guest_table_lock); - list_add(&page->lru, &gmap->crst_list); - spin_unlock(&gmap->guest_table_lock); - /* Set new table origin while preserving existing ASCE control bits */ asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); WRITE_ONCE(gmap->asce, asce); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index cd2fef79ad2c7..30387a6e98ffd 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -176,8 +176,6 @@ unsigned long *page_table_alloc(struct mm_struct *mm) } table = ptdesc_to_virt(ptdesc); __arch_set_page_dat(table, 1); - /* pt_list is used by gmap only */ - INIT_LIST_HEAD(&ptdesc->pt_list); memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); return table; -- GitLab From 43656f774a4b4a2841035947e89dcde8ee136caa Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Thu, 23 Jan 2025 15:46:24 +0100 Subject: [PATCH 092/989] KVM: s390: move gmap_shadow_pgt_lookup() into kvm Move gmap_shadow_pgt_lookup() from mm/gmap.c into kvm/gaccess.c . Reviewed-by: Steffen Eiden Reviewed-by: Janosch Frank Reviewed-by: Christoph Schlameuss Link: https://lore.kernel.org/r/20250123144627.312456-13-imbrenda@linux.ibm.com Signed-off-by: Claudio Imbrenda Message-ID: <20250123144627.312456-13-imbrenda@linux.ibm.com> --- arch/s390/include/asm/gmap.h | 3 +-- arch/s390/kvm/gaccess.c | 42 +++++++++++++++++++++++++++++++- arch/s390/kvm/gmap.h | 2 ++ arch/s390/mm/gmap.c | 46 ++---------------------------------- 4 files changed, 46 insertions(+), 47 deletions(-) diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index b489c45896187..4e73ef46d4b2a 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -125,8 +125,6 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, int fake); int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, int fake); -int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, - unsigned long *pgt, int *dat_protection, int *fake); int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte); void gmap_register_pte_notifier(struct gmap_notifier *); @@ -142,6 +140,7 @@ void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns); int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, unsigned long end, bool interruptible); int kvm_s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio, bool split); +unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level); /** * s390_uv_destroy_range - Destroy a range of pages in the given mm. diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 9816b0060fbe5..bb1340389369c 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -16,6 +16,7 @@ #include #include #include "kvm-s390.h" +#include "gmap.h" #include "gaccess.h" /* @@ -1392,6 +1393,42 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, return 0; } +/** + * shadow_pgt_lookup() - find a shadow page table + * @sg: pointer to the shadow guest address space structure + * @saddr: the address in the shadow aguest address space + * @pgt: parent gmap address of the page table to get shadowed + * @dat_protection: if the pgtable is marked as protected by dat + * @fake: pgt references contiguous guest memory block, not a pgtable + * + * Returns 0 if the shadow page table was found and -EAGAIN if the page + * table was not found. + * + * Called with sg->mm->mmap_lock in read. 
+ */ +static int shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, unsigned long *pgt, + int *dat_protection, int *fake) +{ + unsigned long *table; + struct page *page; + int rc; + + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ + if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { + /* Shadow page tables are full pages (pte+pgste) */ + page = pfn_to_page(*table >> PAGE_SHIFT); + *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE; + *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); + *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE); + rc = 0; + } else { + rc = -EAGAIN; + } + spin_unlock(&sg->guest_table_lock); + return rc; +} + /** * kvm_s390_shadow_fault - handle fault on a shadow page table * @vcpu: virtual cpu @@ -1415,6 +1452,9 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, int dat_protection, fake; int rc; + if (KVM_BUG_ON(!gmap_is_shadow(sg), vcpu->kvm)) + return -EFAULT; + mmap_read_lock(sg->mm); /* * We don't want any guest-2 tables to change - so the parent @@ -1423,7 +1463,7 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, */ ipte_lock(vcpu->kvm); - rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); + rc = shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); if (rc) rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection, &fake); diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h index 978f541059f02..c8f031c9ea5f2 100644 --- a/arch/s390/kvm/gmap.h +++ b/arch/s390/kvm/gmap.h @@ -10,6 +10,8 @@ #ifndef ARCH_KVM_S390_GMAP_H #define ARCH_KVM_S390_GMAP_H +#define GMAP_SHADOW_FAKE_TABLE 1ULL + int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb); int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr); int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 07df1a7b5ebe5..918ea14515a1e 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -36,8 +36,6 @@ #define GMAP_SHADOW_FAKE_TABLE 1ULL -static inline unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level); - static struct page *gmap_alloc_crst(void) { struct page *page; @@ -738,8 +736,7 @@ static void gmap_call_notifier(struct gmap *gmap, unsigned long start, * * Note: Can also be called for shadow gmaps. */ -static inline unsigned long *gmap_table_walk(struct gmap *gmap, - unsigned long gaddr, int level) +unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level) { const int asce_type = gmap->asce & _ASCE_TYPE_MASK; unsigned long *table = gmap->table; @@ -790,6 +787,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, } return table; } +EXPORT_SYMBOL(gmap_table_walk); /** * gmap_pte_op_walk - walk the gmap page table, get the page table lock @@ -1744,46 +1742,6 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, } EXPORT_SYMBOL_GPL(gmap_shadow_sgt); -/** - * gmap_shadow_pgt_lookup - find a shadow page table - * @sg: pointer to the shadow guest address space structure - * @saddr: the address in the shadow aguest address space - * @pgt: parent gmap address of the page table to get shadowed - * @dat_protection: if the pgtable is marked as protected by dat - * @fake: pgt references contiguous guest memory block, not a pgtable - * - * Returns 0 if the shadow page table was found and -EAGAIN if the page - * table was not found. - * - * Called with sg->mm->mmap_lock in read. 
- */ -int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, - unsigned long *pgt, int *dat_protection, - int *fake) -{ - unsigned long *table; - struct page *page; - int rc; - - BUG_ON(!gmap_is_shadow(sg)); - spin_lock(&sg->guest_table_lock); - table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ - if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { - /* Shadow page tables are full pages (pte+pgste) */ - page = pfn_to_page(*table >> PAGE_SHIFT); - *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE; - *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); - *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE); - rc = 0; - } else { - rc = -EAGAIN; - } - spin_unlock(&sg->guest_table_lock); - return rc; - -} -EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup); - /** * gmap_shadow_pgt - instantiate a shadow page table * @sg: pointer to the shadow guest address space structure -- GitLab From c27e002626b9fbd2729fa00ddda789319648e7ba Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Thu, 23 Jan 2025 15:46:25 +0100 Subject: [PATCH 093/989] KVM: s390: remove useless page->index usage The page->index field for VSIE dat tables is only used for segment tables. Stop setting the field for all region tables. Reviewed-by: Janosch Frank Reviewed-by: Christoph Schlameuss Link: https://lore.kernel.org/r/20250123144627.312456-14-imbrenda@linux.ibm.com Signed-off-by: Claudio Imbrenda Message-ID: <20250123144627.312456-14-imbrenda@linux.ibm.com> --- arch/s390/mm/gmap.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 918ea14515a1e..38f0443217040 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1520,9 +1520,6 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, page = gmap_alloc_crst(); if (!page) return -ENOMEM; - page->index = r2t & _REGION_ENTRY_ORIGIN; - if (fake) - page->index |= GMAP_SHADOW_FAKE_TABLE; s_r2t = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); @@ -1603,9 +1600,6 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, page = gmap_alloc_crst(); if (!page) return -ENOMEM; - page->index = r3t & _REGION_ENTRY_ORIGIN; - if (fake) - page->index |= GMAP_SHADOW_FAKE_TABLE; s_r3t = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); @@ -1686,9 +1680,6 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, page = gmap_alloc_crst(); if (!page) return -ENOMEM; - page->index = sgt & _REGION_ENTRY_ORIGIN; - if (fake) - page->index |= GMAP_SHADOW_FAKE_TABLE; s_sgt = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); -- GitLab From 1f4389931e9fea7e8b3c1f189d505b040b25be8a Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Thu, 23 Jan 2025 15:46:26 +0100 Subject: [PATCH 094/989] KVM: s390: move PGSTE softbits Move the softbits in the PGSTEs to the other usable area. This leaves the 16-bit block of usable bits free, which will be used in the next patch for something else. 
Reviewed-by: Steffen Eiden Reviewed-by: Christoph Schlameuss Link: https://lore.kernel.org/r/20250123144627.312456-15-imbrenda@linux.ibm.com Signed-off-by: Claudio Imbrenda Message-ID: <20250123144627.312456-15-imbrenda@linux.ibm.com> --- arch/s390/include/asm/pgtable.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index a3b51056a1778..a96bde2e5f18d 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -420,9 +420,9 @@ void setup_protection_map(void); #define PGSTE_HC_BIT 0x0020000000000000UL #define PGSTE_GR_BIT 0x0004000000000000UL #define PGSTE_GC_BIT 0x0002000000000000UL -#define PGSTE_UC_BIT 0x0000800000000000UL /* user dirty (migration) */ -#define PGSTE_IN_BIT 0x0000400000000000UL /* IPTE notify bit */ -#define PGSTE_VSIE_BIT 0x0000200000000000UL /* ref'd in a shadow table */ +#define PGSTE_UC_BIT 0x0000000000008000UL /* user dirty (migration) */ +#define PGSTE_IN_BIT 0x0000000000004000UL /* IPTE notify bit */ +#define PGSTE_VSIE_BIT 0x0000000000002000UL /* ref'd in a shadow table */ /* Guest Page State used for virtualization */ #define _PGSTE_GPS_ZERO 0x0000000080000000UL -- GitLab From 84b7387692a8c849bd8bddd0f5c5474d4923aa6e Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Thu, 23 Jan 2025 15:46:27 +0100 Subject: [PATCH 095/989] KVM: s390: remove the last user of page->index Shadow page tables use page->index to keep the g2 address of the guest page table being shadowed. Instead of keeping the information in page->index, split the address and smear it over the 16-bit softbits areas of 4 PGSTEs. This removes the last s390 user of page->index. Reviewed-by: Steffen Eiden Reviewed-by: Christoph Schlameuss Link: https://lore.kernel.org/r/20250123144627.312456-16-imbrenda@linux.ibm.com Signed-off-by: Claudio Imbrenda Message-ID: <20250123144627.312456-16-imbrenda@linux.ibm.com> --- arch/s390/include/asm/pgtable.h | 15 +++++++++++++++ arch/s390/kvm/gaccess.c | 6 ++++-- arch/s390/mm/gmap.c | 22 ++++++++++++++++++++-- 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index a96bde2e5f18d..3ca5af4cfe432 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -420,6 +420,7 @@ void setup_protection_map(void); #define PGSTE_HC_BIT 0x0020000000000000UL #define PGSTE_GR_BIT 0x0004000000000000UL #define PGSTE_GC_BIT 0x0002000000000000UL +#define PGSTE_ST2_MASK 0x0000ffff00000000UL #define PGSTE_UC_BIT 0x0000000000008000UL /* user dirty (migration) */ #define PGSTE_IN_BIT 0x0000000000004000UL /* IPTE notify bit */ #define PGSTE_VSIE_BIT 0x0000000000002000UL /* ref'd in a shadow table */ @@ -2007,4 +2008,18 @@ extern void s390_reset_cmma(struct mm_struct *mm); #define pmd_pgtable(pmd) \ ((pgtable_t)__va(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE)) +static inline unsigned long gmap_pgste_get_pgt_addr(unsigned long *pgt) +{ + unsigned long *pgstes, res; + + pgstes = pgt + _PAGE_ENTRIES; + + res = (pgstes[0] & PGSTE_ST2_MASK) << 16; + res |= pgstes[1] & PGSTE_ST2_MASK; + res |= (pgstes[2] & PGSTE_ST2_MASK) >> 16; + res |= (pgstes[3] & PGSTE_ST2_MASK) >> 32; + + return res; +} + #endif /* _S390_PAGE_H */ diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index bb1340389369c..f6fded15633ad 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -1409,6 +1409,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, static 
int shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, unsigned long *pgt, int *dat_protection, int *fake) { + unsigned long pt_index; unsigned long *table; struct page *page; int rc; @@ -1418,9 +1419,10 @@ static int shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, unsigned long if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { /* Shadow page tables are full pages (pte+pgste) */ page = pfn_to_page(*table >> PAGE_SHIFT); - *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE; + pt_index = gmap_pgste_get_pgt_addr(page_to_virt(page)); + *pgt = pt_index & ~GMAP_SHADOW_FAKE_TABLE; *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); - *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE); + *fake = !!(pt_index & GMAP_SHADOW_FAKE_TABLE); rc = 0; } else { rc = -EAGAIN; diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 38f0443217040..94d9277858009 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1733,6 +1733,23 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, } EXPORT_SYMBOL_GPL(gmap_shadow_sgt); +static void gmap_pgste_set_pgt_addr(struct ptdesc *ptdesc, unsigned long pgt_addr) +{ + unsigned long *pgstes = page_to_virt(ptdesc_page(ptdesc)); + + pgstes += _PAGE_ENTRIES; + + pgstes[0] &= ~PGSTE_ST2_MASK; + pgstes[1] &= ~PGSTE_ST2_MASK; + pgstes[2] &= ~PGSTE_ST2_MASK; + pgstes[3] &= ~PGSTE_ST2_MASK; + + pgstes[0] |= (pgt_addr >> 16) & PGSTE_ST2_MASK; + pgstes[1] |= pgt_addr & PGSTE_ST2_MASK; + pgstes[2] |= (pgt_addr << 16) & PGSTE_ST2_MASK; + pgstes[3] |= (pgt_addr << 32) & PGSTE_ST2_MASK; +} + /** * gmap_shadow_pgt - instantiate a shadow page table * @sg: pointer to the shadow guest address space structure @@ -1760,9 +1777,10 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, ptdesc = page_table_alloc_pgste(sg->mm); if (!ptdesc) return -ENOMEM; - ptdesc->pt_index = pgt & _SEGMENT_ENTRY_ORIGIN; + origin = pgt & _SEGMENT_ENTRY_ORIGIN; if (fake) - ptdesc->pt_index |= GMAP_SHADOW_FAKE_TABLE; + origin |= GMAP_SHADOW_FAKE_TABLE; + gmap_pgste_set_pgt_addr(ptdesc, origin); s_pgt = page_to_phys(ptdesc_page(ptdesc)); /* Install shadow page table */ spin_lock(&sg->guest_table_lock); -- GitLab From 32239066776a27287837a193b37c6e55259e5c10 Mon Sep 17 00:00:00 2001 From: Christoph Schlameuss Date: Tue, 28 Jan 2025 14:18:03 +0100 Subject: [PATCH 096/989] KVM: s390: selftests: Streamline uc_skey test to issue iske after sske In some rare situations a non default storage key is already set on the memory used by the test. Within normal VMs the key is reset / zapped when the memory is added to the VM. This is not the case for ucontrol VMs. With the initial iske check removed this test case can work in all situations. The function of the iske instruction is still validated by the remaining code. 
Fixes: 0185fbc6a2d3 ("KVM: s390: selftests: Add uc_skey VM test case") Signed-off-by: Christoph Schlameuss Link: https://lore.kernel.org/r/20250128131803.1047388-1-schlameuss@linux.ibm.com Message-ID: <20250128131803.1047388-1-schlameuss@linux.ibm.com> Signed-off-by: Claudio Imbrenda --- .../selftests/kvm/s390/ucontrol_test.c | 24 +++++-------------- 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/kvm/s390/ucontrol_test.c b/tools/testing/selftests/kvm/s390/ucontrol_test.c index 22ce9219620ce..d265b34c54be8 100644 --- a/tools/testing/selftests/kvm/s390/ucontrol_test.c +++ b/tools/testing/selftests/kvm/s390/ucontrol_test.c @@ -88,10 +88,6 @@ asm("test_skey_asm:\n" " ahi %r0,1\n" " st %r1,0(%r5,%r6)\n" - " iske %r1,%r6\n" - " ahi %r0,1\n" - " diag 0,0,0x44\n" - " sske %r1,%r6\n" " xgr %r1,%r1\n" " iske %r1,%r6\n" @@ -600,7 +596,9 @@ TEST_F(uc_kvm, uc_skey) ASSERT_EQ(true, uc_handle_exit(self)); ASSERT_EQ(1, sync_regs->gprs[0]); - /* ISKE */ + /* SSKE + ISKE */ + sync_regs->gprs[1] = skeyvalue; + run->kvm_dirty_regs |= KVM_SYNC_GPRS; ASSERT_EQ(0, uc_run_once(self)); /* @@ -612,21 +610,11 @@ TEST_F(uc_kvm, uc_skey) TEST_ASSERT_EQ(0, sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)); TEST_ASSERT_EQ(KVM_EXIT_S390_SIEIC, self->run->exit_reason); TEST_ASSERT_EQ(ICPT_INST, sie_block->icptcode); - TEST_REQUIRE(sie_block->ipa != 0xb229); + TEST_REQUIRE(sie_block->ipa != 0xb22b); - /* ISKE contd. */ + /* SSKE + ISKE contd. */ ASSERT_EQ(false, uc_handle_exit(self)); ASSERT_EQ(2, sync_regs->gprs[0]); - /* assert initial skey (ACC = 0, R & C = 1) */ - ASSERT_EQ(0x06, sync_regs->gprs[1]); - uc_assert_diag44(self); - - /* SSKE + ISKE */ - sync_regs->gprs[1] = skeyvalue; - run->kvm_dirty_regs |= KVM_SYNC_GPRS; - ASSERT_EQ(0, uc_run_once(self)); - ASSERT_EQ(false, uc_handle_exit(self)); - ASSERT_EQ(3, sync_regs->gprs[0]); ASSERT_EQ(skeyvalue, sync_regs->gprs[1]); uc_assert_diag44(self); @@ -635,7 +623,7 @@ TEST_F(uc_kvm, uc_skey) run->kvm_dirty_regs |= KVM_SYNC_GPRS; ASSERT_EQ(0, uc_run_once(self)); ASSERT_EQ(false, uc_handle_exit(self)); - ASSERT_EQ(4, sync_regs->gprs[0]); + ASSERT_EQ(3, sync_regs->gprs[0]); /* assert R reset but rest of skey unchanged */ ASSERT_EQ(skeyvalue & 0xfa, sync_regs->gprs[1]); ASSERT_EQ(0, sync_regs->gprs[1] & 0x04); -- GitLab From 6daaae5ff7f3b23a2dacc9c387ff3d4f95b67cad Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 29 Jan 2025 10:51:48 +0100 Subject: [PATCH 097/989] gpu: drm_dp_cec: fix broken CEC adapter properties check If the hotplug detect of a display is low for longer than one second (configurable through drm_dp_cec_unregister_delay), then the CEC adapter is unregistered since we assume the display was disconnected. If the HPD went low for less than one second, then we check if the properties of the CEC adapter have changed, since that indicates that we actually switch to new hardware and we have to unregister the old CEC device and register a new one. Unfortunately, the test for changed properties was written poorly, and after a new CEC capability was added to the CEC core code the test always returned true (i.e. the properties had changed). As a result the CEC device was unregistered and re-registered for every HPD toggle. If the CEC remote controller integration was also enabled (CONFIG_MEDIA_CEC_RC was set), then the corresponding input device was also unregistered and re-registered. 
As a result the input device in /sys would keep incrementing its number, e.g.: /sys/devices/pci0000:00/0000:00:08.1/0000:e7:00.0/rc/rc0/input20 Since short HPD toggles are common, the number could over time get into the thousands. While not a serious issue (i.e. nothing crashes), it is not intended to work that way. This patch changes the test so that it only checks for the single CEC capability that can actually change, and it ignores any other capabilities, so this is now safe as well if new caps are added in the future. With the changed test the bit under #ifndef CONFIG_MEDIA_CEC_RC can be dropped as well, so that's a nice cleanup. Signed-off-by: Hans Verkuil Reported-by: Farblos Reviewed-by: Dmitry Baryshkov Fixes: 2c6d1fffa1d9 ("drm: add support for DisplayPort CEC-Tunneling-over-AUX") Tested-by: Farblos Link: https://patchwork.freedesktop.org/patch/msgid/361bb03d-1691-4e23-84da-0861ead5dbdc@xs4all.nl Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/display/drm_dp_cec.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/display/drm_dp_cec.c b/drivers/gpu/drm/display/drm_dp_cec.c index 007ceb281d00d..56a4965e518cc 100644 --- a/drivers/gpu/drm/display/drm_dp_cec.c +++ b/drivers/gpu/drm/display/drm_dp_cec.c @@ -311,16 +311,6 @@ void drm_dp_cec_attach(struct drm_dp_aux *aux, u16 source_physical_address) if (!aux->transfer) return; -#ifndef CONFIG_MEDIA_CEC_RC - /* - * CEC_CAP_RC is part of CEC_CAP_DEFAULTS, but it is stripped by - * cec_allocate_adapter() if CONFIG_MEDIA_CEC_RC is undefined. - * - * Do this here as well to ensure the tests against cec_caps are - * correct. - */ - cec_caps &= ~CEC_CAP_RC; -#endif cancel_delayed_work_sync(&aux->cec.unregister_work); mutex_lock(&aux->cec.lock); @@ -337,7 +327,9 @@ void drm_dp_cec_attach(struct drm_dp_aux *aux, u16 source_physical_address) num_las = CEC_MAX_LOG_ADDRS; if (aux->cec.adap) { - if (aux->cec.adap->capabilities == cec_caps && + /* Check if the adapter properties have changed */ + if ((aux->cec.adap->capabilities & CEC_CAP_MONITOR_ALL) == + (cec_caps & CEC_CAP_MONITOR_ALL) && aux->cec.adap->available_log_addrs == num_las) { /* Unchanged, so just set the phys addr */ cec_s_phys_addr(aux->cec.adap, source_physical_address, false); -- GitLab From 743bbd93cf29f653fae0e1416a31f03231689911 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 23 Jan 2025 16:01:16 +0100 Subject: [PATCH 098/989] ice: put Rx buffers after being done with current frame Introduce a new helper ice_put_rx_mbuf() that will go through gathered frags from current frame and will call ice_put_rx_buf() on them. Current logic that was supposed to simplify and optimize the driver where we go through a batch of all buffers processed in current NAPI instance turned out to be broken for jumbo frames and very heavy load that was coming from both multi-thread iperf and nginx/wrk pair between server and client. The delay introduced by approach that we are dropping is simply too big and we need to take the decision regarding page recycling/releasing as quick as we can. While at it, address an error path of ice_add_xdp_frag() - we were missing buffer putting from day 1 there. As a nice side effect we get rid of annoying and repetitive three-liner: xdp->data = NULL; rx_ring->first_desc = ntc; rx_ring->nr_frags = 0; by embedding it within introduced routine. 
Fixes: 1dc1a7e7f410 ("ice: Centrallize Rx buffer recycling") Reported-and-tested-by: Xu Du Reviewed-by: Przemek Kitszel Reviewed-by: Simon Horman Co-developed-by: Jacob Keller Signed-off-by: Jacob Keller Signed-off-by: Maciej Fijalkowski Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_txrx.c | 79 ++++++++++++++--------- 1 file changed, 50 insertions(+), 29 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 5d2d7736fd5f1..e173d9c989883 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -1103,6 +1103,49 @@ ice_put_rx_buf(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf) rx_buf->page = NULL; } +/** + * ice_put_rx_mbuf - ice_put_rx_buf() caller, for all frame frags + * @rx_ring: Rx ring with all the auxiliary data + * @xdp: XDP buffer carrying linear + frags part + * @xdp_xmit: XDP_TX/XDP_REDIRECT verdict storage + * @ntc: a current next_to_clean value to be stored at rx_ring + * + * Walk through gathered fragments and satisfy internal page + * recycle mechanism; we take here an action related to verdict + * returned by XDP program; + */ +static void ice_put_rx_mbuf(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, + u32 *xdp_xmit, u32 ntc) +{ + u32 nr_frags = rx_ring->nr_frags + 1; + u32 idx = rx_ring->first_desc; + u32 cnt = rx_ring->count; + struct ice_rx_buf *buf; + int i; + + for (i = 0; i < nr_frags; i++) { + buf = &rx_ring->rx_buf[idx]; + + if (buf->act & (ICE_XDP_TX | ICE_XDP_REDIR)) { + ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz); + *xdp_xmit |= buf->act; + } else if (buf->act & ICE_XDP_CONSUMED) { + buf->pagecnt_bias++; + } else if (buf->act == ICE_XDP_PASS) { + ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz); + } + + ice_put_rx_buf(rx_ring, buf); + + if (++idx == cnt) + idx = 0; + } + + xdp->data = NULL; + rx_ring->first_desc = ntc; + rx_ring->nr_frags = 0; +} + /** * ice_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf * @rx_ring: Rx descriptor ring to transact packets on @@ -1120,7 +1163,6 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) unsigned int total_rx_bytes = 0, total_rx_pkts = 0; unsigned int offset = rx_ring->rx_offset; struct xdp_buff *xdp = &rx_ring->xdp; - u32 cached_ntc = rx_ring->first_desc; struct ice_tx_ring *xdp_ring = NULL; struct bpf_prog *xdp_prog = NULL; u32 ntc = rx_ring->next_to_clean; @@ -1128,7 +1170,6 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) u32 xdp_xmit = 0; u32 cached_ntu; bool failure; - u32 first; xdp_prog = READ_ONCE(rx_ring->xdp_prog); if (xdp_prog) { @@ -1190,6 +1231,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) xdp_prepare_buff(xdp, hard_start, offset, size, !!offset); xdp_buff_clear_frags_flag(xdp); } else if (ice_add_xdp_frag(rx_ring, xdp, rx_buf, size)) { + ice_put_rx_mbuf(rx_ring, xdp, NULL, ntc); break; } if (++ntc == cnt) @@ -1205,9 +1247,8 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) total_rx_bytes += xdp_get_buff_len(xdp); total_rx_pkts++; - xdp->data = NULL; - rx_ring->first_desc = ntc; - rx_ring->nr_frags = 0; + ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc); + continue; construct_skb: if (likely(ice_ring_uses_build_skb(rx_ring))) @@ -1221,14 +1262,11 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) if (unlikely(xdp_buff_has_frags(xdp))) ice_set_rx_bufs_act(xdp, rx_ring, ICE_XDP_CONSUMED); - xdp->data = NULL; - 
rx_ring->first_desc = ntc; - rx_ring->nr_frags = 0; - break; } - xdp->data = NULL; - rx_ring->first_desc = ntc; - rx_ring->nr_frags = 0; + ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc); + + if (!skb) + break; stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S); if (unlikely(ice_test_staterr(rx_desc->wb.status_error0, @@ -1257,23 +1295,6 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) total_rx_pkts++; } - first = rx_ring->first_desc; - while (cached_ntc != first) { - struct ice_rx_buf *buf = &rx_ring->rx_buf[cached_ntc]; - - if (buf->act & (ICE_XDP_TX | ICE_XDP_REDIR)) { - ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz); - xdp_xmit |= buf->act; - } else if (buf->act & ICE_XDP_CONSUMED) { - buf->pagecnt_bias++; - } else if (buf->act == ICE_XDP_PASS) { - ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz); - } - - ice_put_rx_buf(rx_ring, buf); - if (++cached_ntc >= cnt) - cached_ntc = 0; - } rx_ring->next_to_clean = ntc; /* return up to cleaned_count buffers to hardware */ failure = ice_alloc_rx_bufs(rx_ring, ICE_RX_DESC_UNUSED(rx_ring)); -- GitLab From 11c4aa074d547d825b19cd8d9f288254d89d805c Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 23 Jan 2025 16:01:17 +0100 Subject: [PATCH 099/989] ice: gather page_count()'s of each frag right before XDP prog call If we store the pgcnt on few fragments while being in the middle of gathering the whole frame and we stumbled upon DD bit not being set, we terminate the NAPI Rx processing loop and come back later on. Then on next NAPI execution we work on previously stored pgcnt. Imagine that second half of page was used actively by networking stack and by the time we came back, stack is not busy with this page anymore and decremented the refcnt. The page reuse algorithm in this case should be good to reuse the page but given the old refcnt it will not do so and attempt to release the page via page_frag_cache_drain() with pagecnt_bias used as an arg. This in turn will result in negative refcnt on struct page, which was initially observed by Xu Du. Therefore, move the page count storage from ice_get_rx_buf() to a place where we are sure that whole frame has been collected, but before calling XDP program as it internally can also change the page count of fragments belonging to xdp_buff. 
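For illustration of the reuse heuristic this refers to, the sketch below shows the kind of check that consumes the snapshot; it is a simplified, hypothetical helper rather than the ice driver's exact code, although the pgcnt and pagecnt_bias fields do exist on struct ice_rx_buf.

/*
 * The reuse decision compares a page_count() snapshot against the bias
 * the driver holds. If the snapshot is stale (taken in an earlier NAPI
 * poll, before the stack dropped its reference), the page looks shared,
 * reuse is refused, and draining the page with the stored bias drives
 * its refcount negative -- hence taking the snapshot right before the
 * XDP program runs.
 */
static bool example_can_reuse_page(const struct ice_rx_buf *rx_buf)
{
	return (rx_buf->pgcnt - rx_buf->pagecnt_bias) <= 1;
}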
Fixes: ac0753391195 ("ice: Store page count inside ice_rx_buf") Reported-and-tested-by: Xu Du Reviewed-by: Przemek Kitszel Reviewed-by: Simon Horman Co-developed-by: Jacob Keller Signed-off-by: Jacob Keller Signed-off-by: Maciej Fijalkowski Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_txrx.c | 27 ++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index e173d9c989883..cf46bcf143b4b 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -924,7 +924,6 @@ ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size, struct ice_rx_buf *rx_buf; rx_buf = &rx_ring->rx_buf[ntc]; - rx_buf->pgcnt = page_count(rx_buf->page); prefetchw(rx_buf->page); if (!size) @@ -940,6 +939,31 @@ ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size, return rx_buf; } +/** + * ice_get_pgcnts - grab page_count() for gathered fragments + * @rx_ring: Rx descriptor ring to store the page counts on + * + * This function is intended to be called right before running XDP + * program so that the page recycling mechanism will be able to take + * a correct decision regarding underlying pages; this is done in such + * way as XDP program can change the refcount of page + */ +static void ice_get_pgcnts(struct ice_rx_ring *rx_ring) +{ + u32 nr_frags = rx_ring->nr_frags + 1; + u32 idx = rx_ring->first_desc; + struct ice_rx_buf *rx_buf; + u32 cnt = rx_ring->count; + + for (int i = 0; i < nr_frags; i++) { + rx_buf = &rx_ring->rx_buf[idx]; + rx_buf->pgcnt = page_count(rx_buf->page); + + if (++idx == cnt) + idx = 0; + } +} + /** * ice_build_skb - Build skb around an existing buffer * @rx_ring: Rx descriptor ring to transact packets on @@ -1241,6 +1265,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) if (ice_is_non_eop(rx_ring, rx_desc)) continue; + ice_get_pgcnts(rx_ring); ice_run_xdp(rx_ring, xdp, xdp_prog, xdp_ring, rx_buf, rx_desc); if (rx_buf->act == ICE_XDP_PASS) goto construct_skb; -- GitLab From 468a1952df78f65c5991b7ac885c8b5b7dd87bab Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 23 Jan 2025 16:01:18 +0100 Subject: [PATCH 100/989] ice: stop storing XDP verdict within ice_rx_buf Idea behind having ice_rx_buf::act was to simplify and speed up the Rx data path by walking through buffers that were representing cleaned HW Rx descriptors. Since it caused us a major headache recently and we rolled back to old approach that 'puts' Rx buffers right after running XDP prog/creating skb, this is useless now and should be removed. Get rid of ice_rx_buf::act and related logic. We still need to take care of a corner case where XDP program releases a particular fragment. Make ice_run_xdp() to return its result and use it within ice_put_rx_mbuf(). 
Fixes: 2fba7dc5157b ("ice: Add support for XDP multi-buffer on Rx side") Reviewed-by: Przemek Kitszel Reviewed-by: Simon Horman Signed-off-by: Maciej Fijalkowski Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_txrx.c | 62 +++++++++++-------- drivers/net/ethernet/intel/ice/ice_txrx.h | 1 - drivers/net/ethernet/intel/ice/ice_txrx_lib.h | 43 ------------- 3 files changed, 36 insertions(+), 70 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index cf46bcf143b4b..9c9ea4c1b93b7 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -527,15 +527,14 @@ int ice_setup_rx_ring(struct ice_rx_ring *rx_ring) * @xdp: xdp_buff used as input to the XDP program * @xdp_prog: XDP program to run * @xdp_ring: ring to be used for XDP_TX action - * @rx_buf: Rx buffer to store the XDP action * @eop_desc: Last descriptor in packet to read metadata from * * Returns any of ICE_XDP_{PASS, CONSUMED, TX, REDIR} */ -static void +static u32 ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring, - struct ice_rx_buf *rx_buf, union ice_32b_rx_flex_desc *eop_desc) + union ice_32b_rx_flex_desc *eop_desc) { unsigned int ret = ICE_XDP_PASS; u32 act; @@ -574,7 +573,7 @@ ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, ret = ICE_XDP_CONSUMED; } exit: - ice_set_rx_bufs_act(xdp, rx_ring, ret); + return ret; } /** @@ -860,10 +859,8 @@ ice_add_xdp_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, xdp_buff_set_frags_flag(xdp); } - if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) { - ice_set_rx_bufs_act(xdp, rx_ring, ICE_XDP_CONSUMED); + if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) return -ENOMEM; - } __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, rx_buf->page, rx_buf->page_offset, size); @@ -1075,12 +1072,12 @@ ice_construct_skb(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp) rx_buf->page_offset + headlen, size, xdp->frame_sz); } else { - /* buffer is unused, change the act that should be taken later - * on; data was copied onto skb's linear part so there's no + /* buffer is unused, restore biased page count in Rx buffer; + * data was copied onto skb's linear part so there's no * need for adjusting page offset and we can reuse this buffer * as-is */ - rx_buf->act = ICE_SKB_CONSUMED; + rx_buf->pagecnt_bias++; } if (unlikely(xdp_buff_has_frags(xdp))) { @@ -1133,29 +1130,34 @@ ice_put_rx_buf(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf) * @xdp: XDP buffer carrying linear + frags part * @xdp_xmit: XDP_TX/XDP_REDIRECT verdict storage * @ntc: a current next_to_clean value to be stored at rx_ring + * @verdict: return code from XDP program execution * * Walk through gathered fragments and satisfy internal page * recycle mechanism; we take here an action related to verdict * returned by XDP program; */ static void ice_put_rx_mbuf(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, - u32 *xdp_xmit, u32 ntc) + u32 *xdp_xmit, u32 ntc, u32 verdict) { u32 nr_frags = rx_ring->nr_frags + 1; u32 idx = rx_ring->first_desc; u32 cnt = rx_ring->count; + u32 post_xdp_frags = 1; struct ice_rx_buf *buf; int i; - for (i = 0; i < nr_frags; i++) { + if (unlikely(xdp_buff_has_frags(xdp))) + post_xdp_frags += xdp_get_shared_info_from_buff(xdp)->nr_frags; + + for (i = 0; i < post_xdp_frags; i++) { buf = &rx_ring->rx_buf[idx]; - if (buf->act & (ICE_XDP_TX | ICE_XDP_REDIR)) 
{ + if (verdict & (ICE_XDP_TX | ICE_XDP_REDIR)) { ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz); - *xdp_xmit |= buf->act; - } else if (buf->act & ICE_XDP_CONSUMED) { + *xdp_xmit |= verdict; + } else if (verdict & ICE_XDP_CONSUMED) { buf->pagecnt_bias++; - } else if (buf->act == ICE_XDP_PASS) { + } else if (verdict == ICE_XDP_PASS) { ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz); } @@ -1164,6 +1166,17 @@ static void ice_put_rx_mbuf(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, if (++idx == cnt) idx = 0; } + /* handle buffers that represented frags released by XDP prog; + * for these we keep pagecnt_bias as-is; refcount from struct page + * has been decremented within XDP prog and we do not have to increase + * the biased refcnt + */ + for (; i < nr_frags; i++) { + buf = &rx_ring->rx_buf[idx]; + ice_put_rx_buf(rx_ring, buf); + if (++idx == cnt) + idx = 0; + } xdp->data = NULL; rx_ring->first_desc = ntc; @@ -1190,9 +1203,9 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) struct ice_tx_ring *xdp_ring = NULL; struct bpf_prog *xdp_prog = NULL; u32 ntc = rx_ring->next_to_clean; + u32 cached_ntu, xdp_verdict; u32 cnt = rx_ring->count; u32 xdp_xmit = 0; - u32 cached_ntu; bool failure; xdp_prog = READ_ONCE(rx_ring->xdp_prog); @@ -1255,7 +1268,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) xdp_prepare_buff(xdp, hard_start, offset, size, !!offset); xdp_buff_clear_frags_flag(xdp); } else if (ice_add_xdp_frag(rx_ring, xdp, rx_buf, size)) { - ice_put_rx_mbuf(rx_ring, xdp, NULL, ntc); + ice_put_rx_mbuf(rx_ring, xdp, NULL, ntc, ICE_XDP_CONSUMED); break; } if (++ntc == cnt) @@ -1266,13 +1279,13 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) continue; ice_get_pgcnts(rx_ring); - ice_run_xdp(rx_ring, xdp, xdp_prog, xdp_ring, rx_buf, rx_desc); - if (rx_buf->act == ICE_XDP_PASS) + xdp_verdict = ice_run_xdp(rx_ring, xdp, xdp_prog, xdp_ring, rx_desc); + if (xdp_verdict == ICE_XDP_PASS) goto construct_skb; total_rx_bytes += xdp_get_buff_len(xdp); total_rx_pkts++; - ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc); + ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc, xdp_verdict); continue; construct_skb: @@ -1283,12 +1296,9 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) /* exit if we failed to retrieve a buffer */ if (!skb) { rx_ring->ring_stats->rx_stats.alloc_page_failed++; - rx_buf->act = ICE_XDP_CONSUMED; - if (unlikely(xdp_buff_has_frags(xdp))) - ice_set_rx_bufs_act(xdp, rx_ring, - ICE_XDP_CONSUMED); + xdp_verdict = ICE_XDP_CONSUMED; } - ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc); + ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc, xdp_verdict); if (!skb) break; diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h index cb347c852ba9e..806bce701df34 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h @@ -201,7 +201,6 @@ struct ice_rx_buf { struct page *page; unsigned int page_offset; unsigned int pgcnt; - unsigned int act; unsigned int pagecnt_bias; }; diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h index 79f960c6680d1..6cf32b4041275 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h @@ -5,49 +5,6 @@ #define _ICE_TXRX_LIB_H_ #include "ice.h" -/** - * ice_set_rx_bufs_act - propagate Rx buffer action to frags - * @xdp: XDP buffer representing frame (linear and frags part) - * @rx_ring: Rx ring struct - * act: action to store onto 
Rx buffers related to XDP buffer parts - * - * Set action that should be taken before putting Rx buffer from first frag - * to the last. - */ -static inline void -ice_set_rx_bufs_act(struct xdp_buff *xdp, const struct ice_rx_ring *rx_ring, - const unsigned int act) -{ - u32 sinfo_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags; - u32 nr_frags = rx_ring->nr_frags + 1; - u32 idx = rx_ring->first_desc; - u32 cnt = rx_ring->count; - struct ice_rx_buf *buf; - - for (int i = 0; i < nr_frags; i++) { - buf = &rx_ring->rx_buf[idx]; - buf->act = act; - - if (++idx == cnt) - idx = 0; - } - - /* adjust pagecnt_bias on frags freed by XDP prog */ - if (sinfo_frags < rx_ring->nr_frags && act == ICE_XDP_CONSUMED) { - u32 delta = rx_ring->nr_frags - sinfo_frags; - - while (delta) { - if (idx == 0) - idx = cnt - 1; - else - idx--; - buf = &rx_ring->rx_buf[idx]; - buf->pagecnt_bias--; - delta--; - } - } -} - /** * ice_test_staterr - tests bits in Rx descriptor status and error fields * @status_err_n: Rx descriptor status_error0 or status_error1 bits -- GitLab From 2d1a2dab95cdc6f2e0c6af3c0514b0bea94af482 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 28 Jan 2025 07:22:31 -0800 Subject: [PATCH 101/989] nvme: make nvme_tls_attrs_group static To suppress the compiler "warning: symbol 'nvme_tls_attrs_group' was not declared. Should it be static?" Fixes: 1e48b34c9bc79a ("nvme: split off TLS sysfs attributes into a separate group") Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index b68a9e5f1ea39..3a41b9ab0f13c 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -792,7 +792,7 @@ static umode_t nvme_tls_attrs_are_visible(struct kobject *kobj, return a->mode; } -const struct attribute_group nvme_tls_attrs_group = { +static const struct attribute_group nvme_tls_attrs_group = { .attrs = nvme_tls_attrs, .is_visible = nvme_tls_attrs_are_visible, }; -- GitLab From c8ed6cb5d37bc09c7e25e49a670e9fd1a3bd1dfa Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Tue, 28 Jan 2025 17:34:47 +0100 Subject: [PATCH 102/989] nvme-fc: use ctrl state getter Do not access the state variable directly, instead use proper synchronization so not stale data is read. 
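For context, the getter referred to here is nvme_ctrl_state(); a sketch of the accessor pattern, assuming it boils down to a single annotated load of ctrl->state (illustrative, not a quote of the nvme header):

/*
 * A single READ_ONCE() of the state avoids torn or re-read values while
 * a reset/connect transition is changing the state concurrently.
 */
static inline enum nvme_ctrl_state example_ctrl_state(struct nvme_ctrl *ctrl)
{
	return READ_ONCE(ctrl->state);
}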
Fixes: e6e7f7ac03e4 ("nvme: ensure reset state check ordering") Signed-off-by: Daniel Wagner Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/fc.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 55884d3df6f29..f4f1866fbd5b8 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2087,7 +2087,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) nvme_fc_complete_rq(rq); check_error: - if (terminate_assoc && ctrl->ctrl.state != NVME_CTRL_RESETTING) + if (terminate_assoc && + nvme_ctrl_state(&ctrl->ctrl) != NVME_CTRL_RESETTING) queue_work(nvme_reset_wq, &ctrl->ioerr_work); } @@ -2541,6 +2542,8 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) { + enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl); + /* * if an error (io timeout, etc) while (re)connecting, the remote * port requested terminating of the association (disconnect_ls) @@ -2548,7 +2551,7 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) * the controller. Abort any ios on the association and let the * create_association error path resolve things. */ - if (ctrl->ctrl.state == NVME_CTRL_CONNECTING) { + if (state == NVME_CTRL_CONNECTING) { __nvme_fc_abort_outstanding_ios(ctrl, true); dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: transport error during (re)connect\n", @@ -2557,7 +2560,7 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) } /* Otherwise, only proceed if in LIVE state - e.g. on first error */ - if (ctrl->ctrl.state != NVME_CTRL_LIVE) + if (state != NVME_CTRL_LIVE) return; dev_warn(ctrl->ctrl.device, -- GitLab From a572593ac80e51eb69ecede7e614289fcccdbf8d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 29 Jan 2025 14:56:35 -0800 Subject: [PATCH 103/989] md: Fix linear_set_limits() queue_limits_cancel_update() must only be called if queue_limits_start_update() is called first. Remove the queue_limits_cancel_update() call from linear_set_limits() because there is no corresponding queue_limits_start_update() call. This bug was discovered by annotating all mutex operations with clang thread-safety attributes and by building the kernel with clang and -Wthread-safety. 
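To make the pairing rule concrete, here is a minimal illustrative sketch (a hypothetical caller, not the md-linear code), assuming queue_limits_start_update() takes the queue's limits lock and commit/cancel are the operations that release it -- which is why an unpaired cancel is a bug:

static int example_update_limits(struct request_queue *q, unsigned int new_max)
{
	struct queue_limits lim = queue_limits_start_update(q);	/* takes the limits lock */

	lim.max_hw_sectors = new_max;
	if (!new_max) {					/* stand-in for a validation failure */
		queue_limits_cancel_update(q);		/* unlocks, discards the update */
		return -EINVAL;
	}
	return queue_limits_commit_update(q, &lim);	/* applies and unlocks */
}

linear_set_limits() instead builds its limits locally and applies them through queue_limits_set(), which handles the locking internally, so there is no start_update in that path for a cancel to pair with.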
Cc: Yu Kuai Cc: Coly Li Cc: Mike Snitzer Cc: Christoph Hellwig Fixes: 127186cfb184 ("md: reintroduce md-linear") Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250129225636.2667932-1-bvanassche@acm.org Signed-off-by: Song Liu --- drivers/md/md-linear.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index a382929ce7bab..369aed044b409 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -76,10 +76,8 @@ static int linear_set_limits(struct mddev *mddev) lim.max_write_zeroes_sectors = mddev->chunk_sectors; lim.io_min = mddev->chunk_sectors << 9; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); - if (err) { - queue_limits_cancel_update(mddev->gendisk->queue); + if (err) return err; - } return queue_limits_set(mddev->gendisk->queue, &lim); } -- GitLab From 0f1a6c5c9784eff7e31e4915e17285fb89ad3644 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 31 Jan 2025 22:29:24 +0000 Subject: [PATCH 104/989] KVM: arm64: Flush/sync debug state in protected mode The recent changes to debug state management broke self-hosted debug for guests when running in protected mode, since both the debug owner and the debug state itself aren't shared with the hyp's view of the vcpu. Fix it by flushing/syncing the relevant bits with the hyp vcpu. Fixes: beb470d96cec ("KVM: arm64: Use debug_owner to track if debug regs need save/restore") Reported-by: Mark Brown Closes: https://lore.kernel.org/kvmarm/5f62740f-a065-42d9-9f56-8fb648b9c63f@sirena.org.uk/ Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20250131222922.1548780-3-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 5c134520e1805..6e12c070832f7 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -91,11 +91,34 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; } +static void flush_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + hyp_vcpu->vcpu.arch.debug_owner = host_vcpu->arch.debug_owner; + + if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu)) + hyp_vcpu->vcpu.arch.vcpu_debug_state = host_vcpu->arch.vcpu_debug_state; + else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu)) + hyp_vcpu->vcpu.arch.external_debug_state = host_vcpu->arch.external_debug_state; +} + +static void sync_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + if (kvm_guest_owns_debug_regs(&hyp_vcpu->vcpu)) + host_vcpu->arch.vcpu_debug_state = hyp_vcpu->vcpu.arch.vcpu_debug_state; + else if (kvm_host_owns_debug_regs(&hyp_vcpu->vcpu)) + host_vcpu->arch.external_debug_state = hyp_vcpu->vcpu.arch.external_debug_state; +} + static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) { struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; fpsimd_sve_flush(); + flush_debug_state(hyp_vcpu); hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt; @@ -123,6 +146,7 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) unsigned int i; fpsimd_sve_sync(&hyp_vcpu->vcpu); + sync_debug_state(hyp_vcpu); host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt; -- GitLab From 46ded709232344b5750a852747a8881763c721ab Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 29 Jan 2025 
15:13:42 -0800 Subject: [PATCH 105/989] net: bcmgenet: Correct overlaying of PHY and MAC Wake-on-LAN Some Wake-on-LAN modes such as WAKE_FILTER may only be supported by the MAC, while others might be only supported by the PHY. Make sure that the .get_wol() returns the union of both rather than only that of the PHY if the PHY supports Wake-on-LAN. When disabling Wake-on-LAN, make sure that this is done at both the PHY and MAC level, rather than doing an early return from the PHY driver. Fixes: 7e400ff35cbe ("net: bcmgenet: Add support for PHY-based Wake-on-LAN") Fixes: 9ee09edc05f2 ("net: bcmgenet: Properly overlay PHY and MAC Wake-on-LAN capabilities") Signed-off-by: Florian Fainelli Link: https://patch.msgid.link/20250129231342.35013-1-florian.fainelli@broadcom.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/broadcom/genet/bcmgenet_wol.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c b/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c index 0715ea5bf13ed..3b082114f2e53 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c @@ -41,9 +41,12 @@ void bcmgenet_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) { struct bcmgenet_priv *priv = netdev_priv(dev); struct device *kdev = &priv->pdev->dev; + u32 phy_wolopts = 0; - if (dev->phydev) + if (dev->phydev) { phy_ethtool_get_wol(dev->phydev, wol); + phy_wolopts = wol->wolopts; + } /* MAC is not wake-up capable, return what the PHY does */ if (!device_can_wakeup(kdev)) @@ -51,9 +54,14 @@ void bcmgenet_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) /* Overlay MAC capabilities with that of the PHY queried before */ wol->supported |= WAKE_MAGIC | WAKE_MAGICSECURE | WAKE_FILTER; - wol->wolopts = priv->wolopts; - memset(wol->sopass, 0, sizeof(wol->sopass)); + wol->wolopts |= priv->wolopts; + /* Return the PHY configured magic password */ + if (phy_wolopts & WAKE_MAGICSECURE) + return; + + /* Otherwise the MAC one */ + memset(wol->sopass, 0, sizeof(wol->sopass)); if (wol->wolopts & WAKE_MAGICSECURE) memcpy(wol->sopass, priv->sopass, sizeof(priv->sopass)); } @@ -70,7 +78,7 @@ int bcmgenet_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol) /* Try Wake-on-LAN from the PHY first */ if (dev->phydev) { ret = phy_ethtool_set_wol(dev->phydev, wol); - if (ret != -EOPNOTSUPP) + if (ret != -EOPNOTSUPP && wol->wolopts) return ret; } -- GitLab From c71a192976ded2f2f416d03c4f595cdd4478b825 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 29 Jan 2025 19:15:18 -0800 Subject: [PATCH 106/989] net: ipv6: fix dst refleaks in rpl, seg6 and ioam6 lwtunnels dst_cache_get() gives us a reference, we need to release it. Discovered by the ioam6.sh test, kmemleak was recently fixed to catch per-cpu memory leaks. 
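The rule being enforced, sketched schematically below (not the exact ioam6/rpl/seg6 code; ilwt and fl6 stand in for the per-tunnel state): both dst_cache_get() and ip6_route_output() hand back a referenced dst, so every exit path must either transfer that reference to the skb or release it explicitly.

	dst = dst_cache_get(&ilwt->cache);		/* referenced on a cache hit */
	if (!dst) {
		dst = ip6_route_output(net, NULL, &fl6);	/* always referenced */
		if (dst->error) {
			err = dst->error;
			goto drop;			/* reference still held */
		}
		dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr);
	}
	skb_dst_drop(skb);
	skb_dst_set(skb, dst);				/* reference moves to the skb */
	return dst_output(net, sk, skb);
drop:
	dst_release(dst);				/* the previously missing release */
	kfree_skb(skb);
	return err;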
Fixes: 985ec6f5e623 ("net: ipv6: rpl_iptunnel: mitigate 2-realloc issue") Fixes: 40475b63761a ("net: ipv6: seg6_iptunnel: mitigate 2-realloc issue") Fixes: dce525185bc9 ("net: ipv6: ioam6_iptunnel: mitigate 2-realloc issue") Reviewed-by: Justin Iurman Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250130031519.2716843-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv6/ioam6_iptunnel.c | 5 +++-- net/ipv6/rpl_iptunnel.c | 6 ++++-- net/ipv6/seg6_iptunnel.c | 6 ++++-- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index 28e5a89dc2557..3936c137a5727 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -336,7 +336,7 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb), *cache_dst; + struct dst_entry *dst = skb_dst(skb), *cache_dst = NULL; struct in6_addr orig_daddr; struct ioam6_lwt *ilwt; int err = -EINVAL; @@ -407,7 +407,6 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) cache_dst = ip6_route_output(net, NULL, &fl6); if (cache_dst->error) { err = cache_dst->error; - dst_release(cache_dst); goto drop; } @@ -426,8 +425,10 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) return dst_output(net, sk, skb); } out: + dst_release(cache_dst); return dst->lwtstate->orig_output(net, sk, skb); drop: + dst_release(cache_dst); kfree_skb(skb); return err; } diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index 7ba22d2f2bfef..9b7d035631154 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -232,7 +232,6 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { err = dst->error; - dst_release(dst); goto drop; } @@ -251,6 +250,7 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) return dst_output(net, sk, skb); drop: + dst_release(dst); kfree_skb(skb); return err; } @@ -269,8 +269,10 @@ static int rpl_input(struct sk_buff *skb) local_bh_enable(); err = rpl_do_srh(skb, rlwt, dst); - if (unlikely(err)) + if (unlikely(err)) { + dst_release(dst); goto drop; + } if (!dst) { ip6_route_input(skb); diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 4bf937bfc2633..eacc4e91b48ef 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -482,8 +482,10 @@ static int seg6_input_core(struct net *net, struct sock *sk, local_bh_enable(); err = seg6_do_srh(skb, dst); - if (unlikely(err)) + if (unlikely(err)) { + dst_release(dst); goto drop; + } if (!dst) { ip6_route_input(skb); @@ -571,7 +573,6 @@ static int seg6_output_core(struct net *net, struct sock *sk, dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { err = dst->error; - dst_release(dst); goto drop; } @@ -593,6 +594,7 @@ static int seg6_output_core(struct net *net, struct sock *sk, return dst_output(net, sk, skb); drop: + dst_release(dst); kfree_skb(skb); return err; } -- GitLab From 92191dd1073088753821b862b791dcc83e558e07 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 29 Jan 2025 19:15:19 -0800 Subject: [PATCH 107/989] net: ipv6: fix dst ref loops in rpl, seg6 and ioam6 lwtunnels Some lwtunnels have a dst cache for post-transformation dst. 
If the packet destination did not change we may end up recording a reference to the lwtunnel in its own cache, and the lwtunnel state will never be freed. Discovered by the ioam6.sh test, kmemleak was recently fixed to catch per-cpu memory leaks. I'm not sure if rpl and seg6 can actually hit this, but in principle I don't see why not. Fixes: 8cb3bf8bff3c ("ipv6: ioam: Add support for the ip6ip6 encapsulation") Fixes: 6c8702c60b88 ("ipv6: sr: add support for SRH encapsulation and injection with lwtunnels") Fixes: a7a29f9c361f ("net: ipv6: add rpl sr tunnel") Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250130031519.2716843-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv6/ioam6_iptunnel.c | 9 ++++++--- net/ipv6/rpl_iptunnel.c | 9 ++++++--- net/ipv6/seg6_iptunnel.c | 9 ++++++--- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index 3936c137a5727..2c383c12a4315 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -410,9 +410,12 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) goto drop; } - local_bh_disable(); - dst_cache_set_ip6(&ilwt->cache, cache_dst, &fl6.saddr); - local_bh_enable(); + /* cache only if we don't create a dst reference loop */ + if (dst->lwtstate != cache_dst->lwtstate) { + local_bh_disable(); + dst_cache_set_ip6(&ilwt->cache, cache_dst, &fl6.saddr); + local_bh_enable(); + } err = skb_cow_head(skb, LL_RESERVED_SPACE(cache_dst->dev)); if (unlikely(err)) diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index 9b7d035631154..0ac4283acdf20 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -235,9 +235,12 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) goto drop; } - local_bh_disable(); - dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr); - local_bh_enable(); + /* cache only if we don't create a dst reference loop */ + if (orig_dst->lwtstate != dst->lwtstate) { + local_bh_disable(); + dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr); + local_bh_enable(); + } err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); if (unlikely(err)) diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index eacc4e91b48ef..33833b2064c07 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -576,9 +576,12 @@ static int seg6_output_core(struct net *net, struct sock *sk, goto drop; } - local_bh_disable(); - dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); - local_bh_enable(); + /* cache only if we don't create a dst reference loop */ + if (orig_dst->lwtstate != dst->lwtstate) { + local_bh_disable(); + dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); + local_bh_enable(); + } err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); if (unlikely(err)) -- GitLab From a8aa6a6ddce9b5585f2b74f27f3feea1427fb4e7 Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Fri, 31 Jan 2025 01:38:32 +0000 Subject: [PATCH 108/989] ice: Add check for devm_kzalloc() Add check for the return value of devm_kzalloc() to guarantee the success of allocation. 
Fixes: 42c2eb6b1f43 ("ice: Implement devlink-rate API")
Signed-off-by: Jiasheng Jiang
Reviewed-by: Michal Swiatkowski
Link: https://patch.msgid.link/20250131013832.24805-1-jiashengjiangcool@gmail.com
Signed-off-by: Jakub Kicinski
---
 drivers/net/ethernet/intel/ice/devlink/devlink.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/intel/ice/devlink/devlink.c b/drivers/net/ethernet/intel/ice/devlink/devlink.c
index d116e2b10bcea..dbdb83567364c 100644
--- a/drivers/net/ethernet/intel/ice/devlink/devlink.c
+++ b/drivers/net/ethernet/intel/ice/devlink/devlink.c
@@ -981,6 +981,9 @@ static int ice_devlink_rate_node_new(struct devlink_rate *rate_node, void **priv
 
 	/* preallocate memory for ice_sched_node */
 	node = devm_kzalloc(ice_hw_to_dev(pi->hw), sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return -ENOMEM;
+
 	*priv = node;
 
 	return 0;
-- 
GitLab


From 3f1baa91a1fdf3de9dbad4bd615b35fab347874b Mon Sep 17 00:00:00 2001
From: Sankararaman Jayaraman
Date: Fri, 31 Jan 2025 09:53:41 +0530
Subject: [PATCH 109/989] vmxnet3: Fix tx queue race condition with XDP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If XDP traffic runs on a CPU whose id is greater than or equal to the
number of Tx queues of the NIC, then vmxnet3_xdp_get_tq() always picks
queue 0 for transmission, as it uses reciprocal scale instead of a
simple modulo operation.

vmxnet3_xdp_xmit() and vmxnet3_xdp_xmit_frame() use the above returned
queue without any locking, which can lead to race conditions when
multiple XDP xmits run in parallel on different CPUs.

This patch uses a simple modulo scheme when the current CPU equals or
exceeds the number of Tx queues on the NIC. It also adds locking in
the vmxnet3_xdp_xmit() and vmxnet3_xdp_xmit_frame() functions.
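To illustrate, with made-up queue counts, why the old mapping collapses to
queue 0:

	/*
	 * reciprocal_scale(val, ep_ro) is (u32)(((u64)val * ep_ro) >> 32): it
	 * maps a value spread over the full 32-bit range into [0, ep_ro).
	 * A raw CPU id is tiny compared to 2^32, so the product never reaches
	 * the upper 32 bits and the result is always 0:
	 *
	 *	reciprocal_scale(6, 4)  == ((u64)6  * 4) >> 32 == 0
	 *	reciprocal_scale(63, 4) == ((u64)63 * 4) >> 32 == 0
	 *
	 * The plain modulo used by the fix spreads the queues as intended:
	 *
	 *	6 % 4 == 2,  63 % 4 == 3
	 */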
Fixes: 54f00cce1178 ("vmxnet3: Add XDP support.") Signed-off-by: Sankararaman Jayaraman Signed-off-by: Ronak Doshi Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250131042340.156547-1-sankararaman.jayaraman@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/vmxnet3/vmxnet3_xdp.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/vmxnet3/vmxnet3_xdp.c b/drivers/net/vmxnet3/vmxnet3_xdp.c index 1341374a4588a..616ecc38d1726 100644 --- a/drivers/net/vmxnet3/vmxnet3_xdp.c +++ b/drivers/net/vmxnet3/vmxnet3_xdp.c @@ -28,7 +28,7 @@ vmxnet3_xdp_get_tq(struct vmxnet3_adapter *adapter) if (likely(cpu < tq_number)) tq = &adapter->tx_queue[cpu]; else - tq = &adapter->tx_queue[reciprocal_scale(cpu, tq_number)]; + tq = &adapter->tx_queue[cpu % tq_number]; return tq; } @@ -124,6 +124,7 @@ vmxnet3_xdp_xmit_frame(struct vmxnet3_adapter *adapter, u32 buf_size; u32 dw2; + spin_lock_irq(&tq->tx_lock); dw2 = (tq->tx_ring.gen ^ 0x1) << VMXNET3_TXD_GEN_SHIFT; dw2 |= xdpf->len; ctx.sop_txd = tq->tx_ring.base + tq->tx_ring.next2fill; @@ -134,6 +135,7 @@ vmxnet3_xdp_xmit_frame(struct vmxnet3_adapter *adapter, if (vmxnet3_cmd_ring_desc_avail(&tq->tx_ring) == 0) { tq->stats.tx_ring_full++; + spin_unlock_irq(&tq->tx_lock); return -ENOSPC; } @@ -142,8 +144,10 @@ vmxnet3_xdp_xmit_frame(struct vmxnet3_adapter *adapter, tbi->dma_addr = dma_map_single(&adapter->pdev->dev, xdpf->data, buf_size, DMA_TO_DEVICE); - if (dma_mapping_error(&adapter->pdev->dev, tbi->dma_addr)) + if (dma_mapping_error(&adapter->pdev->dev, tbi->dma_addr)) { + spin_unlock_irq(&tq->tx_lock); return -EFAULT; + } tbi->map_type |= VMXNET3_MAP_SINGLE; } else { /* XDP buffer from page pool */ page = virt_to_page(xdpf->data); @@ -182,6 +186,7 @@ vmxnet3_xdp_xmit_frame(struct vmxnet3_adapter *adapter, dma_wmb(); gdesc->dword[2] = cpu_to_le32(le32_to_cpu(gdesc->dword[2]) ^ VMXNET3_TXD_GEN); + spin_unlock_irq(&tq->tx_lock); /* No need to handle the case when tx_num_deferred doesn't reach * threshold. Backend driver at hypervisor side will poll and reset @@ -225,6 +230,7 @@ vmxnet3_xdp_xmit(struct net_device *dev, { struct vmxnet3_adapter *adapter = netdev_priv(dev); struct vmxnet3_tx_queue *tq; + struct netdev_queue *nq; int i; if (unlikely(test_bit(VMXNET3_STATE_BIT_QUIESCED, &adapter->state))) @@ -236,6 +242,9 @@ vmxnet3_xdp_xmit(struct net_device *dev, if (tq->stopped) return -ENETDOWN; + nq = netdev_get_tx_queue(adapter->netdev, tq->qid); + + __netif_tx_lock(nq, smp_processor_id()); for (i = 0; i < n; i++) { if (vmxnet3_xdp_xmit_frame(adapter, frames[i], tq, true)) { tq->stats.xdp_xmit_err++; @@ -243,6 +252,7 @@ vmxnet3_xdp_xmit(struct net_device *dev, } } tq->stats.xdp_xmit += i; + __netif_tx_unlock(nq); return i; } -- GitLab From 7faf14a7b0366f153284db0ad3347c457ea70136 Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Sun, 26 Jan 2025 17:47:22 +0800 Subject: [PATCH 110/989] nfsd: clear acl_access/acl_default after releasing them If getting acl_default fails, acl_access and acl_default will be released simultaneously. However, acl_access will still retain a pointer pointing to the released posix_acl, which will trigger a WARNING in nfs3svc_release_getacl like this: ------------[ cut here ]------------ refcount_t: underflow; use-after-free. 
WARNING: CPU: 26 PID: 3199 at lib/refcount.c:28 refcount_warn_saturate+0xb5/0x170 Modules linked in: CPU: 26 UID: 0 PID: 3199 Comm: nfsd Not tainted 6.12.0-rc6-00079-g04ae226af01f-dirty #8 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.1-2.fc37 04/01/2014 RIP: 0010:refcount_warn_saturate+0xb5/0x170 Code: cc cc 0f b6 1d b3 20 a5 03 80 fb 01 0f 87 65 48 d8 00 83 e3 01 75 e4 48 c7 c7 c0 3b 9b 85 c6 05 97 20 a5 03 01 e8 fb 3e 30 ff <0f> 0b eb cd 0f b6 1d 8a3 RSP: 0018:ffffc90008637cd8 EFLAGS: 00010282 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff83904fde RDX: dffffc0000000000 RSI: 0000000000000008 RDI: ffff88871ed36380 RBP: ffff888158beeb40 R08: 0000000000000001 R09: fffff520010c6f56 R10: ffffc90008637ab7 R11: 0000000000000001 R12: 0000000000000001 R13: ffff888140e77400 R14: ffff888140e77408 R15: ffffffff858b42c0 FS: 0000000000000000(0000) GS:ffff88871ed00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000562384d32158 CR3: 000000055cc6a000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? refcount_warn_saturate+0xb5/0x170 ? __warn+0xa5/0x140 ? refcount_warn_saturate+0xb5/0x170 ? report_bug+0x1b1/0x1e0 ? handle_bug+0x53/0xa0 ? exc_invalid_op+0x17/0x40 ? asm_exc_invalid_op+0x1a/0x20 ? tick_nohz_tick_stopped+0x1e/0x40 ? refcount_warn_saturate+0xb5/0x170 ? refcount_warn_saturate+0xb5/0x170 nfs3svc_release_getacl+0xc9/0xe0 svc_process_common+0x5db/0xb60 ? __pfx_svc_process_common+0x10/0x10 ? __rcu_read_unlock+0x69/0xa0 ? __pfx_nfsd_dispatch+0x10/0x10 ? svc_xprt_received+0xa1/0x120 ? xdr_init_decode+0x11d/0x190 svc_process+0x2a7/0x330 svc_handle_xprt+0x69d/0x940 svc_recv+0x180/0x2d0 nfsd+0x168/0x200 ? __pfx_nfsd+0x10/0x10 kthread+0x1a2/0x1e0 ? kthread+0xf4/0x1e0 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x34/0x60 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Kernel panic - not syncing: kernel: panic_on_warn set ... Clear acl_access/acl_default after posix_acl_release is called to prevent UAF from being triggered. 
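For context, posix_acl_release() ignores NULL pointers, which is why clearing
the fields is sufficient; roughly (abridged from include/linux/posix_acl.h):

	static inline void posix_acl_release(struct posix_acl *acl)
	{
		if (acl && refcount_dec_and_test(&acl->a_refcount))
			kfree_rcu(acl, a_rcu);
	}

Once the error path NULLs resp->acl_access and resp->acl_default, the later
call from nfs3svc_release_getacl() becomes a no-op instead of a second
refcount drop on already-freed memory.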
Fixes: a257cdd0e217 ("[PATCH] NFSD: Add server support for NFSv3 ACLs.") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20241107014705.2509463-1-lilingfeng@huaweicloud.com/ Signed-off-by: Li Lingfeng Reviewed-by: Rick Macklem Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs2acl.c | 2 ++ fs/nfsd/nfs3acl.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 4e3be7201b1c4..5fb202acb0fd0 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -84,6 +84,8 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp) fail: posix_acl_release(resp->acl_access); posix_acl_release(resp->acl_default); + resp->acl_access = NULL; + resp->acl_default = NULL; goto out; } diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 5e34e98db969d..7b5433bd30197 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -76,6 +76,8 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp) fail: posix_acl_release(resp->acl_access); posix_acl_release(resp->acl_default); + resp->acl_access = NULL; + resp->acl_default = NULL; goto out; } -- GitLab From b9382e29ca538b879645899ce45d652a304e2ed2 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 25 Jan 2025 20:13:18 -0500 Subject: [PATCH 111/989] nfsd: validate the nfsd_serv pointer before calling svc_wake_up nfsd_file_dispose_list_delayed can be called from the filecache laundrette, which is shut down after the nfsd threads are shut down and the nfsd_serv pointer is cleared. If nn->nfsd_serv is NULL then there are no threads to wake. Ensure that the nn->nfsd_serv pointer is non-NULL before calling svc_wake_up in nfsd_file_dispose_list_delayed. This is safe since the svc_serv is not freed until after the filecache laundrette is cancelled. Reported-by: Salvatore Bonaccorso Closes: https://bugs.debian.org/1093734 Fixes: ffb402596147 ("nfsd: Don't leave work of closing files to a work queue") Cc: stable@vger.kernel.org Signed-off-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index a1cdba42c4fad..78f4b5573b909 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -445,11 +445,20 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose) struct nfsd_file, nf_gc); struct nfsd_net *nn = net_generic(nf->nf_net, nfsd_net_id); struct nfsd_fcache_disposal *l = nn->fcache_disposal; + struct svc_serv *serv; spin_lock(&l->lock); list_move_tail(&nf->nf_gc, &l->freeme); spin_unlock(&l->lock); - svc_wake_up(nn->nfsd_serv); + + /* + * The filecache laundrette is shut down after the + * nn->nfsd_serv pointer is cleared, but before the + * svc_serv is freed. + */ + serv = nn->nfsd_serv; + if (serv) + svc_wake_up(serv); } } -- GitLab From b69bb476dee99d564d65d418e9a20acca6f32c3f Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 30 Jan 2025 16:05:42 -0800 Subject: [PATCH 112/989] cgroup: fix race between fork and cgroup.kill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tejun reported the following race between fork() and cgroup.kill at [1]. Tejun: I was looking at cgroup.kill implementation and wondering whether there could be a race window. So, __cgroup_kill() does the following: k1. Set CGRP_KILL. k2. Iterate tasks and deliver SIGKILL. k3. Clear CGRP_KILL. The copy_process() does the following: c1. Copy a bunch of stuff. c2. Grab siglock. c3. Check fatal_signal_pending(). c4. 
Commit to forking. c5. Release siglock. c6. Call cgroup_post_fork() which
puts the task on the css_set and tests CGRP_KILL.

The intention seems to be that either a forking task gets SIGKILL and
terminates on c3 or it sees CGRP_KILL on c6 and kills the child. However,
I don't see what guarantees that k3 can't happen before c6. ie. After a
forking task passes c5, k2 can take place and then before the forking task
reaches c6, k3 can happen. Then, nobody would send SIGKILL to the child.
What am I missing?

This is indeed a race. One way to fix this race is by taking
cgroup_threadgroup_rwsem in write mode in __cgroup_kill() as the fork()
side takes cgroup_threadgroup_rwsem in read mode from cgroup_can_fork()
to cgroup_post_fork(). However, that would be heavy-handed as this adds
one more potential stall scenario for cgroup.kill, which is usually called
under extreme situations like memory pressure.

To fix this race, let's maintain a sequence number per cgroup which gets
incremented on each __cgroup_kill() call. On the fork() side,
cgroup_can_fork() will cache the sequence number locally and recheck it
against the cgroup's sequence number at the cgroup_post_fork() site. If the
sequence numbers mismatch, it means __cgroup_kill() has been called and we
should send SIGKILL to the newly created task.

Reported-by: Tejun Heo
Closes: https://lore.kernel.org/all/Z5QHE2Qn-QZ6M-KW@slm.duckdns.org/ [1]
Fixes: 661ee6280931 ("cgroup: introduce cgroup.kill")
Cc: stable@vger.kernel.org # v5.14+
Signed-off-by: Shakeel Butt
Reviewed-by: Michal Koutný
Signed-off-by: Tejun Heo
---
 include/linux/cgroup-defs.h |  6 +++---
 include/linux/sched/task.h  |  1 +
 kernel/cgroup/cgroup.c      | 20 ++++++++++++--------
 3 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1b20d2d8ef7cc..17960a1e858db 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -71,9 +71,6 @@ enum {
 
 	/* Cgroup is frozen. */
 	CGRP_FROZEN,
-
-	/* Control group has to be killed. */
-	CGRP_KILL,
 };
 
 /* cgroup_root->flags */
@@ -461,6 +458,9 @@ struct cgroup {
 
 	int nr_threaded_children;	/* # of live threaded child cgroups */
 
+	/* sequence number for cgroup.kill, serialized by css_set_lock.
*/ + unsigned int kill_seq; + struct kernfs_node *kn; /* cgroup kernfs entry */ struct cgroup_file procs_file; /* handle for "cgroup.procs" */ struct cgroup_file events_file; /* handle for "cgroup.events" */ diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 0f2aeb37bbb04..ca1db4b92c324 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -43,6 +43,7 @@ struct kernel_clone_args { void *fn_arg; struct cgroup *cgrp; struct css_set *cset; + unsigned int kill_seq; }; /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d9061bd55436b..afc665b7b1fe5 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4013,7 +4013,7 @@ static void __cgroup_kill(struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - set_bit(CGRP_KILL, &cgrp->flags); + cgrp->kill_seq++; spin_unlock_irq(&css_set_lock); css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it); @@ -4029,10 +4029,6 @@ static void __cgroup_kill(struct cgroup *cgrp) send_sig(SIGKILL, task, 0); } css_task_iter_end(&it); - - spin_lock_irq(&css_set_lock); - clear_bit(CGRP_KILL, &cgrp->flags); - spin_unlock_irq(&css_set_lock); } static void cgroup_kill(struct cgroup *cgrp) @@ -6488,6 +6484,10 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); + if (kargs->cgrp) + kargs->kill_seq = kargs->cgrp->kill_seq; + else + kargs->kill_seq = cset->dfl_cgrp->kill_seq; spin_unlock_irq(&css_set_lock); if (!(kargs->flags & CLONE_INTO_CGROUP)) { @@ -6668,6 +6668,7 @@ void cgroup_post_fork(struct task_struct *child, struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { + unsigned int cgrp_kill_seq = 0; unsigned long cgrp_flags = 0; bool kill = false; struct cgroup_subsys *ss; @@ -6681,10 +6682,13 @@ void cgroup_post_fork(struct task_struct *child, /* init tasks are special, only link regular threads */ if (likely(child->pid)) { - if (kargs->cgrp) + if (kargs->cgrp) { cgrp_flags = kargs->cgrp->flags; - else + cgrp_kill_seq = kargs->cgrp->kill_seq; + } else { cgrp_flags = cset->dfl_cgrp->flags; + cgrp_kill_seq = cset->dfl_cgrp->kill_seq; + } WARN_ON_ONCE(!list_empty(&child->cg_list)); cset->nr_tasks++; @@ -6719,7 +6723,7 @@ void cgroup_post_fork(struct task_struct *child, * child down right after we finished preparing it for * userspace. */ - kill = test_bit(CGRP_KILL, &cgrp_flags); + kill = kargs->kill_seq != cgrp_kill_seq; } spin_unlock_irq(&css_set_lock); -- GitLab From 029b6ce733712a41421955194b113f283dcb1026 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Sun, 2 Feb 2025 12:37:48 +0900 Subject: [PATCH 113/989] sched_ext: Fix incorrect time delta calculation in time_delta() When (s64)(after - before) > 0, the code returns the result of (s64)(after - before) > 0 while the intended result should be (s64)(after - before). That happens because the middle operand of the ternary operator was omitted incorrectly, returning the result of (s64)(after - before) > 0. Thus, add the middle operand -- (s64)(after - before) -- to return the correct time calculation. 
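A worked example with made-up timestamps shows the difference; in GNU C,
"a ?: b" evaluates to a itself when a is nonzero, and a here is the
comparison, not the difference:

	/*
	 *	after = 1105, before = 1100
	 *
	 *	(s64)(after - before) > 0 ? : 0
	 *		-> 1, the boolean result of the comparison
	 *
	 *	(s64)(after - before) > 0 ? (s64)(after - before) : 0
	 *		-> 5, the intended delta
	 */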
Fixes: d07be814fc71 ("sched_ext: Add time helpers for BPF schedulers") Signed-off-by: Changwoo Min Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- tools/sched_ext/include/scx/common.bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index f254a39b86a58..d72b60a0c582c 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -432,7 +432,7 @@ void bpf_rcu_read_unlock(void) __ksym; */ static inline s64 time_delta(u64 after, u64 before) { - return (s64)(after - before) > 0 ? : 0; + return (s64)(after - before) > 0 ? (s64)(after - before) : 0; } /** -- GitLab From 64b48ec36dbed561ab1cd99708c33d96f4b7b729 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 3 Feb 2025 12:47:17 +1100 Subject: [PATCH 114/989] drivers/block/sunvdc.c: update the correct AIP call My sparc64 defconfig build failed like this: drivers/block/sunvdc.c: In function 'vdc_queue_drain': drivers/block/sunvdc.c:1130:9: error: too many arguments to function 'blk_mq_unquiesce_queue' 1130 | blk_mq_unquiesce_queue(q, memflags); | ^~~~~~~~~~~~~~~~~~~~~~ In file included from drivers/block/sunvdc.c:10: include/linux/blk-mq.h:895:6: note: declared here 895 | void blk_mq_unquiesce_queue(struct request_queue *q); | ^~~~~~~~~~~~~~~~~~~~~~ drivers/block/sunvdc.c:1131:9: error: too few arguments to function 'blk_mq_unfreeze_queue' 1131 | blk_mq_unfreeze_queue(q); | ^~~~~~~~~~~~~~~~~~~~~ In file included from drivers/block/sunvdc.c:10: include/linux/blk-mq.h:914:1: note: declared here 914 | blk_mq_unfreeze_queue(struct request_queue *q, unsigned int memflags) | ^~~~~~~~~~~~~~~~~~~~~ Fixes: 1e1a9cecfab3 ("block: force noio scope in blk_mq_freeze_queue") Cc: Christoph Hellwig Cc: Jens Axboe Signed-off-by: Stephen Rothwell Signed-off-by: Jens Axboe --- drivers/block/sunvdc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 05c4aee7f262a..654ed962a772f 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -1127,8 +1127,8 @@ static void vdc_queue_drain(struct vdc_port *port) spin_lock_irq(&port->vio.lock); port->drain = 0; - blk_mq_unquiesce_queue(q, memflags); - blk_mq_unfreeze_queue(q); + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q, memflags); } static void vdc_ldc_reset_timer_work(struct work_struct *work) -- GitLab From f0ada00a9b3801b71d203b0033b7612b687b7d72 Mon Sep 17 00:00:00 2001 From: Imran Shaik Date: Thu, 9 Jan 2025 14:27:44 +0530 Subject: [PATCH 115/989] dt-bindings: clock: qcom: Add GPU clocks for QCS8300 The QCS8300 GPU clock controller is a derivative of SA8775P, but has few additional clocks and minor differences. Hence, reuse gpucc bindings of SA8775P and add additional clocks required for QCS8300. 
Acked-by: Krzysztof Kozlowski Signed-off-by: Imran Shaik Link: https://lore.kernel.org/r/20250109-qcs8300-mm-patches-new-v4-1-63e8ac268b02@quicinc.com Signed-off-by: Rob Herring (Arm) --- .../devicetree/bindings/clock/qcom,gpucc.yaml | 3 +++ include/dt-bindings/clock/qcom,qcs8300-gpucc.h | 17 +++++++++++++++++ 2 files changed, 20 insertions(+) create mode 100644 include/dt-bindings/clock/qcom,qcs8300-gpucc.h diff --git a/Documentation/devicetree/bindings/clock/qcom,gpucc.yaml b/Documentation/devicetree/bindings/clock/qcom,gpucc.yaml index 0858fd6352822..4cdff6161bf0b 100644 --- a/Documentation/devicetree/bindings/clock/qcom,gpucc.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,gpucc.yaml @@ -8,6 +8,7 @@ title: Qualcomm Graphics Clock & Reset Controller maintainers: - Taniya Das + - Imran Shaik description: | Qualcomm graphics clock control module provides the clocks, resets and power @@ -23,10 +24,12 @@ description: | include/dt-bindings/clock/qcom,gpucc-sm8150.h include/dt-bindings/clock/qcom,gpucc-sm8250.h include/dt-bindings/clock/qcom,gpucc-sm8350.h + include/dt-bindings/clock/qcom,qcs8300-gpucc.h properties: compatible: enum: + - qcom,qcs8300-gpucc - qcom,sdm845-gpucc - qcom,sa8775p-gpucc - qcom,sc7180-gpucc diff --git a/include/dt-bindings/clock/qcom,qcs8300-gpucc.h b/include/dt-bindings/clock/qcom,qcs8300-gpucc.h new file mode 100644 index 0000000000000..afa187467b4c1 --- /dev/null +++ b/include/dt-bindings/clock/qcom,qcs8300-gpucc.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) 2024, Qualcomm Innovation Center, Inc. All rights reserved. + */ + +#ifndef _DT_BINDINGS_CLK_QCOM_GPUCC_QCS8300_H +#define _DT_BINDINGS_CLK_QCOM_GPUCC_QCS8300_H + +#include "qcom,sa8775p-gpucc.h" + +/* QCS8300 introduces below new clocks compared to SA8775P */ + +/* GPU_CC clocks */ +#define GPU_CC_CX_ACCU_SHIFT_CLK 23 +#define GPU_CC_GX_ACCU_SHIFT_CLK 24 + +#endif -- GitLab From 0e193cc558e32a879c717bb2d53a1cf8628b5d20 Mon Sep 17 00:00:00 2001 From: Imran Shaik Date: Thu, 9 Jan 2025 14:27:46 +0530 Subject: [PATCH 116/989] dt-bindings: clock: qcom: Add CAMCC clocks for QCS8300 The QCS8300 camera clock controller is a derivative of SA8775P, but has an additional clock and minor differences. Hence, reuse the SA8775P camera bindings and add additional clock required for QCS8300. Reviewed-by: Vladimir Zapolskiy Acked-by: Krzysztof Kozlowski Signed-off-by: Imran Shaik Link: https://lore.kernel.org/r/20250109-qcs8300-mm-patches-new-v4-3-63e8ac268b02@quicinc.com Signed-off-by: Rob Herring (Arm) --- .../bindings/clock/qcom,sa8775p-camcc.yaml | 6 +++++- include/dt-bindings/clock/qcom,qcs8300-camcc.h | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 include/dt-bindings/clock/qcom,qcs8300-camcc.h diff --git a/Documentation/devicetree/bindings/clock/qcom,sa8775p-camcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sa8775p-camcc.yaml index 36a60d8f5ae3a..81623f59d11d7 100644 --- a/Documentation/devicetree/bindings/clock/qcom,sa8775p-camcc.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,sa8775p-camcc.yaml @@ -8,16 +8,20 @@ title: Qualcomm Camera Clock & Reset Controller on SA8775P maintainers: - Taniya Das + - Imran Shaik description: | Qualcomm camera clock control module provides the clocks, resets and power domains on SA8775p. 
- See also: include/dt-bindings/clock/qcom,sa8775p-camcc.h + See also: + include/dt-bindings/clock/qcom,qcs8300-camcc.h + include/dt-bindings/clock/qcom,sa8775p-camcc.h properties: compatible: enum: + - qcom,qcs8300-camcc - qcom,sa8775p-camcc clocks: diff --git a/include/dt-bindings/clock/qcom,qcs8300-camcc.h b/include/dt-bindings/clock/qcom,qcs8300-camcc.h new file mode 100644 index 0000000000000..fc535c8478591 --- /dev/null +++ b/include/dt-bindings/clock/qcom,qcs8300-camcc.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) 2024, Qualcomm Innovation Center, Inc. All rights reserved. + */ + +#ifndef _DT_BINDINGS_CLK_QCOM_QCS8300_CAM_CC_H +#define _DT_BINDINGS_CLK_QCOM_QCS8300_CAM_CC_H + +#include "qcom,sa8775p-camcc.h" + +/* QCS8300 introduces below new clocks compared to SA8775P */ + +/* CAM_CC clocks */ +#define CAM_CC_TITAN_TOP_ACCU_SHIFT_CLK 86 + +#endif -- GitLab From 3e86e57356f0e2284454d82c7200807c6fa9e65b Mon Sep 17 00:00:00 2001 From: Imran Shaik Date: Thu, 9 Jan 2025 14:27:48 +0530 Subject: [PATCH 117/989] dt-bindings: clock: qcom: Add QCS8300 video clock controller The QCS8300 video clock controller is a derivative of SA8775P, but QCS8300 has minor difference. Hence, reuse the SA8775P videocc bindings for QCS8300 platform. Acked-by: Krzysztof Kozlowski Reviewed-by: Dmitry Baryshkov Signed-off-by: Imran Shaik Link: https://lore.kernel.org/r/20250109-qcs8300-mm-patches-new-v4-5-63e8ac268b02@quicinc.com Signed-off-by: Rob Herring (Arm) --- .../devicetree/bindings/clock/qcom,sa8775p-videocc.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/clock/qcom,sa8775p-videocc.yaml b/Documentation/devicetree/bindings/clock/qcom,sa8775p-videocc.yaml index 928131bff4c19..07e5d811d8161 100644 --- a/Documentation/devicetree/bindings/clock/qcom,sa8775p-videocc.yaml +++ b/Documentation/devicetree/bindings/clock/qcom,sa8775p-videocc.yaml @@ -18,6 +18,7 @@ description: | properties: compatible: enum: + - qcom,qcs8300-videocc - qcom,sa8775p-videocc clocks: -- GitLab From e6649328dc07bff6227367eda6f1b2263d6c10f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 29 Jan 2025 14:35:27 +0100 Subject: [PATCH 118/989] of: address: Add kunit test for __of_address_resource_bounds() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The overflow checking has to deal with different datatypes and edgecases. Add a new kunit testcase to make sure it works correctly. 
Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250129-of-address-overflow-v3-1-95d1760ed791@linutronix.de Signed-off-by: Rob Herring (Arm) --- drivers/of/address.c | 5 +- drivers/of/of_private.h | 4 ++ drivers/of/of_test.c | 119 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 126 insertions(+), 2 deletions(-) diff --git a/drivers/of/address.c b/drivers/of/address.c index 125833e5ce52e..d177a2b9edaf8 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -16,6 +16,8 @@ #include #include /* for bus_dma_region */ +#include + /* Uncomment me to enable of_dump_addr() debugging output */ // #define DEBUG @@ -183,7 +185,7 @@ static u64 of_bus_pci_map(__be32 *addr, const __be32 *range, int na, int ns, #endif /* CONFIG_PCI */ -static int __of_address_resource_bounds(struct resource *r, u64 start, u64 size) +VISIBLE_IF_KUNIT int __of_address_resource_bounds(struct resource *r, u64 start, u64 size) { if (overflows_type(start, r->start)) return -EOVERFLOW; @@ -197,6 +199,7 @@ static int __of_address_resource_bounds(struct resource *r, u64 start, u64 size) return 0; } +EXPORT_SYMBOL_IF_KUNIT(__of_address_resource_bounds); /* * of_pci_range_to_resource - Create a resource from an of_pci_range diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h index f3e1193c8ded4..1bdc7ceef3c5f 100644 --- a/drivers/of/of_private.h +++ b/drivers/of/of_private.h @@ -208,4 +208,8 @@ static void __maybe_unused of_dump_addr(const char *s, const __be32 *addr, int n static void __maybe_unused of_dump_addr(const char *s, const __be32 *addr, int na) { } #endif +#if IS_ENABLED(CONFIG_KUNIT) +int __of_address_resource_bounds(struct resource *r, u64 start, u64 size); +#endif + #endif /* _LINUX_OF_PRIVATE_H */ diff --git a/drivers/of/of_test.c b/drivers/of/of_test.c index b0557ded838fd..8bba5a72c9c7c 100644 --- a/drivers/of/of_test.c +++ b/drivers/of/of_test.c @@ -2,6 +2,7 @@ /* * KUnit tests for OF APIs */ +#include #include #include @@ -54,8 +55,124 @@ static struct kunit_suite of_dtb_suite = { .init = of_dtb_test_init, }; +struct of_address_resource_bounds_case { + u64 start; + u64 size; + int ret; + + u64 res_start; + u64 res_end; +}; + +static void of_address_resource_bounds_case_desc(const struct of_address_resource_bounds_case *p, + char *name) +{ + snprintf(name, KUNIT_PARAM_DESC_SIZE, "start=0x%016llx,size=0x%016llx", p->start, p->size); +} + +static const struct of_address_resource_bounds_case of_address_resource_bounds_cases[] = { + { + .start = 0, + .size = 0, + .ret = 0, + .res_start = 0, + .res_end = -1, + }, + { + .start = 0, + .size = 0x1000, + .ret = 0, + .res_start = 0, + .res_end = 0xfff, + }, + { + .start = 0x1000, + .size = 0, + .ret = 0, + .res_start = 0x1000, + .res_end = 0xfff, + }, + { + .start = 0x1000, + .size = 0x1000, + .ret = 0, + .res_start = 0x1000, + .res_end = 0x1fff, + }, + { + .start = 1, + .size = RESOURCE_SIZE_MAX, + .ret = 0, + .res_start = 1, + .res_end = RESOURCE_SIZE_MAX, + }, + { + .start = RESOURCE_SIZE_MAX, + .size = 1, + .ret = 0, + .res_start = RESOURCE_SIZE_MAX, + .res_end = RESOURCE_SIZE_MAX, + }, + { + .start = 2, + .size = RESOURCE_SIZE_MAX, + .ret = -EOVERFLOW, + }, + { + .start = RESOURCE_SIZE_MAX, + .size = 2, + .ret = -EOVERFLOW, + }, + { + .start = ULL(0x100000000), + .size = 1, + .ret = sizeof(resource_size_t) > sizeof(u32) ? 0 : -EOVERFLOW, + .res_start = ULL(0x100000000), + .res_end = ULL(0x100000000), + }, + { + .start = 0x1000, + .size = 0xffffffff, + .ret = sizeof(resource_size_t) > sizeof(u32) ? 
0 : -EOVERFLOW, + .res_start = 0x1000, + .res_end = ULL(0x100000ffe), + }, +}; + +KUNIT_ARRAY_PARAM(of_address_resource_bounds, + of_address_resource_bounds_cases, of_address_resource_bounds_case_desc); + +static void of_address_resource_bounds(struct kunit *test) +{ + const struct of_address_resource_bounds_case *param = test->param_value; + struct resource r; /* Intentionally uninitialized */ + int ret; + + if (!IS_ENABLED(CONFIG_OF_ADDRESS)) + kunit_skip(test, "CONFIG_OF_ADDRESS not enabled\n"); + + ret = __of_address_resource_bounds(&r, param->start, param->size); + KUNIT_EXPECT_EQ(test, param->ret, ret); + if (ret == 0) { + KUNIT_EXPECT_EQ(test, (resource_size_t)param->res_start, r.start); + KUNIT_EXPECT_EQ(test, (resource_size_t)param->res_end, r.end); + KUNIT_EXPECT_EQ(test, param->size, resource_size(&r)); + } +} + +static struct kunit_case of_address_test_cases[] = { + KUNIT_CASE_PARAM(of_address_resource_bounds, of_address_resource_bounds_gen_params), + {} +}; + +static struct kunit_suite of_address_suite = { + .name = "of_address", + .test_cases = of_address_test_cases, +}; + kunit_test_suites( - &of_dtb_suite, + &of_dtb_suite, &of_address_suite, ); MODULE_DESCRIPTION("KUnit tests for OF APIs"); +MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING"); MODULE_LICENSE("GPL"); -- GitLab From fa803513ab68ba07369643393f1754b845160030 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Fri, 10 Jan 2025 17:19:49 +0800 Subject: [PATCH 119/989] cpufreq/amd-pstate: Fix per-policy boost flag incorrect when fail Commit c8c68c38b56f ("cpufreq: amd-pstate: initialize core precision boost state") sets per-policy boost flag to false when boost fail. However, this boost flag will be set to reverse value in store_local_boost() and cpufreq_boost_trigger_state() in cpufreq.c. This will cause the per-policy boost flag set to true when fail to set boost. Remove the extra assignment in amd_pstate_set_boost() and keep all operations on per-policy boost flag outside of set_boost() to fix this problem. Fixes: c8c68c38b56f ("cpufreq: amd-pstate: initialize core precision boost state") Signed-off-by: Lifeng Zheng Reviewed-by: Mario Limonciello Link: https://lore.kernel.org/r/20250110091949.3610770-1-zhenglifeng1@huawei.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index dd9b8d6993d69..7120f035c0be4 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -747,7 +747,6 @@ static int amd_pstate_set_boost(struct cpufreq_policy *policy, int state) guard(mutex)(&amd_pstate_driver_lock); ret = amd_pstate_cpu_boost_update(policy, state); - policy->boost_enabled = !ret ? state : false; refresh_frequency_limits(policy); return ret; -- GitLab From e4d4648eac8b4ef39f412d07715eb26f1ccd7342 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Tue, 28 Jan 2025 00:02:01 +0300 Subject: [PATCH 120/989] platform/x86: ideapad-laptop: pass a correct pointer to the driver data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit devm_platform_profile_register() expects a pointer to the private driver data but instead an address of the pointer variable is passed due to a typo. 
This leads to the crashes later: BUG: unable to handle page fault for address: 00000000fe0d0044 PGD 0 P4D 0 Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 6 UID: 0 PID: 1284 Comm: tuned Tainted: G W 6.13.0+ #7 Tainted: [W]=WARN Hardware name: LENOVO 21D0/LNVNB161216, BIOS J6CN45WW 03/17/2023 RIP: 0010:__mutex_lock.constprop.0+0x6bf/0x7f0 Call Trace: dytc_profile_set+0x4a/0x140 [ideapad_laptop] _store_and_notify+0x13/0x40 [platform_profile] class_for_each_device+0x145/0x180 platform_profile_store+0xc0/0x130 [platform_profile] kernfs_fop_write_iter+0x13e/0x1f0 vfs_write+0x290/0x450 ksys_write+0x6c/0xe0 do_syscall_64+0x82/0x160 entry_SYSCALL_64_after_hwframe+0x76/0x7e Found by Linux Verification Center (linuxtesting.org). Fixes: 249c576f0f9d ("ACPI: platform_profile: Let drivers set drvdata to the class device") Signed-off-by: Fedor Pchelkin Reviewed-by: Kurt Borja Link: https://lore.kernel.org/r/20250127210202.568691-1-pchelkin@ispras.ru Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/ideapad-laptop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/ideapad-laptop.c b/drivers/platform/x86/ideapad-laptop.c index dfb5d4b8c0465..30bd366d7b58a 100644 --- a/drivers/platform/x86/ideapad-laptop.c +++ b/drivers/platform/x86/ideapad-laptop.c @@ -1121,7 +1121,7 @@ static int ideapad_dytc_profile_init(struct ideapad_private *priv) /* Create platform_profile structure and register */ priv->dytc->ppdev = devm_platform_profile_register(&priv->platform_device->dev, - "ideapad-laptop", &priv->dytc, + "ideapad-laptop", priv->dytc, &dytc_profile_ops); if (IS_ERR(priv->dytc->ppdev)) { err = PTR_ERR(priv->dytc->ppdev); -- GitLab From 583ef25bb2a094813351a727ddec38b35a15b9f8 Mon Sep 17 00:00:00 2001 From: Dmitry Kandybka Date: Fri, 24 Jan 2025 01:07:39 +0300 Subject: [PATCH 121/989] platform/x86/intel: pmc: fix ltr decode in pmc_core_ltr_show() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In pmc_core_ltr_show(), promote 'val' to 'u64' to avoid possible integer overflow. Values (10 bit) are multiplied by the scale, the result of expression is in a range from 1 to 34,326,183,936 which is bigger then UINT32_MAX. Compile tested only. Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Dmitry Kandybka Reviewed-by: Rajneesh Bhardwaj Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20250123220739.68087-1-d.kandybka@gmail.com Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/pmc/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/intel/pmc/core.c b/drivers/platform/x86/intel/pmc/core.c index 10f04b9441174..1ee0fb5f8250b 100644 --- a/drivers/platform/x86/intel/pmc/core.c +++ b/drivers/platform/x86/intel/pmc/core.c @@ -626,8 +626,8 @@ static u32 convert_ltr_scale(u32 val) static int pmc_core_ltr_show(struct seq_file *s, void *unused) { struct pmc_dev *pmcdev = s->private; - u64 decoded_snoop_ltr, decoded_non_snoop_ltr; - u32 ltr_raw_data, scale, val; + u64 decoded_snoop_ltr, decoded_non_snoop_ltr, val; + u32 ltr_raw_data, scale; u16 snoop_ltr, nonsnoop_ltr; unsigned int i, index, ltr_index = 0; -- GitLab From 5c8f9a05336cf5cadbd57ad461621b386aadb762 Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Thu, 30 Jan 2025 08:38:49 +0300 Subject: [PATCH 122/989] arm64: dts: rockchip: Fix broken tsadc pinctrl names for rk3588 The tsadc driver does not handle pinctrl "gpio" and "otpout". 
Let's use the correct pinctrl names "default" and "sleep". Additionally, Alexey Charkov's testing [1] has established that it is necessary for pinctrl state to reference the &tsadc_shut_org configuration rather than &tsadc_shut for the driver to function correctly. [1] https://lkml.org/lkml/2025/1/24/966 Fixes: 32641b8ab1a5 ("arm64: dts: rockchip: add rk3588 thermal sensor") Cc: stable@vger.kernel.org Reviewed-by: Dragan Simic Signed-off-by: Alexander Shiyan Link: https://lore.kernel.org/r/20250130053849.4902-1-eagle.alexander923@gmail.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588-base.dtsi | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi index 8cfa30837ce72..978de506d4348 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi @@ -2668,9 +2668,9 @@ tsadc: tsadc@fec00000 { rockchip,hw-tshut-temp = <120000>; rockchip,hw-tshut-mode = <0>; /* tshut mode 0:CRU 1:GPIO */ rockchip,hw-tshut-polarity = <0>; /* tshut polarity 0:LOW 1:HIGH */ - pinctrl-0 = <&tsadc_gpio_func>; - pinctrl-1 = <&tsadc_shut>; - pinctrl-names = "gpio", "otpout"; + pinctrl-0 = <&tsadc_shut_org>; + pinctrl-1 = <&tsadc_gpio_func>; + pinctrl-names = "default", "sleep"; #thermal-sensor-cells = <1>; status = "disabled"; }; -- GitLab From a6a7cba17c544fb95d5a29ab9d9ed4503029cb29 Mon Sep 17 00:00:00 2001 From: Tianling Shen Date: Sun, 19 Jan 2025 17:11:54 +0800 Subject: [PATCH 123/989] arm64: dts: rockchip: change eth phy mode to rgmii-id for orangepi r1 plus lts In general the delay should be added by the PHY instead of the MAC, and this improves network stability on some boards which seem to need different delay. 
Fixes: 387b3bbac5ea ("arm64: dts: rockchip: Add Xunlong OrangePi R1 Plus LTS") Cc: stable@vger.kernel.org # 6.6+ Signed-off-by: Tianling Shen Link: https://lore.kernel.org/r/20250119091154.1110762-1-cnsztl@gmail.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus-lts.dts | 3 +-- arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dts | 1 + arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dtsi | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus-lts.dts b/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus-lts.dts index 67c246ad8b8c0..ec2ce894da1fc 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus-lts.dts +++ b/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus-lts.dts @@ -17,8 +17,7 @@ / { &gmac2io { phy-handle = <&yt8531c>; - tx_delay = <0x19>; - rx_delay = <0x05>; + phy-mode = "rgmii-id"; status = "okay"; mdio { diff --git a/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dts b/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dts index 324a8e951f7e4..846b931e16d21 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dts +++ b/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dts @@ -15,6 +15,7 @@ / { &gmac2io { phy-handle = <&rtl8211e>; + phy-mode = "rgmii"; tx_delay = <0x24>; rx_delay = <0x18>; status = "okay"; diff --git a/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dtsi b/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dtsi index 4f193704e5dc2..09508e324a280 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dtsi @@ -109,7 +109,6 @@ &gmac2io { assigned-clocks = <&cru SCLK_MAC2IO>, <&cru SCLK_MAC2IO_EXT>; assigned-clock-parents = <&gmac_clk>, <&gmac_clk>; clock_in_out = "input"; - phy-mode = "rgmii"; phy-supply = <&vcc_io>; pinctrl-0 = <&rgmiim1_pins>; pinctrl-names = "default"; -- GitLab From 4eee627ea59304cdd66c5d4194ef13486a6c44fc Mon Sep 17 00:00:00 2001 From: Lukasz Czechowski Date: Tue, 21 Jan 2025 13:56:03 +0100 Subject: [PATCH 124/989] arm64: dts: rockchip: Move uart5 pin configuration to px30 ringneck SoM In the PX30-uQ7 (Ringneck) SoM, the hardware CTS and RTS pins for uart5 cannot be used for the UART CTS/RTS, because they are already allocated for different purposes. CTS pin is routed to SUS_S3# signal, while RTS pin is used internally and is not available on Q7 connector. Move definition of the pinctrl-0 property from px30-ringneck-haikou.dts to px30-ringneck.dtsi. This commit is a dependency to next commit in the patch series, that disables DMA for uart5. 
Cc: stable@vger.kernel.org Reviewed-by: Quentin Schulz Signed-off-by: Lukasz Czechowski Link: https://lore.kernel.org/r/20250121125604.3115235-2-lukasz.czechowski@thaumatec.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts | 1 - arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts b/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts index e4517f47d519c..eb9470a00e549 100644 --- a/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts +++ b/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts @@ -226,7 +226,6 @@ &uart0 { }; &uart5 { - pinctrl-0 = <&uart5_xfer>; rts-gpios = <&gpio0 RK_PB5 GPIO_ACTIVE_HIGH>; status = "okay"; }; diff --git a/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi b/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi index ae050cc6cd050..2c87005c89bd3 100644 --- a/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi +++ b/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi @@ -396,6 +396,10 @@ &u2phy_host { status = "okay"; }; +&uart5 { + pinctrl-0 = <&uart5_xfer>; +}; + /* Mule UCAN */ &usb_host0_ehci { status = "okay"; -- GitLab From f3be8a9b1afffbcc70f8e41063b151b1038d7813 Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Wed, 29 Jan 2025 13:40:07 +0100 Subject: [PATCH 125/989] accel/ivpu: Fix error handling in ivpu_boot() Ensure IRQs and IPC are properly disabled if HW sched or DCT initialization fails. Fixes: cc3c72c7e610 ("accel/ivpu: Refactor failure diagnostics during boot") Cc: stable@vger.kernel.org # v6.13+ Reviewed-by: Karol Wachowski Reviewed-by: Jeffrey Hugo Signed-off-by: Jacek Lawrynowicz Link: https://patchwork.freedesktop.org/patch/msgid/20250129124009.1039982-2-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_drv.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c index ca2bf47ce2484..0c4a82271c26d 100644 --- a/drivers/accel/ivpu/ivpu_drv.c +++ b/drivers/accel/ivpu/ivpu_drv.c @@ -397,15 +397,19 @@ int ivpu_boot(struct ivpu_device *vdev) if (ivpu_fw_is_cold_boot(vdev)) { ret = ivpu_pm_dct_init(vdev); if (ret) - goto err_diagnose_failure; + goto err_disable_ipc; ret = ivpu_hw_sched_init(vdev); if (ret) - goto err_diagnose_failure; + goto err_disable_ipc; } return 0; +err_disable_ipc: + ivpu_ipc_disable(vdev); + ivpu_hw_irq_disable(vdev); + disable_irq(vdev->irq); err_diagnose_failure: ivpu_hw_diagnose_failure(vdev); ivpu_mmu_evtq_dump(vdev); -- GitLab From f2bc2afe34c107a02ce829a4039e85514feafe55 Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Wed, 29 Jan 2025 13:40:08 +0100 Subject: [PATCH 126/989] accel/ivpu: Clear runtime_error after pm_runtime_resume_and_get() fails pm_runtime_resume_and_get() sets dev->power.runtime_error that causes all subsequent pm_runtime_get_sync() calls to fail. Clear the runtime_error using pm_runtime_set_suspended(), so the driver doesn't have to be reloaded to recover when the NPU fails to boot during runtime resume. 
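The caller-visible effect is roughly the following (a sketch, not the driver
code; the function name is illustrative and <linux/pm_runtime.h> provides the
helpers used):

	int rpm_get_sketch(struct device *dev)
	{
		int ret = pm_runtime_resume_and_get(dev);

		if (ret < 0) {
			/*
			 * A failed resume latches dev->power.runtime_error and
			 * every later pm_runtime_get_sync() fails as well.
			 * Moving the state back to "suspended" clears the
			 * latched error so a later resume can be attempted.
			 */
			dev_err(dev, "runtime resume failed: %d\n", ret);
			pm_runtime_set_suspended(dev);
		}
		return ret;
	}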
Fixes: 7d4b4c74432d ("accel/ivpu: Remove suspend_reschedule_counter") Cc: stable@vger.kernel.org # v6.11+ Reviewed-by: Maciej Falkowski Reviewed-by: Jeffrey Hugo Signed-off-by: Jacek Lawrynowicz Link: https://patchwork.freedesktop.org/patch/msgid/20250129124009.1039982-3-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_pm.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index 949f4233946c6..c3774d2221326 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -309,7 +309,10 @@ int ivpu_rpm_get(struct ivpu_device *vdev) int ret; ret = pm_runtime_resume_and_get(vdev->drm.dev); - drm_WARN_ON(&vdev->drm, ret < 0); + if (ret < 0) { + ivpu_err(vdev, "Failed to resume NPU: %d\n", ret); + pm_runtime_set_suspended(vdev->drm.dev); + } return ret; } -- GitLab From 41a2d8286c905614f29007f1bc8e652d54654b82 Mon Sep 17 00:00:00 2001 From: Jacek Lawrynowicz Date: Wed, 29 Jan 2025 13:40:09 +0100 Subject: [PATCH 127/989] accel/ivpu: Fix error handling in recovery/reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Disable runtime PM for the duration of reset/recovery so it is possible to set the correct runtime PM state depending on the outcome of the `ivpu_resume()`. Don’t suspend or reset the HW if the NPU is suspended when the reset/recovery is requested. Also, move common reset/recovery code to separate functions for better code readability. Fixes: 27d19268cf39 ("accel/ivpu: Improve recovery and reset support") Cc: stable@vger.kernel.org # v6.8+ Reviewed-by: Maciej Falkowski Reviewed-by: Jeffrey Hugo Signed-off-by: Jacek Lawrynowicz Link: https://patchwork.freedesktop.org/patch/msgid/20250129124009.1039982-4-jacek.lawrynowicz@linux.intel.com --- drivers/accel/ivpu/ivpu_pm.c | 79 ++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 36 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index c3774d2221326..8b2b050cc41a9 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -115,41 +115,57 @@ static int ivpu_resume(struct ivpu_device *vdev) return ret; } -static void ivpu_pm_recovery_work(struct work_struct *work) +static void ivpu_pm_reset_begin(struct ivpu_device *vdev) { - struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, recovery_work); - struct ivpu_device *vdev = pm->vdev; - char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL}; - int ret; - - ivpu_err(vdev, "Recovering the NPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter)); - - ret = pm_runtime_resume_and_get(vdev->drm.dev); - if (ret) - ivpu_err(vdev, "Failed to resume NPU: %d\n", ret); - - ivpu_jsm_state_dump(vdev); - ivpu_dev_coredump(vdev); + pm_runtime_disable(vdev->drm.dev); atomic_inc(&vdev->pm->reset_counter); atomic_set(&vdev->pm->reset_pending, 1); down_write(&vdev->pm->reset_lock); +} + +static void ivpu_pm_reset_complete(struct ivpu_device *vdev) +{ + int ret; - ivpu_suspend(vdev); ivpu_pm_prepare_cold_boot(vdev); ivpu_jobs_abort_all(vdev); ivpu_ms_cleanup_all(vdev); ret = ivpu_resume(vdev); - if (ret) + if (ret) { ivpu_err(vdev, "Failed to resume NPU: %d\n", ret); + pm_runtime_set_suspended(vdev->drm.dev); + } else { + pm_runtime_set_active(vdev->drm.dev); + } up_write(&vdev->pm->reset_lock); atomic_set(&vdev->pm->reset_pending, 0); - kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt); pm_runtime_mark_last_busy(vdev->drm.dev); - 
pm_runtime_put_autosuspend(vdev->drm.dev); + pm_runtime_enable(vdev->drm.dev); +} + +static void ivpu_pm_recovery_work(struct work_struct *work) +{ + struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, recovery_work); + struct ivpu_device *vdev = pm->vdev; + char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL}; + + ivpu_err(vdev, "Recovering the NPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter)); + + ivpu_pm_reset_begin(vdev); + + if (!pm_runtime_status_suspended(vdev->drm.dev)) { + ivpu_jsm_state_dump(vdev); + ivpu_dev_coredump(vdev); + ivpu_suspend(vdev); + } + + ivpu_pm_reset_complete(vdev); + + kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt); } void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason) @@ -328,16 +344,13 @@ void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev) struct ivpu_device *vdev = pci_get_drvdata(pdev); ivpu_dbg(vdev, PM, "Pre-reset..\n"); - atomic_inc(&vdev->pm->reset_counter); - atomic_set(&vdev->pm->reset_pending, 1); - pm_runtime_get_sync(vdev->drm.dev); - down_write(&vdev->pm->reset_lock); - ivpu_prepare_for_reset(vdev); - ivpu_hw_reset(vdev); - ivpu_pm_prepare_cold_boot(vdev); - ivpu_jobs_abort_all(vdev); - ivpu_ms_cleanup_all(vdev); + ivpu_pm_reset_begin(vdev); + + if (!pm_runtime_status_suspended(vdev->drm.dev)) { + ivpu_prepare_for_reset(vdev); + ivpu_hw_reset(vdev); + } ivpu_dbg(vdev, PM, "Pre-reset done.\n"); } @@ -345,18 +358,12 @@ void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev) void ivpu_pm_reset_done_cb(struct pci_dev *pdev) { struct ivpu_device *vdev = pci_get_drvdata(pdev); - int ret; ivpu_dbg(vdev, PM, "Post-reset..\n"); - ret = ivpu_resume(vdev); - if (ret) - ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret); - up_write(&vdev->pm->reset_lock); - atomic_set(&vdev->pm->reset_pending, 0); - ivpu_dbg(vdev, PM, "Post-reset done.\n"); - pm_runtime_mark_last_busy(vdev->drm.dev); - pm_runtime_put_autosuspend(vdev->drm.dev); + ivpu_pm_reset_complete(vdev); + + ivpu_dbg(vdev, PM, "Post-reset done.\n"); } void ivpu_pm_init(struct ivpu_device *vdev) -- GitLab From 5ae4dca718eacd0a56173a687a3736eb7e627c77 Mon Sep 17 00:00:00 2001 From: Lukasz Czechowski Date: Tue, 21 Jan 2025 13:56:04 +0100 Subject: [PATCH 128/989] arm64: dts: rockchip: Disable DMA for uart5 on px30-ringneck MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit UART controllers without flow control seem to behave unstable in case DMA is enabled. The issues were indicated in the message: https://lore.kernel.org/linux-arm-kernel/CAMdYzYpXtMocCtCpZLU_xuWmOp2Ja_v0Aj0e6YFNRA-yV7u14g@mail.gmail.com/ In case of PX30-uQ7 Ringneck SoM, it was noticed that after couple of hours of UART communication, the CPU stall was occurring, leading to the system becoming unresponsive. After disabling the DMA, extensive UART communication tests for up to two weeks were performed, and no issues were further observed. The flow control pins for uart5 are not available on PX30-uQ7 Ringneck, as configured by pinctrl-0, so the DMA nodes were removed on SoM dtsi. 
Cc: stable@vger.kernel.org Fixes: c484cf93f61b ("arm64: dts: rockchip: add PX30-µQ7 (Ringneck) SoM with Haikou baseboard") Reviewed-by: Quentin Schulz Signed-off-by: Lukasz Czechowski Link: https://lore.kernel.org/r/20250121125604.3115235-3-lukasz.czechowski@thaumatec.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi b/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi index 2c87005c89bd3..e80412abec081 100644 --- a/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi +++ b/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi @@ -397,6 +397,8 @@ &u2phy_host { }; &uart5 { + /delete-property/ dmas; + /delete-property/ dma-names; pinctrl-0 = <&uart5_xfer>; }; -- GitLab From 2f9eb5262e63396a315c7da34a6c80c5d335df9f Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Thu, 16 Jan 2025 15:36:31 +0100 Subject: [PATCH 129/989] arm64: dts: rockchip: fix fixed-regulator renames on rk3399-gru devices rk3399-gru chromebooks have a regulator chains where one named regulator supplies multiple regulators pp900-usb pp900_pcie that supply the named peripherals. The dtsi used somewhat creative structure to describe that in creating the base node 3 times with different phandles and describing the EC dependency in a comment. This didn't register in the recent regulator-node renaming, as the additional nodes were empty, so adapt the missing node names for now. Fixes: 5c96e6330197 ("arm64: dts: rockchip: adapt regulator nodenames to preferred form") Tested-by: Vicente Bergas Signed-off-by: Heiko Stuebner Link: https://lore.kernel.org/r/20250116143631.3650469-1-heiko@sntech.de --- .../dts/rockchip/rk3399-gru-chromebook.dtsi | 8 +++---- .../boot/dts/rockchip/rk3399-gru-scarlet.dtsi | 6 ++--- arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi | 22 +++++++++---------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi index 988e6ca32fac9..a9ea4b0daa04c 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi @@ -22,11 +22,11 @@ pp900_ap: regulator-pp900-ap { }; /* EC turns on w/ pp900_usb_en */ - pp900_usb: pp900-ap { + pp900_usb: regulator-pp900-ap { }; /* EC turns on w/ pp900_pcie_en */ - pp900_pcie: pp900-ap { + pp900_pcie: regulator-pp900-ap { }; pp3000: regulator-pp3000 { @@ -126,7 +126,7 @@ pp1800_pcie: regulator-pp1800-pcie { }; /* Always on; plain and simple */ - pp3000_ap: pp3000_emmc: pp3000 { + pp3000_ap: pp3000_emmc: regulator-pp3000 { }; pp1500_ap_io: regulator-pp1500-ap-io { @@ -160,7 +160,7 @@ pp3300_disp: regulator-pp3300-disp { }; /* EC turns on w/ pp3300_usb_en_l */ - pp3300_usb: pp3300 { + pp3300_usb: regulator-pp3300 { }; /* gpio is shared with pp1800_pcie and pinctrl is set there */ diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi index 19b23b4389658..5e068377a0a28 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi @@ -92,7 +92,7 @@ pp900_s3: regulator-pp900-s3 { }; /* EC turns on pp1800_s3_en */ - pp1800_s3: pp1800 { + pp1800_s3: regulator-pp1800 { }; /* pp3300 children, sorted by name */ @@ -109,11 +109,11 @@ pp2800_cam: regulator-pp2800-avdd { }; /* EC turns on pp3300_s0_en */ - pp3300_s0: pp3300 { + pp3300_s0: regulator-pp3300 { }; /* 
EC turns on pp3300_s3_en */ - pp3300_s3: pp3300 { + pp3300_s3: regulator-pp3300 { }; /* diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi index 6d9e60b01225e..7eca1da78cffa 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi @@ -189,39 +189,39 @@ ppvar_gpu: ppvar-gpu { }; /* EC turns on w/ pp900_ddrpll_en */ - pp900_ddrpll: pp900-ap { + pp900_ddrpll: regulator-pp900-ap { }; /* EC turns on w/ pp900_pll_en */ - pp900_pll: pp900-ap { + pp900_pll: regulator-pp900-ap { }; /* EC turns on w/ pp900_pmu_en */ - pp900_pmu: pp900-ap { + pp900_pmu: regulator-pp900-ap { }; /* EC turns on w/ pp1800_s0_en_l */ - pp1800_ap_io: pp1800_emmc: pp1800_nfc: pp1800_s0: pp1800 { + pp1800_ap_io: pp1800_emmc: pp1800_nfc: pp1800_s0: regulator-pp1800 { }; /* EC turns on w/ pp1800_avdd_en_l */ - pp1800_avdd: pp1800 { + pp1800_avdd: regulator-pp1800 { }; /* EC turns on w/ pp1800_lid_en_l */ - pp1800_lid: pp1800_mic: pp1800 { + pp1800_lid: pp1800_mic: regulator-pp1800 { }; /* EC turns on w/ lpddr_pwr_en */ - pp1800_lpddr: pp1800 { + pp1800_lpddr: regulator-pp1800 { }; /* EC turns on w/ pp1800_pmu_en_l */ - pp1800_pmu: pp1800 { + pp1800_pmu: regulator-pp1800 { }; /* EC turns on w/ pp1800_usb_en_l */ - pp1800_usb: pp1800 { + pp1800_usb: regulator-pp1800 { }; pp3000_sd_slot: regulator-pp3000-sd-slot { @@ -259,11 +259,11 @@ ppvar_sd_card_io: ppvar-sd-card-io { }; /* EC turns on w/ pp3300_trackpad_en_l */ - pp3300_trackpad: pp3300-trackpad { + pp3300_trackpad: regulator-pp3300-trackpad { }; /* EC turns on w/ usb_a_en */ - pp5000_usb_a_vbus: pp5000 { + pp5000_usb_a_vbus: regulator-pp5000 { }; ap_rtc_clk: ap-rtc-clk { -- GitLab From a1d939055a22be06d8c12bf53afb258b9d38575f Mon Sep 17 00:00:00 2001 From: Andy Yan Date: Mon, 13 Jan 2025 18:47:34 +0800 Subject: [PATCH 130/989] arm64: dts: rockchip: Fix lcdpwr_en pin for Cool Pi GenBook According to the schematic, the lcdpwr_en pin is GPIO0_C4, not GPIO1_C4. Fixes: 4a8c1161b843 ("arm64: dts: rockchip: Add support for rk3588 based Cool Pi CM5 GenBook") Signed-off-by: Andy Yan Link: https://lore.kernel.org/r/20250113104825.2390427-1-andyshrk@163.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5-genbook.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5-genbook.dts b/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5-genbook.dts index 92f0ed83c9902..bc6b43a771537 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5-genbook.dts +++ b/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5-genbook.dts @@ -113,7 +113,7 @@ vcc3v3_lcd: regulator-vcc3v3-lcd { compatible = "regulator-fixed"; regulator-name = "vcc3v3_lcd"; enable-active-high; - gpio = <&gpio1 RK_PC4 GPIO_ACTIVE_HIGH>; + gpio = <&gpio0 RK_PC4 GPIO_ACTIVE_HIGH>; pinctrl-names = "default"; pinctrl-0 = <&lcdpwr_en>; vin-supply = <&vcc3v3_sys>; @@ -241,7 +241,7 @@ &pcie3x4 { &pinctrl { lcd { lcdpwr_en: lcdpwr-en { - rockchip,pins = <1 RK_PC4 RK_FUNC_GPIO &pcfg_pull_down>; + rockchip,pins = <0 RK_PC4 RK_FUNC_GPIO &pcfg_pull_down>; }; bl_en: bl-en { -- GitLab From 48e487b002891eb0aeaec704c9bed51f028deff1 Mon Sep 17 00:00:00 2001 From: Stuart Hayhurst Date: Tue, 21 Jan 2025 20:00:07 +0000 Subject: [PATCH 131/989] HID: corsair-void: Add missing delayed work cancel for headset status The cancel_delayed_work_sync() call was missed, causing a use-after-free in corsair_void_remove(). 
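The shape of the problem, as a sketch (the work item names mirror the driver,
the rest is simplified and hypothetical):

	static void example_remove(struct hid_device *hid_dev)
	{
		struct corsair_void_drvdata *drvdata = hid_get_drvdata(hid_dev);

		/*
		 * Both delayed work items dereference drvdata.  Without the
		 * status-work cancel below, that handler can still run after
		 * remove() has finished and the driver data has been freed,
		 * i.e. a use-after-free.
		 */
		cancel_delayed_work_sync(&drvdata->delayed_status_work);
		cancel_delayed_work_sync(&drvdata->delayed_firmware_work);
	}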
Reported-by: yan kang Reported-by: yue sun Closes: https://lore.kernel.org/all/SY8P300MB042106286A2536707D2FB736A1E42@SY8P300MB0421.AUSP300.PROD.OUTLOOK.COM/ Closes: https://lore.kernel.org/all/SY8P300MB0421872E0AE934C9616FA61EA1E42@SY8P300MB0421.AUSP300.PROD.OUTLOOK.COM/ Fixes: 6ea2a6fd3872 ("HID: corsair-void: Add Corsair Void headset family driver") Cc: stable@vger.kernel.org Signed-off-by: Stuart Hayhurst Signed-off-by: Jiri Kosina --- drivers/hid/hid-corsair-void.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hid/hid-corsair-void.c b/drivers/hid/hid-corsair-void.c index 6ece56b850fc0..bd8f3d849b58d 100644 --- a/drivers/hid/hid-corsair-void.c +++ b/drivers/hid/hid-corsair-void.c @@ -726,6 +726,7 @@ static void corsair_void_remove(struct hid_device *hid_dev) if (drvdata->battery) power_supply_unregister(drvdata->battery); + cancel_delayed_work_sync(&drvdata->delayed_status_work); cancel_delayed_work_sync(&drvdata->delayed_firmware_work); sysfs_remove_group(&hid_dev->dev.kobj, &corsair_void_attr_group); } -- GitLab From c098363828f7006ef5c5121b673bc5e26571e6c8 Mon Sep 17 00:00:00 2001 From: Stuart Hayhurst Date: Tue, 21 Jan 2025 20:00:08 +0000 Subject: [PATCH 132/989] HID: corsair-void: Initialise memory for psy_cfg power_supply_config psy_cfg was missing its initialiser, add it in. Fixes: 6ea2a6fd3872 ("HID: corsair-void: Add Corsair Void headset family driver") Cc: stable@vger.kernel.org Signed-off-by: Stuart Hayhurst Signed-off-by: Jiri Kosina --- drivers/hid/hid-corsair-void.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/hid-corsair-void.c b/drivers/hid/hid-corsair-void.c index bd8f3d849b58d..56e858066c3c3 100644 --- a/drivers/hid/hid-corsair-void.c +++ b/drivers/hid/hid-corsair-void.c @@ -553,7 +553,7 @@ static void corsair_void_battery_remove_work_handler(struct work_struct *work) static void corsair_void_battery_add_work_handler(struct work_struct *work) { struct corsair_void_drvdata *drvdata; - struct power_supply_config psy_cfg; + struct power_supply_config psy_cfg = {}; struct power_supply *new_supply; drvdata = container_of(work, struct corsair_void_drvdata, -- GitLab From 4b54ae69197b9f416baa0fceadff7e89075f8454 Mon Sep 17 00:00:00 2001 From: Zhang Lixu Date: Wed, 22 Jan 2025 09:29:00 +0800 Subject: [PATCH 133/989] HID: intel-ish-hid: fix the length of MNG_SYNC_FW_CLOCK in doorbell The timestamps in the Firmware log and HID sensor samples are incorrect. They show 1970-01-01 because the current IPC driver only uses the first 8 bytes of bootup time when synchronizing time with the firmware. The firmware converts the bootup time to UTC time, which results in the display of 1970-01-01. In write_ipc_from_queue(), when sending the MNG_SYNC_FW_CLOCK message, the clock is updated according to the definition of ipc_time_update_msg. However, in _ish_sync_fw_clock(), the message length is specified as the size of uint64_t when building the doorbell. As a result, the firmware only receives the first 8 bytes of struct ipc_time_update_msg. This patch corrects the length in the doorbell to ensure the entire ipc_time_update_msg is sent, fixing the timestamp issue. 
Signed-off-by: Zhang Lixu Acked-by: Srinivas Pandruvada Signed-off-by: Jiri Kosina --- drivers/hid/intel-ish-hid/ipc/ipc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/hid/intel-ish-hid/ipc/ipc.c b/drivers/hid/intel-ish-hid/ipc/ipc.c index 3cd53fc80634a..cb956a8c386cb 100644 --- a/drivers/hid/intel-ish-hid/ipc/ipc.c +++ b/drivers/hid/intel-ish-hid/ipc/ipc.c @@ -578,14 +578,14 @@ static void fw_reset_work_fn(struct work_struct *work) static void _ish_sync_fw_clock(struct ishtp_device *dev) { static unsigned long prev_sync; - uint64_t usec; + struct ipc_time_update_msg time = {}; if (prev_sync && time_before(jiffies, prev_sync + 20 * HZ)) return; prev_sync = jiffies; - usec = ktime_to_us(ktime_get_boottime()); - ipc_send_mng_msg(dev, MNG_SYNC_FW_CLOCK, &usec, sizeof(uint64_t)); + /* The fields of time would be updated while sending message */ + ipc_send_mng_msg(dev, MNG_SYNC_FW_CLOCK, &time, sizeof(time)); } /** -- GitLab From 7e0d1cff12b895f44f4ddc8cf50311bc1f775201 Mon Sep 17 00:00:00 2001 From: Zhang Lixu Date: Wed, 22 Jan 2025 09:29:01 +0800 Subject: [PATCH 134/989] HID: intel-ish-hid: Send clock sync message immediately after reset The ISH driver performs a clock sync with the firmware once at system startup and then every 20 seconds. If a firmware reset occurs right after a clock sync, the driver would wait 20 seconds before performing another clock sync with the firmware. This is particularly problematic with the introduction of the "load firmware from host" feature, where the driver performs a clock sync with the bootloader and then has to wait 20 seconds before syncing with the main firmware. This patch clears prev_sync immediately upon receiving an IPC reset, so that the main firmware and driver will perform a clock sync immediately after completing the IPC handshake. Signed-off-by: Zhang Lixu Acked-by: Srinivas Pandruvada Signed-off-by: Jiri Kosina --- drivers/hid/intel-ish-hid/ipc/ipc.c | 9 ++++++--- drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h | 2 ++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/hid/intel-ish-hid/ipc/ipc.c b/drivers/hid/intel-ish-hid/ipc/ipc.c index cb956a8c386cb..4c861119e97aa 100644 --- a/drivers/hid/intel-ish-hid/ipc/ipc.c +++ b/drivers/hid/intel-ish-hid/ipc/ipc.c @@ -517,6 +517,10 @@ static int ish_fw_reset_handler(struct ishtp_device *dev) /* ISH FW is dead */ if (!ish_is_input_ready(dev)) return -EPIPE; + + /* Send clock sync at once after reset */ + ishtp_dev->prev_sync = 0; + /* * Set HOST2ISH.ILUP. 
Apparently we need this BEFORE sending * RESET_NOTIFY_ACK - FW will be checking for it @@ -577,13 +581,12 @@ static void fw_reset_work_fn(struct work_struct *work) */ static void _ish_sync_fw_clock(struct ishtp_device *dev) { - static unsigned long prev_sync; struct ipc_time_update_msg time = {}; - if (prev_sync && time_before(jiffies, prev_sync + 20 * HZ)) + if (dev->prev_sync && time_before(jiffies, dev->prev_sync + 20 * HZ)) return; - prev_sync = jiffies; + dev->prev_sync = jiffies; /* The fields of time would be updated while sending message */ ipc_send_mng_msg(dev, MNG_SYNC_FW_CLOCK, &time, sizeof(time)); } diff --git a/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h b/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h index 44eddc411e97c..ec9f6e87aaf23 100644 --- a/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h +++ b/drivers/hid/intel-ish-hid/ishtp/ishtp-dev.h @@ -253,6 +253,8 @@ struct ishtp_device { unsigned int ipc_tx_cnt; unsigned long long ipc_tx_bytes_cnt; + /* Time of the last clock sync */ + unsigned long prev_sync; const struct ishtp_hw_ops *ops; size_t mtu; uint32_t ishtp_msg_hdr; -- GitLab From 52572cde8b4a44676557ccb67b035291833112c5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 22 Jan 2025 07:50:57 +0100 Subject: [PATCH 135/989] HID: lenovo: select CONFIG_ACPI_PLATFORM_PROFILE A previous patch tried to fix this link failure: x86_64-linux-ld: drivers/hid/hid-lenovo.o: in function `lenovo_raw_event': hid-lenovo.c:(.text+0x22c): undefined reference to `platform_profile_cycle' but got it wrong in three ways: - the link failure still exists with CONFIG_ACPI_PLATFORM_PROFILE=m when hid-lenovo is built-in - There is no way to manually enable CONFIG_ACPI_PLATFORM_PROFILE, as it is intended to be selected by its users. Remove the broken #if check again and instead select the symbol like the other users do. This requires adding a dependency on CONFIG_ACPI. 
Fixes: 52e7d1f7c2fd ("HID: lenovo: Fix undefined platform_profile_cycle in ThinkPad X12 keyboard patch") Signed-off-by: Arnd Bergmann Signed-off-by: Jiri Kosina --- drivers/hid/Kconfig | 2 ++ drivers/hid/hid-lenovo.c | 7 +------ 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index b53eb569bd495..8adb745c5b28c 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -570,6 +570,8 @@ config HID_LED config HID_LENOVO tristate "Lenovo / Thinkpad devices" + depends on ACPI + select ACPI_PLATFORM_PROFILE select NEW_LEDS select LEDS_CLASS help diff --git a/drivers/hid/hid-lenovo.c b/drivers/hid/hid-lenovo.c index 4d00bc4d656e6..a7d9ca02779ea 100644 --- a/drivers/hid/hid-lenovo.c +++ b/drivers/hid/hid-lenovo.c @@ -32,9 +32,7 @@ #include #include -#if IS_ENABLED(CONFIG_ACPI_PLATFORM_PROFILE) #include -#endif /* CONFIG_ACPI_PLATFORM_PROFILE */ #include "hid-ids.h" @@ -730,13 +728,10 @@ static int lenovo_raw_event_TP_X12_tab(struct hid_device *hdev, u32 raw_data) if (hdev->product == USB_DEVICE_ID_LENOVO_X12_TAB) { report_key_event(input, KEY_RFKILL); return 1; - } -#if IS_ENABLED(CONFIG_ACPI_PLATFORM_PROFILE) - else { + } else { platform_profile_cycle(); return 1; } -#endif /* CONFIG_ACPI_PLATFORM_PROFILE */ return 0; case TP_X12_RAW_HOTKEY_FN_F10: /* TAB1 has PICKUP Phone and TAB2 use Snipping tool*/ -- GitLab From a5a056c8d2ba60017dffb914bdf92c5562defc48 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Jan 2025 14:48:12 +0100 Subject: [PATCH 136/989] HID: intel-thc: fix CONFIG_HID dependency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In drivers/hid/, most drivers depend on CONFIG_HID, while a couple of the drivers in subdirectories instead depend on CONFIG_HID_SUPPORT and use 'select HID'. With the newly added INTEL_THC_HID, this causes a build warning for a circular dependency: WARNING: unmet direct dependencies detected for HID Depends on [m]: HID_SUPPORT [=y] && INPUT [=m] Selected by [y]: - INTEL_THC_HID [=y] && HID_SUPPORT [=y] && X86_64 [=y] && PCI [=y] && ACPI [=y] WARNING: unmet direct dependencies detected for INPUT_FF_MEMLESS Depends on [m]: INPUT [=m] Selected by [y]: - HID_MICROSOFT [=y] && HID_SUPPORT [=y] && HID [=y] - GREENASIA_FF [=y] && HID_SUPPORT [=y] && HID [=y] && HID_GREENASIA [=y] - HID_WIIMOTE [=y] && HID_SUPPORT [=y] && HID [=y] && LEDS_CLASS [=y] - ZEROPLUS_FF [=y] && HID_SUPPORT [=y] && HID [=y] && HID_ZEROPLUS [=y] Selected by [m]: - HID_ACRUX_FF [=y] && HID_SUPPORT [=y] && HID [=y] && HID_ACRUX [=m] - HID_EMS_FF [=m] && HID_SUPPORT [=y] && HID [=y] - HID_GOOGLE_STADIA_FF [=m] && HID_SUPPORT [=y] && HID [=y] - PANTHERLORD_FF [=y] && HID_SUPPORT [=y] && HID [=y] && HID_PANTHERLORD [=m] It's better to be consistent and always use 'depends on HID' for HID drivers. The notable exception here is USB_KBD/USB_MOUSE, which are alternative implementations that do not depend on the HID subsystem. Do this by extending the "if HID" section below, which means that a few of the duplicate "depends on HID" and "depends on INPUT" statements can be removed in the process. 
Fixes: 1b2d05384c29 ("HID: intel-thc-hid: Add basic THC driver skeleton") Signed-off-by: Arnd Bergmann Reviewed-by: Ilpo Järvinen Reviewed-by: Maximilian Luz Reviewed-by: Even Xu Signed-off-by: Jiri Kosina --- drivers/hid/Kconfig | 10 ++++++---- drivers/hid/amd-sfh-hid/Kconfig | 1 - drivers/hid/i2c-hid/Kconfig | 2 +- drivers/hid/intel-ish-hid/Kconfig | 1 - drivers/hid/intel-thc-hid/Kconfig | 1 - drivers/hid/surface-hid/Kconfig | 2 -- drivers/hid/usbhid/Kconfig | 3 +-- net/bluetooth/hidp/Kconfig | 3 +-- 8 files changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index 8adb745c5b28c..ed657ef7281c8 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -1376,10 +1376,6 @@ endmenu source "drivers/hid/bpf/Kconfig" -endif # HID - -source "drivers/hid/usbhid/Kconfig" - source "drivers/hid/i2c-hid/Kconfig" source "drivers/hid/intel-ish-hid/Kconfig" @@ -1390,4 +1386,10 @@ source "drivers/hid/surface-hid/Kconfig" source "drivers/hid/intel-thc-hid/Kconfig" +endif # HID + +# USB support may be used with HID disabled + +source "drivers/hid/usbhid/Kconfig" + endif # HID_SUPPORT diff --git a/drivers/hid/amd-sfh-hid/Kconfig b/drivers/hid/amd-sfh-hid/Kconfig index 329de5e12c1a0..3291786a5ee6a 100644 --- a/drivers/hid/amd-sfh-hid/Kconfig +++ b/drivers/hid/amd-sfh-hid/Kconfig @@ -5,7 +5,6 @@ menu "AMD SFH HID Support" config AMD_SFH_HID tristate "AMD Sensor Fusion Hub" - depends on HID depends on X86 help If you say yes to this option, support will be included for the diff --git a/drivers/hid/i2c-hid/Kconfig b/drivers/hid/i2c-hid/Kconfig index ef7c595c9403c..e8d51f410cc12 100644 --- a/drivers/hid/i2c-hid/Kconfig +++ b/drivers/hid/i2c-hid/Kconfig @@ -2,7 +2,7 @@ menuconfig I2C_HID tristate "I2C HID support" default y - depends on I2C && INPUT && HID + depends on I2C if I2C_HID diff --git a/drivers/hid/intel-ish-hid/Kconfig b/drivers/hid/intel-ish-hid/Kconfig index 253dc10d35ef2..568c8688784e7 100644 --- a/drivers/hid/intel-ish-hid/Kconfig +++ b/drivers/hid/intel-ish-hid/Kconfig @@ -6,7 +6,6 @@ config INTEL_ISH_HID tristate "Intel Integrated Sensor Hub" default n depends on X86 - depends on HID help The Integrated Sensor Hub (ISH) enables the ability to offload sensor polling and algorithm processing to a dedicated low power diff --git a/drivers/hid/intel-thc-hid/Kconfig b/drivers/hid/intel-thc-hid/Kconfig index 91ec84902db8f..0351d11376072 100644 --- a/drivers/hid/intel-thc-hid/Kconfig +++ b/drivers/hid/intel-thc-hid/Kconfig @@ -7,7 +7,6 @@ menu "Intel THC HID Support" config INTEL_THC_HID tristate "Intel Touch Host Controller" depends on ACPI - select HID help THC (Touch Host Controller) is the name of the IP block in PCH that interfaces with Touch Devices (ex: touchscreen, touchpad etc.). 
It diff --git a/drivers/hid/surface-hid/Kconfig b/drivers/hid/surface-hid/Kconfig index 7ce9b5d641eb7..d0cfd0d299263 100644 --- a/drivers/hid/surface-hid/Kconfig +++ b/drivers/hid/surface-hid/Kconfig @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0+ menu "Surface System Aggregator Module HID support" depends on SURFACE_AGGREGATOR - depends on INPUT config SURFACE_HID tristate "HID transport driver for Surface System Aggregator Module" @@ -39,4 +38,3 @@ endmenu config SURFACE_HID_CORE tristate - select HID diff --git a/drivers/hid/usbhid/Kconfig b/drivers/hid/usbhid/Kconfig index 7c2032f7f44de..f3194767a45eb 100644 --- a/drivers/hid/usbhid/Kconfig +++ b/drivers/hid/usbhid/Kconfig @@ -5,8 +5,7 @@ menu "USB HID support" config USB_HID tristate "USB HID transport layer" default y - depends on USB && INPUT - select HID + depends on HID help Say Y here if you want to connect USB keyboards, mice, joysticks, graphic tablets, or any other HID based devices diff --git a/net/bluetooth/hidp/Kconfig b/net/bluetooth/hidp/Kconfig index 6746be07e2220..e08aae35351a7 100644 --- a/net/bluetooth/hidp/Kconfig +++ b/net/bluetooth/hidp/Kconfig @@ -1,8 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config BT_HIDP tristate "HIDP protocol support" - depends on BT_BREDR && INPUT && HID_SUPPORT - select HID + depends on BT_BREDR && HID help HIDP (Human Interface Device Protocol) is a transport layer for HID reports. HIDP is required for the Bluetooth Human -- GitLab From e0efe83ed325277bb70f9435d4d9fc70bebdcca8 Mon Sep 17 00:00:00 2001 From: Lenny Szubowicz Date: Thu, 30 Jan 2025 16:57:54 -0500 Subject: [PATCH 137/989] tg3: Disable tg3 PCIe AER on system reboot Disable PCIe AER on the tg3 device on system reboot on a limited list of Dell PowerEdge systems. This prevents a fatal PCIe AER event on the tg3 device during the ACPI _PTS (prepare to sleep) method for S5 on those systems. The _PTS is invoked by acpi_enter_sleep_state_prep() as part of the kernel's reboot sequence as a result of commit 38f34dba806a ("PM: ACPI: reboot: Reinstate S5 for reboot"). There was an earlier fix for this problem by commit 2ca1c94ce0b6 ("tg3: Disable tg3 device on system reboot to avoid triggering AER"). But it was discovered that this earlier fix caused a reboot hang when some Dell PowerEdge servers were booted via ipxe. To address this reboot hang, the earlier fix was essentially reverted by commit 9fc3bc764334 ("tg3: power down device only on SYSTEM_POWER_OFF"). This re-exposed the tg3 PCIe AER on reboot problem. This fix is not an ideal solution because the root cause of the AER is in system firmware. Instead, it's a targeted work-around in the tg3 driver. Note also that the PCIe AER must be disabled on the tg3 device even if the system is configured to use "firmware first" error handling. V3: - Fix sparse warning on improper comparison of pdev->current_state - Adhere to netdev comment style Fixes: 9fc3bc764334 ("tg3: power down device only on SYSTEM_POWER_OFF") Signed-off-by: Lenny Szubowicz Reviewed-by: Pavan Chebbi Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/tg3.c | 58 +++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 1c94bf1db7186..d9d675f1ebfe9 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include @@ -18212,6 +18213,50 @@ static int tg3_resume(struct device *device) static SIMPLE_DEV_PM_OPS(tg3_pm_ops, tg3_suspend, tg3_resume); +/* Systems where ACPI _PTS (Prepare To Sleep) S5 will result in a fatal + * PCIe AER event on the tg3 device if the tg3 device is not, or cannot + * be, powered down. + */ +static const struct dmi_system_id tg3_restart_aer_quirk_table[] = { + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R440"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R540"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R640"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R650"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R740"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R750"), + }, + }, + {} +}; + static void tg3_shutdown(struct pci_dev *pdev) { struct net_device *dev = pci_get_drvdata(pdev); @@ -18228,6 +18273,19 @@ static void tg3_shutdown(struct pci_dev *pdev) if (system_state == SYSTEM_POWER_OFF) tg3_power_down(tp); + else if (system_state == SYSTEM_RESTART && + dmi_first_match(tg3_restart_aer_quirk_table) && + pdev->current_state != PCI_D3cold && + pdev->current_state != PCI_UNKNOWN) { + /* Disable PCIe AER on the tg3 to avoid a fatal + * error during this system restart. + */ + pcie_capability_clear_word(pdev, PCI_EXP_DEVCTL, + PCI_EXP_DEVCTL_CERE | + PCI_EXP_DEVCTL_NFERE | + PCI_EXP_DEVCTL_FERE | + PCI_EXP_DEVCTL_URRE); + } rtnl_unlock(); -- GitLab From 235174b2bed88501fda689c113c55737f99332d8 Mon Sep 17 00:00:00 2001 From: Yan Zhai Date: Fri, 31 Jan 2025 00:31:39 -0800 Subject: [PATCH 138/989] udp: gso: do not drop small packets when PMTU reduces Commit 4094871db1d6 ("udp: only do GSO if # of segs > 1") avoided GSO for small packets. But the kernel currently dismisses GSO requests only after checking MTU/PMTU on gso_size. This means any packets, regardless of their payload sizes, could be dropped when PMTU becomes smaller than requested gso_size. We encountered this issue in production and it caused a reliability problem that new QUIC connection cannot be established before PMTU cache expired, while non GSO sockets still worked fine at the same time. Ideally, do not check any GSO related constraints when payload size is smaller than requested gso_size, and return EMSGSIZE instead of EINVAL on MTU/PMTU check failure to be more specific on the error cause. Fixes: 4094871db1d6 ("udp: only do GSO if # of segs > 1") Signed-off-by: Yan Zhai Suggested-by: Willem de Bruijn Reviewed-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- net/ipv4/udp.c | 4 ++-- net/ipv6/udp.c | 4 ++-- tools/testing/selftests/net/udpgso.c | 26 ++++++++++++++++++++++++++ 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c472c9a57cf68..a9bb9ce5438ea 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1141,9 +1141,9 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, const int hlen = skb_network_header_len(skb) + sizeof(struct udphdr); - if (hlen + cork->gso_size > cork->fragsize) { + if (hlen + min(datalen, cork->gso_size) > cork->fragsize) { kfree_skb(skb); - return -EINVAL; + return -EMSGSIZE; } if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) { kfree_skb(skb); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 6671daa67f4fa..c6ea438b5c758 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1389,9 +1389,9 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, const int hlen = skb_network_header_len(skb) + sizeof(struct udphdr); - if (hlen + cork->gso_size > cork->fragsize) { + if (hlen + min(datalen, cork->gso_size) > cork->fragsize) { kfree_skb(skb); - return -EINVAL; + return -EMSGSIZE; } if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) { kfree_skb(skb); diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c index 3f2fca02fec53..36ff28af4b190 100644 --- a/tools/testing/selftests/net/udpgso.c +++ b/tools/testing/selftests/net/udpgso.c @@ -102,6 +102,19 @@ struct testcase testcases_v4[] = { .gso_len = CONST_MSS_V4, .r_num_mss = 1, }, + { + /* datalen <= MSS < gso_len: will fall back to no GSO */ + .tlen = CONST_MSS_V4, + .gso_len = CONST_MSS_V4 + 1, + .r_num_mss = 0, + .r_len_last = CONST_MSS_V4, + }, + { + /* MSS < datalen < gso_len: fail */ + .tlen = CONST_MSS_V4 + 1, + .gso_len = CONST_MSS_V4 + 2, + .tfail = true, + }, { /* send a single MSS + 1B */ .tlen = CONST_MSS_V4 + 1, @@ -205,6 +218,19 @@ struct testcase testcases_v6[] = { .gso_len = CONST_MSS_V6, .r_num_mss = 1, }, + { + /* datalen <= MSS < gso_len: will fall back to no GSO */ + .tlen = CONST_MSS_V6, + .gso_len = CONST_MSS_V6 + 1, + .r_num_mss = 0, + .r_len_last = CONST_MSS_V6, + }, + { + /* MSS < datalen < gso_len: fail */ + .tlen = CONST_MSS_V6 + 1, + .gso_len = CONST_MSS_V6 + 2, + .tfail = true + }, { /* send a single MSS + 1B */ .tlen = CONST_MSS_V6 + 1, -- GitLab From 363236d709e75610b628c2a4337ccbe42e454b6d Mon Sep 17 00:00:00 2001 From: "Chia-Lin Kao (AceLan)" Date: Wed, 15 Jan 2025 15:00:20 +0800 Subject: [PATCH 139/989] HID: ignore non-functional sensor in HP 5MP Camera The HP 5MP Camera (USB ID 0408:5473) reports a HID sensor interface that is not actually implemented. Attempting to access this non-functional sensor via iio_info causes system hangs as runtime PM tries to wake up an unresponsive sensor. [453] hid-sensor-hub 0003:0408:5473.0003: Report latency attributes: ffffffff:ffffffff [453] hid-sensor-hub 0003:0408:5473.0003: common attributes: 5:1, 2:1, 3:1 ffffffff:ffffffff Add this device to the HID ignore list since the sensor interface is non-functional by design and should not be exposed to userspace. 
Signed-off-by: Chia-Lin Kao (AceLan) Acked-by: Srinivas Pandruvada Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 1 + drivers/hid/hid-quirks.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index c448de53bf91e..7debfe0c5cb98 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -1095,6 +1095,7 @@ #define USB_DEVICE_ID_QUANTA_OPTICAL_TOUCH_3001 0x3001 #define USB_DEVICE_ID_QUANTA_OPTICAL_TOUCH_3003 0x3003 #define USB_DEVICE_ID_QUANTA_OPTICAL_TOUCH_3008 0x3008 +#define USB_DEVICE_ID_QUANTA_HP_5MP_CAMERA_5473 0x5473 #define I2C_VENDOR_ID_RAYDIUM 0x2386 #define I2C_PRODUCT_ID_RAYDIUM_4B33 0x4b33 diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c index e0bbf0c6345d6..5d7a418ccdbec 100644 --- a/drivers/hid/hid-quirks.c +++ b/drivers/hid/hid-quirks.c @@ -891,6 +891,7 @@ static const struct hid_device_id hid_ignore_list[] = { { HID_USB_DEVICE(USB_VENDOR_ID_SYNAPTICS, USB_DEVICE_ID_SYNAPTICS_DPAD) }, #endif { HID_USB_DEVICE(USB_VENDOR_ID_YEALINK, USB_DEVICE_ID_YEALINK_P1K_P4K_B2K) }, + { HID_USB_DEVICE(USB_VENDOR_ID_QUANTA, USB_DEVICE_ID_QUANTA_HP_5MP_CAMERA_5473) }, { } }; -- GitLab From 05c4ede6951b5d8e083b6bb237950cac59bdeb92 Mon Sep 17 00:00:00 2001 From: Vicki Pfau Date: Wed, 15 Jan 2025 17:28:16 -0800 Subject: [PATCH 140/989] HID: hid-steam: Fix issues with disabling both gamepad mode and lizard mode When lizard mode is disabled, there were two issues: 1. Switching between gamepad mode and desktop mode still functioned, even though desktop mode did not. This lead to the ability to "break" gamepad mode by holding down the Options key even while lizard mode is disabled 2. If you were in desktop mode when lizard mode is disabled, you would immediately enter this faulty mode. This patch properly disables the ability to switch between gamepad mode and the faulty desktop mode by holding the Options key, as well as effectively removing the faulty mode by bypassing the early returns if lizard mode is disabled. Reported-by: Eugeny Shcheglov Signed-off-by: Vicki Pfau Signed-off-by: Jiri Kosina --- drivers/hid/hid-steam.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/hid/hid-steam.c b/drivers/hid/hid-steam.c index af38fc8eb34fd..b008fd0834b94 100644 --- a/drivers/hid/hid-steam.c +++ b/drivers/hid/hid-steam.c @@ -1050,10 +1050,10 @@ static void steam_mode_switch_cb(struct work_struct *work) struct steam_device, mode_switch); unsigned long flags; bool client_opened; - steam->gamepad_mode = !steam->gamepad_mode; if (!lizard_mode) return; + steam->gamepad_mode = !steam->gamepad_mode; if (steam->gamepad_mode) steam_set_lizard_mode(steam, false); else { @@ -1599,7 +1599,7 @@ static void steam_do_deck_input_event(struct steam_device *steam, schedule_delayed_work(&steam->mode_switch, 45 * HZ / 100); } - if (!steam->gamepad_mode) + if (!steam->gamepad_mode && lizard_mode) return; lpad_touched = b10 & BIT(3); @@ -1669,7 +1669,7 @@ static void steam_do_deck_sensors_event(struct steam_device *steam, */ steam->sensor_timestamp_us += 4000; - if (!steam->gamepad_mode) + if (!steam->gamepad_mode && lizard_mode) return; input_event(sensors, EV_MSC, MSC_TIMESTAMP, steam->sensor_timestamp_us); -- GitLab From 3fb3cb4350befc4f901c54e0cb4a2a47b1302e08 Mon Sep 17 00:00:00 2001 From: Andrey Vatoropin Date: Thu, 30 Jan 2025 09:00:34 +0000 Subject: [PATCH 141/989] power: supply: da9150-fg: fix potential overflow Size of variable sd_gain equals four bytes - DA9150_QIF_SD_GAIN_SIZE. 
Size of variable shunt_val equals two bytes - DA9150_QIF_SHUNT_VAL_SIZE. The expression sd_gain * shunt_val is currently being evaluated using 32-bit arithmetic. So during the multiplication an overflow may occur. As the value of type 'u64' is used as storage for the eventual result, put ULL variable at the first position of each expression in order to give the compiler complete information about the proper arithmetic to use. According to C99 the guaranteed width for a variable of type 'unsigned long long' >= 64 bits. Remove the explicit cast to u64 as it is meaningless. Just for the sake of consistency, perform the similar trick with another expression concerning 'iavg'. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: a419b4fd9138 ("power: Add support for DA9150 Fuel-Gauge") Signed-off-by: Andrey Vatoropin Link: https://lore.kernel.org/r/20250130090030.53422-1-a.vatoropin@crpt.ru Signed-off-by: Sebastian Reichel --- drivers/power/supply/da9150-fg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/power/supply/da9150-fg.c b/drivers/power/supply/da9150-fg.c index 652c1f213af1c..4f28ef1bba1a3 100644 --- a/drivers/power/supply/da9150-fg.c +++ b/drivers/power/supply/da9150-fg.c @@ -247,9 +247,9 @@ static int da9150_fg_current_avg(struct da9150_fg *fg, DA9150_QIF_SD_GAIN_SIZE); da9150_fg_read_sync_end(fg); - div = (u64) (sd_gain * shunt_val * 65536ULL); + div = 65536ULL * sd_gain * shunt_val; do_div(div, 1000000); - res = (u64) (iavg * 1000000ULL); + res = 1000000ULL * iavg; do_div(res, div); val->intval = (int) res; -- GitLab From 64dd6edfc421479e416301c48b79cece8d0351fc Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 30 Jan 2025 15:00:35 +0100 Subject: [PATCH 142/989] power: supply: core: Fix extension related lockdep warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 6037802bbae8 ("power: supply: core: implement extension API") there is the following ABBA deadlock (simplified) between the LED trigger code and the power-supply code: 1) When registering a power-supply class device, power_supply_register() calls led_trigger_register() from power_supply_create_triggers() in a scoped_guard(rwsem_read, &psy->extensions_sem) context. led_trigger_register() then in turn takes a LED subsystem lock. So here we have the following locking order: * Read-lock extensions_sem * Lock LED subsystem lock(s) 2) When registering a LED class device, with its default trigger set to a power-supply LED trigger (which has already been registered) The LED class code calls power_supply_led_trigger_activate() when setting up the default trigger. power_supply_led_trigger_activate() calls power_supply_get_property() to determine the initial value of to assign to the LED and that read-locks extensions_sem. So now we have the following locking order: * Lock LED subsystem lock(s) * Read-lock extensions_sem Fixing this is easy, there is no need to hold the extensions_sem when calling power_supply_create_triggers() since all triggers are always created rather then checking for the presence of certain attributes as power_supply_add_hwmon_sysfs() does. Move power_supply_create_triggers() out of the guard block to fix this. 
Here is the lockdep report fixed by this change: [ 31.249343] ====================================================== [ 31.249378] WARNING: possible circular locking dependency detected [ 31.249413] 6.13.0-rc6+ #251 Tainted: G C E [ 31.249440] ------------------------------------------------------ [ 31.249471] (udev-worker)/553 is trying to acquire lock: [ 31.249501] ffff892adbcaf660 (&psy->extensions_sem){.+.+}-{4:4}, at: power_supply_get_property.part.0+0x22/0x150 [ 31.249574] but task is already holding lock: [ 31.249603] ffff892adbc0bad0 (&led_cdev->trigger_lock){+.+.}-{4:4}, at: led_trigger_set_default+0x34/0xe0 [ 31.249657] which lock already depends on the new lock. [ 31.249696] the existing dependency chain (in reverse order) is: [ 31.249735] -> #2 (&led_cdev->trigger_lock){+.+.}-{4:4}: [ 31.249778] down_write+0x3b/0xd0 [ 31.249803] led_trigger_set_default+0x34/0xe0 [ 31.249833] led_classdev_register_ext+0x311/0x3a0 [ 31.249863] input_leds_connect+0x1dc/0x2a0 [ 31.249889] input_attach_handler.isra.0+0x75/0x90 [ 31.249921] input_register_device.cold+0xa1/0x150 [ 31.249955] hidinput_connect+0x8a2/0xb80 [ 31.249982] hid_connect+0x582/0x5c0 [ 31.250007] hid_hw_start+0x3f/0x60 [ 31.250030] hid_device_probe+0x122/0x1f0 [ 31.250053] really_probe+0xde/0x340 [ 31.250080] __driver_probe_device+0x78/0x110 [ 31.250105] driver_probe_device+0x1f/0xa0 [ 31.250132] __device_attach_driver+0x85/0x110 [ 31.250160] bus_for_each_drv+0x78/0xc0 [ 31.250184] __device_attach+0xb0/0x1b0 [ 31.250207] bus_probe_device+0x94/0xb0 [ 31.250230] device_add+0x64a/0x860 [ 31.250252] hid_add_device+0xe5/0x240 [ 31.250279] usbhid_probe+0x4dc/0x620 [ 31.250303] usb_probe_interface+0xe4/0x2a0 [ 31.250329] really_probe+0xde/0x340 [ 31.250353] __driver_probe_device+0x78/0x110 [ 31.250377] driver_probe_device+0x1f/0xa0 [ 31.250404] __device_attach_driver+0x85/0x110 [ 31.250431] bus_for_each_drv+0x78/0xc0 [ 31.250455] __device_attach+0xb0/0x1b0 [ 31.250478] bus_probe_device+0x94/0xb0 [ 31.250501] device_add+0x64a/0x860 [ 31.250523] usb_set_configuration+0x606/0x8a0 [ 31.250552] usb_generic_driver_probe+0x3e/0x60 [ 31.250579] usb_probe_device+0x3d/0x120 [ 31.250605] really_probe+0xde/0x340 [ 31.250629] __driver_probe_device+0x78/0x110 [ 31.250653] driver_probe_device+0x1f/0xa0 [ 31.250680] __device_attach_driver+0x85/0x110 [ 31.250707] bus_for_each_drv+0x78/0xc0 [ 31.250731] __device_attach+0xb0/0x1b0 [ 31.250753] bus_probe_device+0x94/0xb0 [ 31.250776] device_add+0x64a/0x860 [ 31.250798] usb_new_device.cold+0x141/0x38f [ 31.250828] hub_event+0x1166/0x1980 [ 31.250854] process_one_work+0x20f/0x580 [ 31.250879] worker_thread+0x1d1/0x3b0 [ 31.250904] kthread+0xee/0x120 [ 31.250926] ret_from_fork+0x30/0x50 [ 31.250954] ret_from_fork_asm+0x1a/0x30 [ 31.250982] -> #1 (triggers_list_lock){++++}-{4:4}: [ 31.251022] down_write+0x3b/0xd0 [ 31.251045] led_trigger_register+0x40/0x1b0 [ 31.251074] power_supply_register_led_trigger+0x88/0x150 [ 31.251107] power_supply_create_triggers+0x55/0xe0 [ 31.251135] __power_supply_register.part.0+0x34e/0x4a0 [ 31.251164] devm_power_supply_register+0x70/0xc0 [ 31.251190] bq27xxx_battery_setup+0x1a1/0x6d0 [bq27xxx_battery] [ 31.251235] bq27xxx_battery_i2c_probe+0xe5/0x17f [bq27xxx_battery_i2c] [ 31.251272] i2c_device_probe+0x125/0x2b0 [ 31.251299] really_probe+0xde/0x340 [ 31.251324] __driver_probe_device+0x78/0x110 [ 31.251348] driver_probe_device+0x1f/0xa0 [ 31.251375] __driver_attach+0xba/0x1c0 [ 31.251398] bus_for_each_dev+0x6b/0xb0 [ 31.251421] bus_add_driver+0x111/0x1f0 [ 31.251445] 
driver_register+0x6e/0xc0 [ 31.251470] i2c_register_driver+0x41/0xb0 [ 31.251498] do_one_initcall+0x5e/0x3a0 [ 31.251522] do_init_module+0x60/0x220 [ 31.251550] __do_sys_init_module+0x15f/0x190 [ 31.251575] do_syscall_64+0x93/0x180 [ 31.251598] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 31.251629] -> #0 (&psy->extensions_sem){.+.+}-{4:4}: [ 31.251668] __lock_acquire+0x13ce/0x21c0 [ 31.251694] lock_acquire+0xcf/0x2e0 [ 31.251719] down_read+0x3e/0x170 [ 31.251741] power_supply_get_property.part.0+0x22/0x150 [ 31.251774] power_supply_update_leds+0x8d/0x230 [ 31.251804] power_supply_led_trigger_activate+0x18/0x20 [ 31.251837] led_trigger_set+0x1fc/0x300 [ 31.251863] led_trigger_set_default+0x90/0xe0 [ 31.251892] led_classdev_register_ext+0x311/0x3a0 [ 31.251921] devm_led_classdev_multicolor_register_ext+0x6e/0xb80 [led_class_multicolor] [ 31.251969] ktd202x_probe+0x464/0x5c0 [leds_ktd202x] [ 31.252002] i2c_device_probe+0x125/0x2b0 [ 31.252027] really_probe+0xde/0x340 [ 31.252052] __driver_probe_device+0x78/0x110 [ 31.252076] driver_probe_device+0x1f/0xa0 [ 31.252103] __driver_attach+0xba/0x1c0 [ 31.252125] bus_for_each_dev+0x6b/0xb0 [ 31.252148] bus_add_driver+0x111/0x1f0 [ 31.252172] driver_register+0x6e/0xc0 [ 31.252197] i2c_register_driver+0x41/0xb0 [ 31.252225] do_one_initcall+0x5e/0x3a0 [ 31.252248] do_init_module+0x60/0x220 [ 31.252274] __do_sys_init_module+0x15f/0x190 [ 31.253986] do_syscall_64+0x93/0x180 [ 31.255826] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 31.257614] other info that might help us debug this: [ 31.257619] Chain exists of: &psy->extensions_sem --> triggers_list_lock --> &led_cdev->trigger_lock [ 31.257630] Possible unsafe locking scenario: [ 31.257632] CPU0 CPU1 [ 31.257633] ---- ---- [ 31.257634] lock(&led_cdev->trigger_lock); [ 31.257637] lock(triggers_list_lock); [ 31.257640] lock(&led_cdev->trigger_lock); [ 31.257643] rlock(&psy->extensions_sem); [ 31.257646] *** DEADLOCK *** [ 31.289433] 4 locks held by (udev-worker)/553: [ 31.289443] #0: ffff892ad9658108 (&dev->mutex){....}-{4:4}, at: __driver_attach+0xaf/0x1c0 [ 31.289463] #1: ffff892adbc0bbc8 (&led_cdev->led_access){+.+.}-{4:4}, at: led_classdev_register_ext+0x1c7/0x3a0 [ 31.289476] #2: ffffffffad0e30b0 (triggers_list_lock){++++}-{4:4}, at: led_trigger_set_default+0x2c/0xe0 [ 31.289487] #3: ffff892adbc0bad0 (&led_cdev->trigger_lock){+.+.}-{4:4}, at: led_trigger_set_default+0x34/0xe0 Fixes: 6037802bbae8 ("power: supply: core: implement extension API") Cc: Thomas Weißschuh Cc: Armin Wolf Signed-off-by: Hans de Goede Reviewed-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250130140035.20636-1-hdegoede@redhat.com Signed-off-by: Sebastian Reichel --- drivers/power/supply/power_supply_core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c index d0bb52a7a0367..76c340b38015a 100644 --- a/drivers/power/supply/power_supply_core.c +++ b/drivers/power/supply/power_supply_core.c @@ -1592,11 +1592,11 @@ __power_supply_register(struct device *parent, if (rc) goto register_thermal_failed; - scoped_guard(rwsem_read, &psy->extensions_sem) { - rc = power_supply_create_triggers(psy); - if (rc) - goto create_triggers_failed; + rc = power_supply_create_triggers(psy); + if (rc) + goto create_triggers_failed; + scoped_guard(rwsem_read, &psy->extensions_sem) { rc = power_supply_add_hwmon_sysfs(psy); if (rc) goto add_hwmon_sysfs_failed; -- GitLab From d97505baea64d93538b16baf14ce7b8c1fbad746 Mon Sep 17 00:00:00 2001 
From: Yishai Hadas Date: Sun, 19 Jan 2025 14:36:13 +0200 Subject: [PATCH 143/989] RDMA/mlx5: Fix the recovery flow of the UMR QP This patch addresses an issue in the recovery flow of the UMR QP, ensuring tasks do not get stuck, as highlighted by the call trace [1]. During recovery, before transitioning the QP to the RESET state, the software must wait for all outstanding WRs to complete. Failing to do so can cause the firmware to skip sending some flushed CQEs with errors and simply discard them upon the RESET, as per the IB specification. This race condition can result in lost CQEs and tasks becoming stuck. To resolve this, the patch sends a final WR which serves only as a barrier before moving the QP state to RESET. Once a CQE is received for that final WR, it guarantees that no outstanding WRs remain, making it safe to transition the QP to RESET and subsequently back to RTS, restoring proper functionality. Note: For the barrier WR, we simply reuse the failed and ready WR. Since the QP is in an error state, it will only receive IB_WC_WR_FLUSH_ERR. However, as it serves only as a barrier we don't care about its status. [1] INFO: task rdma_resource_l:1922 blocked for more than 120 seconds. Tainted: G W 6.12.0-rc7+ #1626 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:rdma_resource_l state:D stack:0 pid:1922 tgid:1922 ppid:1369 flags:0x00004004 Call Trace: __schedule+0x420/0xd30 schedule+0x47/0x130 schedule_timeout+0x280/0x300 ? mark_held_locks+0x48/0x80 ? lockdep_hardirqs_on_prepare+0xe5/0x1a0 wait_for_completion+0x75/0x130 mlx5r_umr_post_send_wait+0x3c2/0x5b0 [mlx5_ib] ? __pfx_mlx5r_umr_done+0x10/0x10 [mlx5_ib] mlx5r_umr_revoke_mr+0x93/0xc0 [mlx5_ib] __mlx5_ib_dereg_mr+0x299/0x520 [mlx5_ib] ? _raw_spin_unlock_irq+0x24/0x40 ? wait_for_completion+0xfe/0x130 ? rdma_restrack_put+0x63/0xe0 [ib_core] ib_dereg_mr_user+0x5f/0x120 [ib_core] ? lock_release+0xc6/0x280 destroy_hw_idr_uobject+0x1d/0x60 [ib_uverbs] uverbs_destroy_uobject+0x58/0x1d0 [ib_uverbs] uobj_destroy+0x3f/0x70 [ib_uverbs] ib_uverbs_cmd_verbs+0x3e4/0xbb0 [ib_uverbs] ? __pfx_uverbs_destroy_def_handler+0x10/0x10 [ib_uverbs] ? __lock_acquire+0x64e/0x2080 ? mark_held_locks+0x48/0x80 ? find_held_lock+0x2d/0xa0 ? lock_acquire+0xc1/0x2f0 ? ib_uverbs_ioctl+0xcb/0x170 [ib_uverbs] ? __fget_files+0xc3/0x1b0 ib_uverbs_ioctl+0xe7/0x170 [ib_uverbs] ? 
ib_uverbs_ioctl+0xcb/0x170 [ib_uverbs] __x64_sys_ioctl+0x1b0/0xa70 do_syscall_64+0x6b/0x140 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f99c918b17b RSP: 002b:00007ffc766d0468 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007ffc766d0578 RCX: 00007f99c918b17b RDX: 00007ffc766d0560 RSI: 00000000c0181b01 RDI: 0000000000000003 RBP: 00007ffc766d0540 R08: 00007f99c8f99010 R09: 000000000000bd7e R10: 00007f99c94c1c70 R11: 0000000000000246 R12: 00007ffc766d0530 R13: 000000000000001c R14: 0000000040246a80 R15: 0000000000000000 Fixes: 158e71bb69e3 ("RDMA/mlx5: Add a umr recovery flow") Signed-off-by: Yishai Hadas Reviewed-by: Michael Guralnik Link: https://patch.msgid.link/27b51b92ec42dfb09d8096fcbd51878f397ce6ec.1737290141.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/umr.c | 83 +++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 27 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c index 887fd6fa3ba93..793f3c5c4d012 100644 --- a/drivers/infiniband/hw/mlx5/umr.c +++ b/drivers/infiniband/hw/mlx5/umr.c @@ -231,30 +231,6 @@ void mlx5r_umr_cleanup(struct mlx5_ib_dev *dev) ib_dealloc_pd(dev->umrc.pd); } -static int mlx5r_umr_recover(struct mlx5_ib_dev *dev) -{ - struct umr_common *umrc = &dev->umrc; - struct ib_qp_attr attr; - int err; - - attr.qp_state = IB_QPS_RESET; - err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE); - if (err) { - mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); - goto err; - } - - err = mlx5r_umr_qp_rst2rts(dev, umrc->qp); - if (err) - goto err; - - umrc->state = MLX5_UMR_STATE_ACTIVE; - return 0; - -err: - umrc->state = MLX5_UMR_STATE_ERR; - return err; -} static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe, struct mlx5r_umr_wqe *wqe, bool with_data) @@ -302,6 +278,61 @@ static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe, return err; } +static int mlx5r_umr_recover(struct mlx5_ib_dev *dev, u32 mkey, + struct mlx5r_umr_context *umr_context, + struct mlx5r_umr_wqe *wqe, bool with_data) +{ + struct umr_common *umrc = &dev->umrc; + struct ib_qp_attr attr; + int err; + + mutex_lock(&umrc->lock); + /* Preventing any further WRs to be sent now */ + if (umrc->state != MLX5_UMR_STATE_RECOVER) { + mlx5_ib_warn(dev, "UMR recovery encountered an unexpected state=%d\n", + umrc->state); + umrc->state = MLX5_UMR_STATE_RECOVER; + } + mutex_unlock(&umrc->lock); + + /* Sending a final/barrier WR (the failed one) and wait for its completion. + * This will ensure that all the previous WRs got a completion before + * we set the QP state to RESET. + */ + err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context->cqe, wqe, + with_data); + if (err) { + mlx5_ib_warn(dev, "UMR recovery post send failed, err %d\n", err); + goto err; + } + + /* Since the QP is in an error state, it will only receive + * IB_WC_WR_FLUSH_ERR. However, as it serves only as a barrier + * we don't care about its status. 
+ */ + wait_for_completion(&umr_context->done); + + attr.qp_state = IB_QPS_RESET; + err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE); + if (err) { + mlx5_ib_warn(dev, "Couldn't modify UMR QP to RESET, err=%d\n", err); + goto err; + } + + err = mlx5r_umr_qp_rst2rts(dev, umrc->qp); + if (err) { + mlx5_ib_warn(dev, "Couldn't modify UMR QP to RTS, err=%d\n", err); + goto err; + } + + umrc->state = MLX5_UMR_STATE_ACTIVE; + return 0; + +err: + umrc->state = MLX5_UMR_STATE_ERR; + return err; +} + static void mlx5r_umr_done(struct ib_cq *cq, struct ib_wc *wc) { struct mlx5_ib_umr_context *context = @@ -366,9 +397,7 @@ static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey, mlx5_ib_warn(dev, "reg umr failed (%u). Trying to recover and resubmit the flushed WQEs, mkey = %u\n", umr_context.status, mkey); - mutex_lock(&umrc->lock); - err = mlx5r_umr_recover(dev); - mutex_unlock(&umrc->lock); + err = mlx5r_umr_recover(dev, mkey, &umr_context, wqe, with_data); if (err) mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n", err); -- GitLab From 12d044770e12c4205fa69535b4fa8a9981fea98f Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Sun, 19 Jan 2025 14:39:46 +0200 Subject: [PATCH 144/989] IB/mlx5: Set and get correct qp_num for a DCT QP When a DCT QP is created on an active lag, it's dctc.port is assigned in a round-robin way, which is from 1 to dev->lag_port. In this case when querying this QP, we may get qp_attr.port_num > 2. Fix this by setting qp->port when modifying a DCT QP, and read port_num from qp->port instead of dctc.port when querying it. Fixes: 7c4b1ab9f167 ("IB/mlx5: Add DCT RoCE LAG support") Signed-off-by: Mark Zhang Reviewed-by: Maher Sanalla Link: https://patch.msgid.link/94c76bf0adbea997f87ffa27674e0a7118ad92a9.1737290358.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/qp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index a43eba9d3572c..08d22db8dca91 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4579,6 +4579,8 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, set_id = mlx5_ib_get_counters_id(dev, attr->port_num - 1); MLX5_SET(dctc, dctc, counter_set_id, set_id); + + qp->port = attr->port_num; } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { struct mlx5_ib_modify_qp_resp resp = {}; u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {}; @@ -5074,7 +5076,7 @@ static int mlx5_ib_dct_query_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *mqp, } if (qp_attr_mask & IB_QP_PORT) - qp_attr->port_num = MLX5_GET(dctc, dctc, port); + qp_attr->port_num = mqp->port; if (qp_attr_mask & IB_QP_MIN_RNR_TIMER) qp_attr->min_rnr_timer = MLX5_GET(dctc, dctc, min_rnr_nak); if (qp_attr_mask & IB_QP_AV) { -- GitLab From 98380110bd48fbfd6a798ee11fffff893d36062c Mon Sep 17 00:00:00 2001 From: Chris Morgan Date: Fri, 31 Jan 2025 17:14:51 -0600 Subject: [PATCH 145/989] power: supply: axp20x_battery: Fix fault handling for AXP717 Correct the fault handling for the AXP717 by changing the i2c write from regmap_update_bits() to regmap_write_bits(). The update bits function does not work properly on a RW1C register where we must write a 1 back to an existing register to clear it. Additionally, as part of this testing I confirmed the behavior of errors reappearing, so remove comment about assumptions. 
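For illustration, a minimal sketch of the difference between the two helpers (the wrapper name axp717_ack_uvlo_fault() is invented for this sketch, and the AXP717_* defines are assumed to be the driver's existing ones):

#include <linux/regmap.h>

/*
 * Illustrative only: regmap_update_bits() does a read-modify-write and
 * skips the bus write when the computed value equals what it just read.
 * For an RW1C status bit that is already 1, nothing is ever written, so
 * the latched fault is never acknowledged.  regmap_write_bits() is the
 * "force" variant of the same operation and always issues the write,
 * which is what an RW1C register needs.
 */
static int axp717_ack_uvlo_fault(struct regmap *regmap)
{
	return regmap_write_bits(regmap, AXP717_PMU_FAULT,
				 AXP717_BATT_UVLO_2_5V,
				 AXP717_BATT_UVLO_2_5V);
}
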
Fixes: 6625767049c2 ("power: supply: axp20x_battery: add support for AXP717") Signed-off-by: Chris Morgan Reviewed-by: Chen-Yu Tsai Link: https://lore.kernel.org/r/20250131231455.153447-2-macroalpha82@gmail.com Signed-off-by: Sebastian Reichel --- drivers/power/supply/axp20x_battery.c | 31 +++++++++++++-------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/drivers/power/supply/axp20x_battery.c b/drivers/power/supply/axp20x_battery.c index fa27195f074e7..3c3158f31a484 100644 --- a/drivers/power/supply/axp20x_battery.c +++ b/drivers/power/supply/axp20x_battery.c @@ -466,10 +466,9 @@ static int axp717_battery_get_prop(struct power_supply *psy, /* * If a fault is detected it must also be cleared; if the - * condition persists it should reappear (This is an - * assumption, it's actually not documented). A restart was - * not sufficient to clear the bit in testing despite the - * register listed as POR. + * condition persists it should reappear. A restart was not + * sufficient to clear the bit in testing despite the register + * listed as POR. */ case POWER_SUPPLY_PROP_HEALTH: ret = regmap_read(axp20x_batt->regmap, AXP717_PMU_FAULT, @@ -480,26 +479,26 @@ static int axp717_battery_get_prop(struct power_supply *psy, switch (reg & AXP717_BATT_PMU_FAULT_MASK) { case AXP717_BATT_UVLO_2_5V: val->intval = POWER_SUPPLY_HEALTH_DEAD; - regmap_update_bits(axp20x_batt->regmap, - AXP717_PMU_FAULT, - AXP717_BATT_UVLO_2_5V, - AXP717_BATT_UVLO_2_5V); + regmap_write_bits(axp20x_batt->regmap, + AXP717_PMU_FAULT, + AXP717_BATT_UVLO_2_5V, + AXP717_BATT_UVLO_2_5V); return 0; case AXP717_BATT_OVER_TEMP: val->intval = POWER_SUPPLY_HEALTH_HOT; - regmap_update_bits(axp20x_batt->regmap, - AXP717_PMU_FAULT, - AXP717_BATT_OVER_TEMP, - AXP717_BATT_OVER_TEMP); + regmap_write_bits(axp20x_batt->regmap, + AXP717_PMU_FAULT, + AXP717_BATT_OVER_TEMP, + AXP717_BATT_OVER_TEMP); return 0; case AXP717_BATT_UNDER_TEMP: val->intval = POWER_SUPPLY_HEALTH_COLD; - regmap_update_bits(axp20x_batt->regmap, - AXP717_PMU_FAULT, - AXP717_BATT_UNDER_TEMP, - AXP717_BATT_UNDER_TEMP); + regmap_write_bits(axp20x_batt->regmap, + AXP717_PMU_FAULT, + AXP717_BATT_UNDER_TEMP, + AXP717_BATT_UNDER_TEMP); return 0; default: -- GitLab From ac5a41b472b4ef8bb37d7550796d059b377b4646 Mon Sep 17 00:00:00 2001 From: Josua Mayer Date: Mon, 27 Jan 2025 21:12:02 +0100 Subject: [PATCH 146/989] Revert "mmc: sdhci_am654: Add sdhci_am654_start_signal_voltage_switch" This reverts commit 941a7abd4666912b84ab209396fdb54b0dae685d. This commit uses presence of device-tree properties vmmc-supply and vqmmc-supply for deciding whether to enable a quirk affecting timing of clock and data. The intention was to address issues observed with eMMC and SD on AM62 platforms. This new quirk is however also enabled for AM64 breaking microSD access on the SolidRun HimmingBoard-T which is supported in-tree since v6.11, causing a regression. During boot microSD initialization now fails with the error below: [ 2.008520] mmc1: SDHCI controller on fa00000.mmc [fa00000.mmc] using ADMA 64-bit [ 2.115348] mmc1: error -110 whilst initialising SD card The heuristics for enabling the quirk are clearly not correct as they break at least one but potentially many existing boards. Revert the change and restore original behaviour until a more appropriate method of selecting the quirk is derived. 
Fixes: 941a7abd4666 ("mmc: sdhci_am654: Add sdhci_am654_start_signal_voltage_switch") Closes: https://lore.kernel.org/linux-mmc/a70fc9fc-186f-4165-a652-3de50733763a@solid-run.com/ Cc: stable@vger.kernel.org Signed-off-by: Josua Mayer Acked-by: Adrian Hunter Link: https://lore.kernel.org/r/20250127-am654-mmc-regression-v2-1-9bb39fb12810@solid-run.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci_am654.c | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/drivers/mmc/host/sdhci_am654.c b/drivers/mmc/host/sdhci_am654.c index b73f673db92bb..f75c31815ab00 100644 --- a/drivers/mmc/host/sdhci_am654.c +++ b/drivers/mmc/host/sdhci_am654.c @@ -155,7 +155,6 @@ struct sdhci_am654_data { u32 tuning_loop; #define SDHCI_AM654_QUIRK_FORCE_CDTEST BIT(0) -#define SDHCI_AM654_QUIRK_SUPPRESS_V1P8_ENA BIT(1) }; struct window { @@ -357,29 +356,6 @@ static void sdhci_j721e_4bit_set_clock(struct sdhci_host *host, sdhci_set_clock(host, clock); } -static int sdhci_am654_start_signal_voltage_switch(struct mmc_host *mmc, struct mmc_ios *ios) -{ - struct sdhci_host *host = mmc_priv(mmc); - struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); - struct sdhci_am654_data *sdhci_am654 = sdhci_pltfm_priv(pltfm_host); - int ret; - - if ((sdhci_am654->quirks & SDHCI_AM654_QUIRK_SUPPRESS_V1P8_ENA) && - ios->signal_voltage == MMC_SIGNAL_VOLTAGE_180) { - if (!IS_ERR(mmc->supply.vqmmc)) { - ret = mmc_regulator_set_vqmmc(mmc, ios); - if (ret < 0) { - pr_err("%s: Switching to 1.8V signalling voltage failed,\n", - mmc_hostname(mmc)); - return -EIO; - } - } - return 0; - } - - return sdhci_start_signal_voltage_switch(mmc, ios); -} - static u8 sdhci_am654_write_power_on(struct sdhci_host *host, u8 val, int reg) { writeb(val, host->ioaddr + reg); @@ -868,11 +844,6 @@ static int sdhci_am654_get_of_property(struct platform_device *pdev, if (device_property_read_bool(dev, "ti,fails-without-test-cd")) sdhci_am654->quirks |= SDHCI_AM654_QUIRK_FORCE_CDTEST; - /* Suppress v1p8 ena for eMMC and SD with vqmmc supply */ - if (!!of_parse_phandle(dev->of_node, "vmmc-supply", 0) == - !!of_parse_phandle(dev->of_node, "vqmmc-supply", 0)) - sdhci_am654->quirks |= SDHCI_AM654_QUIRK_SUPPRESS_V1P8_ENA; - sdhci_get_of_property(pdev); return 0; @@ -969,7 +940,6 @@ static int sdhci_am654_probe(struct platform_device *pdev) goto err_pltfm_free; } - host->mmc_host_ops.start_signal_voltage_switch = sdhci_am654_start_signal_voltage_switch; host->mmc_host_ops.execute_tuning = sdhci_am654_execute_tuning; pm_runtime_get_noresume(dev); -- GitLab From 3e68abf2b9cebe76c6cd4b1aca8e95cd671035a3 Mon Sep 17 00:00:00 2001 From: Andy-ld Lu Date: Thu, 23 Jan 2025 17:26:01 +0800 Subject: [PATCH 147/989] mmc: mtk-sd: Fix register settings for hs400(es) mode For hs400(es) mode, the 'hs400-ds-delay' is typically configured in the dts. However, some projects may only define 'mediatek,hs400-ds-dly3', which can lead to initialization failures in hs400es mode. CMD13 reported response crc error in the mmc_switch_status() just after switching to hs400es mode. [ 1.914038][ T82] mmc0: mmc_select_hs400es failed, error -84 [ 1.914954][ T82] mmc0: error -84 whilst initialising MMC card Currently, the hs400_ds_dly3 value is set within the tuning function. This means that the PAD_DS_DLY3 field is not configured before tuning process, which is the reason for the above-mentioned CMD13 response crc error. 
Move the PAD_DS_DLY3 field configuration into msdc_prepare_hs400_tuning(), and add a value check of hs400_ds_delay to prevent overwriting by zero when the 'hs400-ds-delay' is not set in the dts. In addition, since hs400(es) only tune the PAD_DS_DLY1, the PAD_DS_DLY2_SEL bit should be cleared to bypass it. Fixes: c4ac38c6539b ("mmc: mtk-sd: Add HS400 online tuning support") Signed-off-by: Andy-ld Lu Reviewed-by: AngeloGioacchino Del Regno Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250123092644.7359-1-andy-ld.lu@mediatek.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/mtk-sd.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index 4b6e913725260..345ea91629e0f 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -273,6 +273,7 @@ #define MSDC_PAD_TUNE_CMD2_SEL BIT(21) /* RW */ #define PAD_DS_TUNE_DLY_SEL BIT(0) /* RW */ +#define PAD_DS_TUNE_DLY2_SEL BIT(1) /* RW */ #define PAD_DS_TUNE_DLY1 GENMASK(6, 2) /* RW */ #define PAD_DS_TUNE_DLY2 GENMASK(11, 7) /* RW */ #define PAD_DS_TUNE_DLY3 GENMASK(16, 12) /* RW */ @@ -318,6 +319,7 @@ /* EMMC50_PAD_DS_TUNE mask */ #define PAD_DS_DLY_SEL BIT(16) /* RW */ +#define PAD_DS_DLY2_SEL BIT(15) /* RW */ #define PAD_DS_DLY1 GENMASK(14, 10) /* RW */ #define PAD_DS_DLY3 GENMASK(4, 0) /* RW */ @@ -2504,13 +2506,23 @@ static int msdc_execute_tuning(struct mmc_host *mmc, u32 opcode) static int msdc_prepare_hs400_tuning(struct mmc_host *mmc, struct mmc_ios *ios) { struct msdc_host *host = mmc_priv(mmc); + host->hs400_mode = true; - if (host->top_base) - writel(host->hs400_ds_delay, - host->top_base + EMMC50_PAD_DS_TUNE); - else - writel(host->hs400_ds_delay, host->base + PAD_DS_TUNE); + if (host->top_base) { + if (host->hs400_ds_dly3) + sdr_set_field(host->top_base + EMMC50_PAD_DS_TUNE, + PAD_DS_DLY3, host->hs400_ds_dly3); + if (host->hs400_ds_delay) + writel(host->hs400_ds_delay, + host->top_base + EMMC50_PAD_DS_TUNE); + } else { + if (host->hs400_ds_dly3) + sdr_set_field(host->base + PAD_DS_TUNE, + PAD_DS_TUNE_DLY3, host->hs400_ds_dly3); + if (host->hs400_ds_delay) + writel(host->hs400_ds_delay, host->base + PAD_DS_TUNE); + } /* hs400 mode must set it to 0 */ sdr_clr_bits(host->base + MSDC_PATCH_BIT2, MSDC_PATCH_BIT2_CFGCRCSTS); /* to improve read performance, set outstanding to 2 */ @@ -2530,14 +2542,11 @@ static int msdc_execute_hs400_tuning(struct mmc_host *mmc, struct mmc_card *card if (host->top_base) { sdr_set_bits(host->top_base + EMMC50_PAD_DS_TUNE, PAD_DS_DLY_SEL); - if (host->hs400_ds_dly3) - sdr_set_field(host->top_base + EMMC50_PAD_DS_TUNE, - PAD_DS_DLY3, host->hs400_ds_dly3); + sdr_clr_bits(host->top_base + EMMC50_PAD_DS_TUNE, + PAD_DS_DLY2_SEL); } else { sdr_set_bits(host->base + PAD_DS_TUNE, PAD_DS_TUNE_DLY_SEL); - if (host->hs400_ds_dly3) - sdr_set_field(host->base + PAD_DS_TUNE, - PAD_DS_TUNE_DLY3, host->hs400_ds_dly3); + sdr_clr_bits(host->base + PAD_DS_TUNE, PAD_DS_TUNE_DLY2_SEL); } host->hs400_tuning = true; -- GitLab From 6f36f103cff1737094f2187b1f9a7b312820d377 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 3 Feb 2025 15:10:27 +0200 Subject: [PATCH 148/989] pinctrl: cy8c95x0: Fix off-by-one in the regmap range settings The range_max is inclusive, so we need to use the number of the last accessible register address. 
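A quick worked check of the boundary, using the driver's own symbols rather than concrete values: a window of 3 * MUXED_STRIDE virtual registers starting at CY8C95X0_VIRTUAL occupies the addresses

	CY8C95X0_VIRTUAL ... CY8C95X0_VIRTUAL + 3 * MUXED_STRIDE - 1

so, with range_max being inclusive, the previous value of CY8C95X0_VIRTUAL + 3 * MUXED_STRIDE described a window one register wider than what the 20-pin part exposes; the 40- and 60-pin variants (6 and 8 muxed ports) had the same off-by-one.
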
Fixes: 8670de9fae49 ("pinctrl: cy8c95x0: Use regmap ranges") Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/20250203131506.3318201-2-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index 0d6c2027d4c18..5c6bcbf6c3377 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -1438,15 +1438,15 @@ static int cy8c95x0_probe(struct i2c_client *client) switch (chip->tpin) { case 20: strscpy(chip->name, cy8c95x0_id[0].name); - regmap_range_conf.range_max = CY8C95X0_VIRTUAL + 3 * MUXED_STRIDE; + regmap_range_conf.range_max = CY8C95X0_VIRTUAL + 3 * MUXED_STRIDE - 1; break; case 40: strscpy(chip->name, cy8c95x0_id[1].name); - regmap_range_conf.range_max = CY8C95X0_VIRTUAL + 6 * MUXED_STRIDE; + regmap_range_conf.range_max = CY8C95X0_VIRTUAL + 6 * MUXED_STRIDE - 1; break; case 60: strscpy(chip->name, cy8c95x0_id[2].name); - regmap_range_conf.range_max = CY8C95X0_VIRTUAL + 8 * MUXED_STRIDE; + regmap_range_conf.range_max = CY8C95X0_VIRTUAL + 8 * MUXED_STRIDE - 1; break; default: return -ENODEV; -- GitLab From 3fbe3fe28764455e4fc3578afb9765f46f9ce93d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 3 Feb 2025 15:10:28 +0200 Subject: [PATCH 149/989] pinctrl: cy8c95x0: Avoid accessing reserved registers The checks for vrtual registers in the cy8c95x0_readable_register() and cy8c95x0_writeable_register() are not aligned and broken. Fix that by explicitly avoiding reserved registers to be accessed. Fixes: 71e4001a0455 ("pinctrl: pinctrl-cy8c95x0: Fix regcache") Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/20250203131506.3318201-3-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index 5c6bcbf6c3377..c787a9aadfdfb 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -328,14 +328,14 @@ static int cypress_get_pin_mask(struct cy8c95x0_pinctrl *chip, unsigned int pin) static bool cy8c95x0_readable_register(struct device *dev, unsigned int reg) { /* - * Only 12 registers are present per port (see Table 6 in the - * datasheet). + * Only 12 registers are present per port (see Table 6 in the datasheet). */ - if (reg >= CY8C95X0_VIRTUAL && (reg % MUXED_STRIDE) < 12) - return true; + if (reg >= CY8C95X0_VIRTUAL && (reg % MUXED_STRIDE) >= 12) + return false; switch (reg) { case 0x24 ... 0x27: + case 0x31 ... 0x3f: return false; default: return true; @@ -344,8 +344,11 @@ static bool cy8c95x0_readable_register(struct device *dev, unsigned int reg) static bool cy8c95x0_writeable_register(struct device *dev, unsigned int reg) { - if (reg >= CY8C95X0_VIRTUAL) - return true; + /* + * Only 12 registers are present per port (see Table 6 in the datasheet). + */ + if (reg >= CY8C95X0_VIRTUAL && (reg % MUXED_STRIDE) >= 12) + return false; switch (reg) { case CY8C95X0_INPUT_(0) ... CY8C95X0_INPUT_(7): @@ -353,6 +356,7 @@ static bool cy8c95x0_writeable_register(struct device *dev, unsigned int reg) case CY8C95X0_DEVID: return false; case 0x24 ... 0x27: + case 0x31 ... 
0x3f: return false; default: return true; -- GitLab From aac4470fa6e695e4d6ac94cc77d4690b57f1d2bc Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 3 Feb 2025 15:10:29 +0200 Subject: [PATCH 150/989] pinctrl: cy8c95x0: Enable regmap locking for debug When regmap locking is disabled, debugfs is also disabled. Enable locking for debug when CONFIG_DEBUG_PINCTRL is set. Fixes: f71aba339a66 ("pinctrl: cy8c95x0: Use single I2C lock") Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/20250203131506.3318201-4-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index c787a9aadfdfb..bfa16f70e29ce 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -470,7 +470,11 @@ static const struct regmap_config cy8c9520_i2c_regmap = { .max_register = 0, /* Updated at runtime */ .num_reg_defaults_raw = 0, /* Updated at runtime */ .use_single_read = true, /* Workaround for regcache bug */ +#if IS_ENABLED(CONFIG_DEBUG_PINCTRL) + .disable_locking = false, +#else .disable_locking = true, +#endif }; static inline int cy8c95x0_regmap_update_bits_base(struct cy8c95x0_pinctrl *chip, -- GitLab From 0a7404fc5399e1100b14e7e2a4af2e4fd5e3b602 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 3 Feb 2025 15:10:30 +0200 Subject: [PATCH 151/989] pinctrl: cy8c95x0: Rename PWMSEL to SELPWM There are two registers in the hardware: one, "Select PWM", is a per-port configuration enabling the PWM function instead of GPIO. The other one, "PWM Select", is a per-PWM selector to configure the PWM itself. The original code uses the abbreviation of the latter to describe the former. Rename it to follow the datasheet. Fixes: e6cbbe42944d ("pinctrl: Add Cypress cy8c95x0 support") Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/20250203131506.3318201-5-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index bfa16f70e29ce..75100a9fb8e4c 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -42,7 +42,7 @@ #define CY8C95X0_PORTSEL 0x18 /* Port settings, write PORTSEL first */ #define CY8C95X0_INTMASK 0x19 -#define CY8C95X0_PWMSEL 0x1A +#define CY8C95X0_SELPWM 0x1A #define CY8C95X0_INVERT 0x1B #define CY8C95X0_DIRECTION 0x1C /* Drive mode register change state on writing '1' */ @@ -369,8 +369,8 @@ static bool cy8c95x0_volatile_register(struct device *dev, unsigned int reg) case CY8C95X0_INPUT_(0) ... CY8C95X0_INPUT_(7): case CY8C95X0_INTSTATUS_(0) ...
CY8C95X0_INTSTATUS_(7): case CY8C95X0_INTMASK: + case CY8C95X0_SELPWM: case CY8C95X0_INVERT: - case CY8C95X0_PWMSEL: case CY8C95X0_DIRECTION: case CY8C95X0_DRV_PU: case CY8C95X0_DRV_PD: @@ -399,7 +399,7 @@ static bool cy8c95x0_muxed_register(unsigned int reg) { switch (reg) { case CY8C95X0_INTMASK: - case CY8C95X0_PWMSEL: + case CY8C95X0_SELPWM: case CY8C95X0_INVERT: case CY8C95X0_DIRECTION: case CY8C95X0_DRV_PU: @@ -797,7 +797,7 @@ static int cy8c95x0_gpio_get_pincfg(struct cy8c95x0_pinctrl *chip, reg = CY8C95X0_DIRECTION; break; case PIN_CONFIG_MODE_PWM: - reg = CY8C95X0_PWMSEL; + reg = CY8C95X0_SELPWM; break; case PIN_CONFIG_OUTPUT: reg = CY8C95X0_OUTPUT; @@ -876,7 +876,7 @@ static int cy8c95x0_gpio_set_pincfg(struct cy8c95x0_pinctrl *chip, reg = CY8C95X0_DRV_PP_FAST; break; case PIN_CONFIG_MODE_PWM: - reg = CY8C95X0_PWMSEL; + reg = CY8C95X0_SELPWM; break; case PIN_CONFIG_OUTPUT_ENABLE: return cy8c95x0_pinmux_direction(chip, off, !arg); @@ -1161,7 +1161,7 @@ static void cy8c95x0_pin_dbg_show(struct pinctrl_dev *pctldev, struct seq_file * bitmap_zero(mask, MAX_LINE); __set_bit(pin, mask); - if (cy8c95x0_read_regs_mask(chip, CY8C95X0_PWMSEL, pwm, mask)) { + if (cy8c95x0_read_regs_mask(chip, CY8C95X0_SELPWM, pwm, mask)) { seq_puts(s, "not available"); return; } @@ -1206,7 +1206,7 @@ static int cy8c95x0_set_mode(struct cy8c95x0_pinctrl *chip, unsigned int off, bo u8 port = cypress_get_port(chip, off); u8 bit = cypress_get_pin_mask(chip, off); - return cy8c95x0_regmap_write_bits(chip, CY8C95X0_PWMSEL, port, bit, mode ? bit : 0); + return cy8c95x0_regmap_write_bits(chip, CY8C95X0_SELPWM, port, bit, mode ? bit : 0); } static int cy8c95x0_pinmux_mode(struct cy8c95x0_pinctrl *chip, -- GitLab From 448060463198924c0a485e7e1622fa8a9c03cf3e Mon Sep 17 00:00:00 2001 From: Suraj Kandpal Date: Tue, 17 Dec 2024 14:07:23 +0530 Subject: [PATCH 152/989] drm/i915/hdcp: Fix Repeater authentication during topology change When topology changes, before beginning a new HDCP authentication by sending AKE_init message we need to first authenticate only the repeater. Only after repeater authentication failure, it makes sense to start a new HDCP authentication. Even though it made sense to not enable HDCP directly from check_link and schedule it for later, repeater authentication needs to be done immediately. 
--v2 -Fix comment grammatical errors [Ankit] Fixes: 47ef55a8b784 ("drm/i915/hdcp: Don't enable HDCP2.2 directly from check_link") Signed-off-by: Suraj Kandpal Reviewed-by: Ankit Nautiyal Link: https://patchwork.freedesktop.org/patch/msgid/20241217083723.2883317-1-suraj.kandpal@intel.com (cherry picked from commit 605a33e765890e4f1345315afc25268d4ae0fb7c) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_hdcp.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_hdcp.c b/drivers/gpu/drm/i915/display/intel_hdcp.c index 7464b44c8bb36..c60b22aaa819e 100644 --- a/drivers/gpu/drm/i915/display/intel_hdcp.c +++ b/drivers/gpu/drm/i915/display/intel_hdcp.c @@ -2188,6 +2188,19 @@ static int intel_hdcp2_check_link(struct intel_connector *connector) drm_dbg_kms(display->drm, "HDCP2.2 Downstream topology change\n"); + + ret = hdcp2_authenticate_repeater_topology(connector); + if (!ret) { + intel_hdcp_update_value(connector, + DRM_MODE_CONTENT_PROTECTION_ENABLED, + true); + goto out; + } + + drm_dbg_kms(display->drm, + "[CONNECTOR:%d:%s] Repeater topology auth failed.(%d)\n", + connector->base.base.id, connector->base.name, + ret); } else { drm_dbg_kms(display->drm, "[CONNECTOR:%d:%s] HDCP2.2 link failed, retrying auth\n", -- GitLab From 8dd5a5eb6a209e3bdb4e536e36698400445c6c2e Mon Sep 17 00:00:00 2001 From: Suraj Kandpal Date: Fri, 17 Jan 2025 09:42:48 +0530 Subject: [PATCH 153/989] drm/i915/hdcp: Use correct function to check if encoder is HDMI Use intel_encoder_is_hdmi function which was recently introduced to see if encoder is HDMI or not. --v2 -Add Fixes tag [Jani] Fixes: 6a3691ca4799 ("drm/i915/hdcp: Disable HDCP Line Rekeying for HDCP2.2 on HDMI") Signed-off-by: Suraj Kandpal Reviewed-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20250117041247.1084381-1-suraj.kandpal@intel.com (cherry picked from commit 2499212e21601740ed7d5563563f39cf7e7d833a) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_hdcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_hdcp.c b/drivers/gpu/drm/i915/display/intel_hdcp.c index c60b22aaa819e..1bab7c34a7942 100644 --- a/drivers/gpu/drm/i915/display/intel_hdcp.c +++ b/drivers/gpu/drm/i915/display/intel_hdcp.c @@ -41,7 +41,7 @@ intel_hdcp_adjust_hdcp_line_rekeying(struct intel_encoder *encoder, u32 rekey_bit = 0; /* Here we assume HDMI is in TMDS mode of operation */ - if (encoder->type != INTEL_OUTPUT_HDMI) + if (!intel_encoder_is_hdmi(encoder)) return; if (DISPLAY_VER(display) >= 30) { -- GitLab From cb5fab2afd906307876d79537ef0329033c40dd3 Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Thu, 23 Jan 2025 11:38:39 -0800 Subject: [PATCH 154/989] drm/i915/pmu: Fix zero delta busyness issue When running igt@gem_exec_balancer@individual for multiple iterations, it is seen that the delta busyness returned by PMU is 0. The issue stems from a combination of 2 implementation specific details: 1) gt_park is throttling __update_guc_busyness_stats() so that it does not hog PCI bandwidth for some use cases. (Ref: 59bcdb564b3ba) 2) busyness implementation always returns monotonically increasing counters. (Ref: cf907f6d29421) If an application queried an engine while it was active, engine->stats.guc.running is set to true. Following that, if all PM wakeref's are released, then gt is parked. 
At this time the throttling of __update_guc_busyness_stats() may result in a missed update to the running state of the engine (due to (1) above). This means subsequent calls to guc_engine_busyness() will think that the engine is still running and they will keep updating the cached counter (stats->total). This results in an inflated cached counter. Later when the application runs a workload and queries for busyness, we return the cached value since it is larger than the actual value (due to (2) above) All subsequent queries will return the same large (inflated) value, so the application sees a delta busyness of zero. Fix the issue by resetting the running state of engines each time intel_guc_busyness_park() is called. v2: (Rodrigo) - Use the correct tag in commit message - Drop the redundant wakeref check in guc_engine_busyness() and update commit message Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/13366 Fixes: cf907f6d2942 ("i915/guc: Ensure busyness counter increases motonically") Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20250123193839.2394694-1-umesh.nerlige.ramappa@intel.com (cherry picked from commit 431b742e2bfc9f6dd713f261629741980996d001) Signed-off-by: Rodrigo Vivi --- .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 12f1ba7ca9c19..bd4b3d2470e40 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1469,6 +1469,19 @@ static void __reset_guc_busyness_stats(struct intel_guc *guc) spin_unlock_irqrestore(&guc->timestamp.lock, flags); } +static void __update_guc_busyness_running_state(struct intel_guc *guc) +{ + struct intel_gt *gt = guc_to_gt(guc); + struct intel_engine_cs *engine; + enum intel_engine_id id; + unsigned long flags; + + spin_lock_irqsave(&guc->timestamp.lock, flags); + for_each_engine(engine, gt, id) + engine->stats.guc.running = false; + spin_unlock_irqrestore(&guc->timestamp.lock, flags); +} + static void __update_guc_busyness_stats(struct intel_guc *guc) { struct intel_gt *gt = guc_to_gt(guc); @@ -1619,6 +1632,9 @@ void intel_guc_busyness_park(struct intel_gt *gt) if (!guc_submission_initialized(guc)) return; + /* Assume no engines are running and set running state to false */ + __update_guc_busyness_running_state(guc); + /* * There is a race with suspend flow where the worker runs after suspend * and causes an unclaimed register access warning. Cancel the worker -- GitLab From fa6182c8b13ebfdc70ebdc09161a70dd8131f3b1 Mon Sep 17 00:00:00 2001 From: Brian Geffon Date: Mon, 27 Jan 2025 15:43:32 -0500 Subject: [PATCH 155/989] drm/i915: Fix page cleanup on DMA remap failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When converting to folios the cleanup path of shmem_get_pages() was missed. When a DMA remap fails and the max segment size is greater than PAGE_SIZE it will attempt to retry the remap with a PAGE_SIZEd segment size. The cleanup code isn't properly using the folio apis and as a result isn't handling compound pages correctly. v2 -> v3: (Ville) Just use shmem_sg_free_table() as-is in the failure path of shmem_get_pages(). shmem_sg_free_table() will clear mapping unevictable but it will be reset when it retries in shmem_sg_alloc_table(). 
v1 -> v2: (Ville) Fixed locations where we were not clearing mapping unevictable. Cc: stable@vger.kernel.org Cc: Ville Syrjala Cc: Vidya Srinivas Link: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/13487 Link: https://lore.kernel.org/lkml/20250116135636.410164-1-bgeffon@google.com/ Fixes: 0b62af28f249 ("i915: convert shmem_sg_free_table() to use a folio_batch") Signed-off-by: Brian Geffon Suggested-by: Tomasz Figa Link: https://patchwork.freedesktop.org/patch/msgid/20250127204332.336665-1-bgeffon@google.com Reviewed-by: Jonathan Cavitt Tested-by: Vidya Srinivas Signed-off-by: Ville Syrjälä (cherry picked from commit 9e304a18630875352636ad52a3d2af47c3bde824) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c index fe69f2c8527d7..ae3343c81a645 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c @@ -209,8 +209,6 @@ static int shmem_get_pages(struct drm_i915_gem_object *obj) struct address_space *mapping = obj->base.filp->f_mapping; unsigned int max_segment = i915_sg_segment_size(i915->drm.dev); struct sg_table *st; - struct sgt_iter sgt_iter; - struct page *page; int ret; /* @@ -239,9 +237,7 @@ static int shmem_get_pages(struct drm_i915_gem_object *obj) * for PAGE_SIZE chunks instead may be helpful. */ if (max_segment > PAGE_SIZE) { - for_each_sgt_page(page, sgt_iter, st) - put_page(page); - sg_free_table(st); + shmem_sg_free_table(st, mapping, false, false); kfree(st); max_segment = PAGE_SIZE; -- GitLab From c7b49506b3ba7a62335e6f666a43f67d5cd9fd1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Wed, 18 Dec 2024 19:36:47 +0200 Subject: [PATCH 156/989] drm/i915: Drop 64bpp YUV formats from ICL+ SDR planes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I'm seeing underruns with these 64bpp YUV formats on TGL. The weird details: - only happens on pipe B/C/D SDR planes, pipe A SDR planes seem fine, as do all HDR planes - somehow CDCLK related, higher CDCLK allows for bigger plane with these formats without underruns. With 300MHz CDCLK I can only go up to 1200 pixels wide or so, with 650MHz even a 3840 pixel wide plane was OK - ICL and ADL so far appear unaffected So not really sure what's the deal with this, but bspec does state "64-bit formats supported only on the HDR planes" so let's just drop these formats from the SDR planes. We already disallow 64bpp RGB formats. 
Cc: stable@vger.kernel.org Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20241218173650.19782-2-ville.syrjala@linux.intel.com Reviewed-by: Juha-Pekka Heikkila (cherry picked from commit 35e1aacfe536d6e8d8d440cd7155366da2541ad4) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/skl_universal_plane.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/gpu/drm/i915/display/skl_universal_plane.c b/drivers/gpu/drm/i915/display/skl_universal_plane.c index ff9764cac1e71..80e558042d97d 100644 --- a/drivers/gpu/drm/i915/display/skl_universal_plane.c +++ b/drivers/gpu/drm/i915/display/skl_universal_plane.c @@ -106,8 +106,6 @@ static const u32 icl_sdr_y_plane_formats[] = { DRM_FORMAT_Y216, DRM_FORMAT_XYUV8888, DRM_FORMAT_XVYU2101010, - DRM_FORMAT_XVYU12_16161616, - DRM_FORMAT_XVYU16161616, }; static const u32 icl_sdr_uv_plane_formats[] = { @@ -134,8 +132,6 @@ static const u32 icl_sdr_uv_plane_formats[] = { DRM_FORMAT_Y216, DRM_FORMAT_XYUV8888, DRM_FORMAT_XVYU2101010, - DRM_FORMAT_XVYU12_16161616, - DRM_FORMAT_XVYU16161616, }; static const u32 icl_hdr_plane_formats[] = { -- GitLab From 57965269896313e1629a518d3971ad55f599b792 Mon Sep 17 00:00:00 2001 From: Daniele Ceraolo Spurio Date: Tue, 14 Jan 2025 16:13:34 -0800 Subject: [PATCH 157/989] drm/i915/guc: Debug print LRC state entries only if the context is pinned After the context is unpinned the backing memory can also be unpinned, so any accesses via the lrc_reg_state pointer can end up in unmapped memory. To avoid that, make sure to only access that memory if the context is pinned when printing its info. v2: fix newline alignment Fixes: 28ff6520a34d ("drm/i915/guc: Update GuC debugfs to support new GuC") Signed-off-by: Daniele Ceraolo Spurio Cc: John Harrison Cc: Matthew Brost Cc: # v5.15+ Reviewed-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20250115001334.3875347-1-daniele.ceraolospurio@intel.com (cherry picked from commit 5bea40687c5cf2a33bf04e9110eb2e2b80222ef5) Signed-off-by: Rodrigo Vivi --- .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index bd4b3d2470e40..cc05bd9e43b49 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -5535,12 +5535,20 @@ static inline void guc_log_context(struct drm_printer *p, { drm_printf(p, "GuC lrc descriptor %u:\n", ce->guc_id.id); drm_printf(p, "\tHW Context Desc: 0x%08x\n", ce->lrc.lrca); - drm_printf(p, "\t\tLRC Head: Internal %u, Memory %u\n", - ce->ring->head, - ce->lrc_reg_state[CTX_RING_HEAD]); - drm_printf(p, "\t\tLRC Tail: Internal %u, Memory %u\n", - ce->ring->tail, - ce->lrc_reg_state[CTX_RING_TAIL]); + if (intel_context_pin_if_active(ce)) { + drm_printf(p, "\t\tLRC Head: Internal %u, Memory %u\n", + ce->ring->head, + ce->lrc_reg_state[CTX_RING_HEAD]); + drm_printf(p, "\t\tLRC Tail: Internal %u, Memory %u\n", + ce->ring->tail, + ce->lrc_reg_state[CTX_RING_TAIL]); + intel_context_unpin(ce); + } else { + drm_printf(p, "\t\tLRC Head: Internal %u, Memory not pinned\n", + ce->ring->head); + drm_printf(p, "\t\tLRC Tail: Internal %u, Memory not pinned\n", + ce->ring->tail); + } drm_printf(p, "\t\tContext Pin Count: %u\n", atomic_read(&ce->pin_count)); drm_printf(p, "\t\tGuC ID Ref Count: %u\n", -- GitLab From 4466302262b38f5e6c65325035b4036a42efc934 Mon Sep 17 00:00:00 2001 From: 
Ankit Nautiyal Date: Thu, 30 Jan 2025 10:46:06 +0530 Subject: [PATCH 158/989] drm/i915/dp: fix the Adaptive sync Operation mode for SDP Currently we support Adaptive sync operation mode with dynamic frame rate, but instead the operation mode with fixed rate is set. This was initially set correctly in the earlier version of changes but later got changed, while defining a macro for the same. Fixes: a5bd5991cb8a ("drm/i915/display: Compute AS SDP parameters") Cc: Mitul Golani Cc: Ankit Nautiyal Cc: Jani Nikula Reviewed-by: Mitul Golani Signed-off-by: Ankit Nautiyal Link: https://patchwork.freedesktop.org/patch/msgid/20250130051609.1796524-4-mitulkumar.ajitkumar.golani@intel.com (cherry picked from commit c5806862543ff6c2ad242409fcdf0667eac26dae) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_dp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index f1f3b1bb1e89b..bfc16fd25d22f 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -2829,7 +2829,6 @@ static void intel_dp_compute_as_sdp(struct intel_dp *intel_dp, crtc_state->infoframes.enable |= intel_hdmi_infoframe_enable(DP_SDP_ADAPTIVE_SYNC); - /* Currently only DP_AS_SDP_AVT_FIXED_VTOTAL mode supported */ as_sdp->sdp_type = DP_SDP_ADAPTIVE_SYNC; as_sdp->length = 0x9; as_sdp->duration_incr_ms = 0; @@ -2840,7 +2839,7 @@ static void intel_dp_compute_as_sdp(struct intel_dp *intel_dp, as_sdp->target_rr = drm_mode_vrefresh(adjusted_mode); as_sdp->target_rr_divider = true; } else { - as_sdp->mode = DP_AS_SDP_AVT_FIXED_VTOTAL; + as_sdp->mode = DP_AS_SDP_AVT_DYNAMIC_VTOTAL; as_sdp->vtotal = adjusted_mode->vtotal; as_sdp->target_rr = 0; } -- GitLab From 985a44b02484a47f2c6ecbe971a5f0c47830120b Mon Sep 17 00:00:00 2001 From: Ankit Nautiyal Date: Fri, 31 Jan 2025 09:43:42 +0530 Subject: [PATCH 159/989] drm/i915/dp: Return min bpc supported by source instead of 0 Currently, intel_dp_dsc_max_src_input_bpc can return 0 for platforms not supporting DSC, which could theoretically cause issues in clamp() due to a low limit being greater than the high limit. Instead, return the minimum bpc supported by the source to prevent such issues. 
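To illustrate why returning 0 is a problem (a hedged sketch; the exact call site in the Fixes commit differs, but it clamps pipe_bpp against these limits):

	/* sketch only: with a 0 maximum the low limit exceeds the high limit */
	int min_bpc = intel_dp_dsc_min_src_input_bpc();          /* e.g. 8 */
	int max_bpc = intel_dp_dsc_max_src_input_bpc(display);   /* was 0 without DSC */
	pipe_bpp = clamp(pipe_bpp, min_bpc, max_bpc);             /* low > high if max_bpc == 0 */

Returning the source minimum instead keeps the clamp() limits ordered even on platforms without DSC support.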
Reported-by: Linux Kernel Functional Testing Closes: https://lore.kernel.org/all/CA+G9fYtNfM399_=_ff81zeRJv=0+z7oFJfPGmJgTp6yrJmU+1w@mail.gmail.com/ Fixes: 160672b86b0d ("drm/i915/dp: Use clamp for pipe_bpp limits with DSC") Cc: Suraj Kandpal Cc: Jani Nikula Cc: Rodrigo Vivi Signed-off-by: Ankit Nautiyal Reviewed-by: Suraj Kandpal Tested-by: Chaitanya Kumar Borah Link: https://patchwork.freedesktop.org/patch/msgid/20250131041342.3086716-1-ankit.k.nautiyal@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit a67221b5eb8d59fb7e1f0df3ef9945b6a0f32cca) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_dp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index bfc16fd25d22f..be07034bfcc69 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -1791,7 +1791,7 @@ int intel_dp_dsc_max_src_input_bpc(struct intel_display *display) if (DISPLAY_VER(display) == 11) return 10; - return 0; + return intel_dp_dsc_min_src_input_bpc(); } int intel_dp_dsc_compute_max_bpp(const struct intel_connector *connector, -- GitLab From 3cf3ec911d70ee7774978f639fd3364c98d42b2c Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 21 Jan 2025 06:52:03 -0800 Subject: [PATCH 160/989] drm/i915/backlight: Return immediately when scale() finds invalid parameters The scale() function detects invalid parameters, but continues its calculations anyway. This causes bad results if negative values are used for unsigned operations. Worst case, a division by 0 error will be seen if source_min == source_max. On top of that, after v6.13, the sequence of WARN_ON() followed by clamp() may result in a build error with gcc 13.x. drivers/gpu/drm/i915/display/intel_backlight.c: In function 'scale': include/linux/compiler_types.h:542:45: error: call to '__compiletime_assert_415' declared with attribute error: clamp() low limit source_min greater than high limit source_max This happens if the compiler decides to rearrange the code as follows. if (source_min > source_max) { WARN(..); /* Do the clamp() knowing that source_min > source_max */ source_val = clamp(source_val, source_min, source_max); } else { /* Do the clamp knowing that source_min <= source_max */ source_val = clamp(source_val, source_min, source_max); } Fix the problem by evaluating the return values from WARN_ON and returning immediately after a warning. While at it, fix the divide by zero error seen if source_min == source_max.
Analyzed-by: Linus Torvalds Suggested-by: Linus Torvalds Suggested-by: David Laight Cc: David Laight Cc: Jani Nikula Cc: Andy Shevchenko Signed-off-by: Guenter Roeck Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20250121145203.2851237-1-linux@roeck-us.net Signed-off-by: Rodrigo Vivi (cherry picked from commit 6f71507415841d1a6d38118e5fa0eaf0caab9c17) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_backlight.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_backlight.c b/drivers/gpu/drm/i915/display/intel_backlight.c index fc1e517e074a3..7e6ce905bdafa 100644 --- a/drivers/gpu/drm/i915/display/intel_backlight.c +++ b/drivers/gpu/drm/i915/display/intel_backlight.c @@ -41,8 +41,9 @@ static u32 scale(u32 source_val, { u64 target_val; - WARN_ON(source_min > source_max); - WARN_ON(target_min > target_max); + if (WARN_ON(source_min >= source_max) || + WARN_ON(target_min > target_max)) + return target_min; /* defensive */ source_val = clamp(source_val, source_min, source_max); -- GitLab From 6bb05a33337b2c842373857b63de5c9bf1ae2a09 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 31 Jan 2025 12:33:23 -0500 Subject: [PATCH 161/989] clocksource: Use migrate_disable() to avoid calling get_random_u32() in atomic context The following bug report happened with a PREEMPT_RT kernel: BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 2012, name: kwatchdog preempt_count: 1, expected: 0 RCU nest depth: 0, expected: 0 get_random_u32+0x4f/0x110 clocksource_verify_choose_cpus+0xab/0x1a0 clocksource_verify_percpu.part.0+0x6b/0x330 clocksource_watchdog_kthread+0x193/0x1a0 It is due to the fact that clocksource_verify_choose_cpus() is invoked with preemption disabled. This function invokes get_random_u32() to obtain random numbers for choosing CPUs. The batched_entropy_32 local lock and/or the base_crng.lock spinlock in driver/char/random.c will be acquired during the call. In PREEMPT_RT kernel, they are both sleeping locks and so cannot be acquired in atomic context. Fix this problem by using migrate_disable() to allow smp_processor_id() to be reliably used without introducing atomic context. preempt_disable() is then called after clocksource_verify_choose_cpus() but before the clocksource measurement is being run to avoid introducing unexpected latency. Fixes: 7560c02bdffb ("clocksource: Check per-CPU clock synchronization when marked unstable") Suggested-by: Sebastian Andrzej Siewior Signed-off-by: Waiman Long Signed-off-by: Thomas Gleixner Reviewed-by: Paul E. 
McKenney Reviewed-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/all/20250131173323.891943-2-longman@redhat.com --- kernel/time/clocksource.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 77d9566d3aa68..2a7802ec480cc 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -373,10 +373,10 @@ void clocksource_verify_percpu(struct clocksource *cs) cpumask_clear(&cpus_ahead); cpumask_clear(&cpus_behind); cpus_read_lock(); - preempt_disable(); + migrate_disable(); clocksource_verify_choose_cpus(); if (cpumask_empty(&cpus_chosen)) { - preempt_enable(); + migrate_enable(); cpus_read_unlock(); pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name); return; @@ -384,6 +384,7 @@ void clocksource_verify_percpu(struct clocksource *cs) testcpu = smp_processor_id(); pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); + preempt_disable(); for_each_cpu(cpu, &cpus_chosen) { if (cpu == testcpu) continue; @@ -403,6 +404,7 @@ void clocksource_verify_percpu(struct clocksource *cs) cs_nsec_min = cs_nsec; } preempt_enable(); + migrate_enable(); cpus_read_unlock(); if (!cpumask_empty(&cpus_ahead)) pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n", -- GitLab From 4fd2707e3e71bfd5d4df4f4c9656a009f09dfc7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bence=20Cs=C3=B3k=C3=A1s?= Date: Mon, 3 Feb 2025 16:12:49 +0100 Subject: [PATCH 162/989] spi: atmel-quadspi: Fix warning in doc-comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The doc-comment for `struct atmel_qspi_pcal` had a typo in one of the struct members' name, causing a warning with the `W=1` option. Fixes: 5af42209a4d2 ("spi: atmel-quadspi: Add support for sama7g5 QSPI") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501311707.Ltj0qXse-lkp@intel.com/ Signed-off-by: Bence Csókás Link: https://patch.msgid.link/20250203151249.79876-2-csokas.bence@prolan.hu Signed-off-by: Mark Brown --- drivers/spi/atmel-quadspi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/spi/atmel-quadspi.c b/drivers/spi/atmel-quadspi.c index abdc49d9d9400..d8c9be64d006a 100644 --- a/drivers/spi/atmel-quadspi.c +++ b/drivers/spi/atmel-quadspi.c @@ -235,8 +235,8 @@ /** * struct atmel_qspi_pcal - Pad Calibration Clock Division * @pclk_rate: peripheral clock rate. - * @pclkdiv: calibration clock division. The clock applied to the calibration - * cell is divided by pclkdiv + 1. + * @pclk_div: calibration clock division. The clock applied to the calibration + * cell is divided by pclk_div + 1. */ struct atmel_qspi_pcal { u32 pclk_rate; -- GitLab From 9e8b21410f310c50733f6e1730bae5a8e30d3570 Mon Sep 17 00:00:00 2001 From: Selvarasu Ganesan Date: Sat, 18 Jan 2025 11:31:33 +0530 Subject: [PATCH 163/989] usb: gadget: f_midi: Fixing wMaxPacketSize exceeded issue during MIDI bind retries The current implementation sets the wMaxPacketSize of bulk in/out endpoints to 1024 bytes at the end of the f_midi_bind function. However, in cases where there is a failure in the first midi bind attempt, consider rebinding. This scenario may encounter an f_midi_bind issue due to the previous bind setting the bulk endpoint's wMaxPacketSize to 1024 bytes, which exceeds the ep->maxpacket_limit where configured dwc3 TX/RX FIFO's maxpacket size of 512 bytes for IN/OUT endpoints in support HS speed only. 
The term "rebind" in this context refers to attempting to bind the MIDI function a second time in certain scenarios. The situations where rebinding is considered include: * When there is a failure in the first UDC write attempt, which may be caused by other functions bound along with MIDI. * Runtime composition change: for example, MIDI,ADB to MIDI, or MIDI to MIDI,ADB. This commit addresses this issue by resetting the wMaxPacketSize before endpoint claim. There is no need to reset all values in the USB endpoint descriptor structure, as all members except wMaxPacketSize and bEndpointAddress have predefined values. This restores the endpoint to its expected configuration and prevents conflicts with the value of ep->maxpacket_limit. It also aligns with the approach used in other function drivers, which treat endpoint descriptors as if they were full speed before endpoint claim. Fixes: 46decc82ffd5 ("usb: gadget: unconditionally allocate hs/ss descriptor in bind operation") Cc: stable@vger.kernel.org Signed-off-by: Selvarasu Ganesan Link: https://lore.kernel.org/r/20250118060134.927-1-selvarasu.g@samsung.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_midi.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/usb/gadget/function/f_midi.c b/drivers/usb/gadget/function/f_midi.c index 837fcdfa3840f..9b991cf5b0f8b 100644 --- a/drivers/usb/gadget/function/f_midi.c +++ b/drivers/usb/gadget/function/f_midi.c @@ -907,6 +907,15 @@ static int f_midi_bind(struct usb_configuration *c, struct usb_function *f) status = -ENODEV; + /* + * Reset wMaxPacketSize with maximum packet size of FS bulk transfer before + * endpoint claim. This ensures that the wMaxPacketSize does not exceed the + * limit during bind retries where configured dwc3 TX/RX FIFO's maxpacket + * size of 512 bytes for IN/OUT endpoints in support HS speed only. + */ + bulk_in_desc.wMaxPacketSize = cpu_to_le16(64); + bulk_out_desc.wMaxPacketSize = cpu_to_le16(64); + /* allocate instance-specific endpoints */ midi->in_ep = usb_ep_autoconfig(cdev->gadget, &bulk_in_desc); if (!midi->in_ep) -- GitLab From 309005e448c1f3e4b81e4416406991b7c3339c1d Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Mon, 20 Jan 2025 15:42:51 +0100 Subject: [PATCH 164/989] usb: phy: generic: Use proper helper for property detection Since commit c141ecc3cecd7 ("of: Warn when of_property_read_bool() is used on non-boolean properties") a warning is raised if this function is used for property detection. of_property_present() is the correct helper for this.
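A minimal usage sketch of the distinction (illustrative only; np and the "wakeup-source" property are assumptions, not taken from this driver):

	/* boolean property: presence alone means "true" */
	bool wakeup = of_property_read_bool(np, "wakeup-source");

	/* non-boolean property such as "clocks": only test for presence,
	 * which avoids the warning introduced by commit c141ecc3cecd7 */
	bool needs_clk = of_property_present(np, "clocks");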
Signed-off-by: Alexander Stein Link: https://lore.kernel.org/r/20250120144251.580981-1-alexander.stein@ew.tq-group.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/phy/phy-generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/phy/phy-generic.c b/drivers/usb/phy/phy-generic.c index 6c3ececf91375..8423be59ec0ff 100644 --- a/drivers/usb/phy/phy-generic.c +++ b/drivers/usb/phy/phy-generic.c @@ -212,7 +212,7 @@ int usb_phy_gen_create_phy(struct device *dev, struct usb_phy_generic *nop) if (of_property_read_u32(node, "clock-frequency", &clk_rate)) clk_rate = 0; - needs_clk = of_property_read_bool(node, "clocks"); + needs_clk = of_property_present(node, "clocks"); } nop->gpiod_reset = devm_gpiod_get_optional(dev, "reset", GPIOD_ASIS); -- GitLab From 1ed3af5a2aaefd0ecd887ecabdc8da07220e31fe Mon Sep 17 00:00:00 2001 From: Thinh Nguyen Date: Tue, 21 Jan 2025 23:11:23 +0000 Subject: [PATCH 165/989] usb: dwc3: Document nostream_work Add missing description to the nostream_work of dwc3_ep. The work is used by bulk multi-stream endpoints for a NoStream event to reinitiate the stream if needed. Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/linux-next/20250120182219.30dcb3c6@canb.auug.org.au/ Signed-off-by: Thinh Nguyen Link: https://lore.kernel.org/r/7cdeaa346d24907712aac533c1c5f90a03151189.1737500936.git.Thinh.Nguyen@synopsys.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/core.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h index ac7c730f81acf..c955039bb4f62 100644 --- a/drivers/usb/dwc3/core.h +++ b/drivers/usb/dwc3/core.h @@ -717,6 +717,7 @@ struct dwc3_event_buffer { /** * struct dwc3_ep - device side endpoint representation * @endpoint: usb endpoint + * @nostream_work: work for handling bulk NoStream * @cancelled_list: list of cancelled requests for this endpoint * @pending_list: list of pending requests for this endpoint * @started_list: list of started requests on this endpoint -- GitLab From 335a1fc1193481f8027f176649c72868172f6f8b Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Wed, 22 Jan 2025 03:12:31 -0500 Subject: [PATCH 166/989] usb: gadget: udc: renesas_usb3: Fix compiler warning drivers/usb/gadget/udc/renesas_usb3.c: In function 'renesas_usb3_probe': drivers/usb/gadget/udc/renesas_usb3.c:2638:73: warning: '%d' directive output may be truncated writing between 1 and 11 bytes into a region of size 6 [-Wformat-truncation=] 2638 | snprintf(usb3_ep->ep_name, sizeof(usb3_ep->ep_name), "ep%d", i); ^~~~~~~~~~~~~~~~~~~~~~~~ ^~ ^ Fixes: 746bfe63bba3 ("usb: gadget: renesas_usb3: add support for Renesas USB3.0 peripheral controller") Cc: stable@vger.kernel.org Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501201409.BIQPtkeB-lkp@intel.com/ Signed-off-by: Guo Ren Link: https://lore.kernel.org/r/20250122081231.47594-1-guoren@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/renesas_usb3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/renesas_usb3.c b/drivers/usb/gadget/udc/renesas_usb3.c index fce5c41d9f298..89b304cf6d032 100644 --- a/drivers/usb/gadget/udc/renesas_usb3.c +++ b/drivers/usb/gadget/udc/renesas_usb3.c @@ -310,7 +310,7 @@ struct renesas_usb3_request { struct list_head queue; }; -#define USB3_EP_NAME_SIZE 8 +#define USB3_EP_NAME_SIZE 16 struct renesas_usb3_ep { struct usb_ep ep; struct renesas_usb3 *usb3; -- GitLab From 2240fed37afbcdb5e8b627bc7ad986891100e05d Mon Sep 17 
00:00:00 2001 From: Alan Stern Date: Wed, 22 Jan 2025 14:26:17 -0500 Subject: [PATCH 167/989] USB: hub: Ignore non-compliant devices with too many configs or interfaces Robert Morris created a test program which can cause usb_hub_to_struct_hub() to dereference a NULL or inappropriate pointer: Oops: general protection fault, probably for non-canonical address 0xcccccccccccccccc: 0000 [#1] SMP DEBUG_PAGEALLOC PTI CPU: 7 UID: 0 PID: 117 Comm: kworker/7:1 Not tainted 6.13.0-rc3-00017-gf44d154d6e3d #14 Hardware name: FreeBSD BHYVE/BHYVE, BIOS 14.0 10/17/2021 Workqueue: usb_hub_wq hub_event RIP: 0010:usb_hub_adjust_deviceremovable+0x78/0x110 ... Call Trace: ? die_addr+0x31/0x80 ? exc_general_protection+0x1b4/0x3c0 ? asm_exc_general_protection+0x26/0x30 ? usb_hub_adjust_deviceremovable+0x78/0x110 hub_probe+0x7c7/0xab0 usb_probe_interface+0x14b/0x350 really_probe+0xd0/0x2d0 ? __pfx___device_attach_driver+0x10/0x10 __driver_probe_device+0x6e/0x110 driver_probe_device+0x1a/0x90 __device_attach_driver+0x7e/0xc0 bus_for_each_drv+0x7f/0xd0 __device_attach+0xaa/0x1a0 bus_probe_device+0x8b/0xa0 device_add+0x62e/0x810 usb_set_configuration+0x65d/0x990 usb_generic_driver_probe+0x4b/0x70 usb_probe_device+0x36/0xd0 The cause of this error is that the device has two interfaces, and the hub driver binds to interface 1 instead of interface 0, which is where usb_hub_to_struct_hub() looks. We can prevent the problem from occurring by refusing to accept hub devices that violate the USB spec by having more than one configuration or interface. Reported-and-tested-by: Robert Morris Cc: stable Closes: https://lore.kernel.org/linux-usb/95564.1737394039@localhost/ Signed-off-by: Alan Stern Link: https://lore.kernel.org/r/c27f3bf4-63d8-4fb5-ac82-09e3cd19f61c@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hub.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index c3f839637cb5a..0cd44f1fd56d2 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -1848,6 +1848,17 @@ static int hub_probe(struct usb_interface *intf, const struct usb_device_id *id) desc = intf->cur_altsetting; hdev = interface_to_usbdev(intf); + /* + * The USB 2.0 spec prohibits hubs from having more than one + * configuration or interface, and we rely on this prohibition. + * Refuse to accept a device that violates it. + */ + if (hdev->descriptor.bNumConfigurations > 1 || + hdev->actconfig->desc.bNumInterfaces > 1) { + dev_err(&intf->dev, "Invalid hub with more than one config or interface\n"); + return -EINVAL; + } + /* * Set default autosuspend delay as 0 to speedup bus suspend, * based on the below considerations: -- GitLab From 58cd423820d5b5610977e55e4acdd06628829ede Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Fri, 24 Jan 2025 18:33:25 +0100 Subject: [PATCH 168/989] usb: dwc2: gadget: remove of_node reference upon udc_stop In dwc2_hsotg_udc_start(), e.g. when binding composite driver, "of_node" is set to hsotg->dev->of_node. It causes errors when binding the gadget driver several times, on stm32mp157c-ev1 board. Below error is seen: "pin PA10 already requested by 49000000.usb-otg; cannot claim for gadget.0" The first time, no issue is seen as when registering the driver, of_node isn't NULL: -> gadget_dev_desc_UDC_store -> usb_gadget_register_driver_owner -> driver_register ... -> really_probe -> pinctrl_bind_pins (no effect) Then dwc2_hsotg_udc_start() sets of_node. 
The second time (stop the gadget, reconfigure it, then start it again), of_node has been set, so the probing code tries to acquire pins for the gadget. These pins are held by the controller, hence the error. So clear gadget.dev.of_node in the udc_stop() routine to avoid the issue. Fixes: 7d7b22928b90 ("usb: gadget: s3c-hsotg: Propagate devicetree to gadget drivers") Cc: stable Signed-off-by: Fabrice Gasnier Link: https://lore.kernel.org/r/20250124173325.2747710-1-fabrice.gasnier@foss.st.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc2/gadget.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/dwc2/gadget.c b/drivers/usb/dwc2/gadget.c index e7bf9cc635be6..bd4c788f03bc1 100644 --- a/drivers/usb/dwc2/gadget.c +++ b/drivers/usb/dwc2/gadget.c @@ -4615,6 +4615,7 @@ static int dwc2_hsotg_udc_stop(struct usb_gadget *gadget) spin_lock_irqsave(&hsotg->lock, flags); hsotg->driver = NULL; + hsotg->gadget.dev.of_node = NULL; hsotg->gadget.speed = USB_SPEED_UNKNOWN; hsotg->enabled = 0; -- GitLab From da1668997052ed1cb00322e1f3b63702615c9429 Mon Sep 17 00:00:00 2001 From: John Keeping Date: Thu, 30 Jan 2025 19:50:34 +0000 Subject: [PATCH 169/989] usb: gadget: f_midi: fix MIDI Streaming descriptor lengths While the MIDI jacks are configured correctly, and the MIDIStreaming endpoint descriptors are filled with the correct information, bNumEmbMIDIJack and bLength are set incorrectly in these descriptors. This does not matter when the numbers of in and out ports are equal, but when they differ the host will receive broken descriptors with uninitialized stack memory leaking into the descriptor for whichever value is smaller. The precise meaning of "in" and "out" in the port counts is not clearly defined and can be confusing. But elsewhere the driver consistently uses this to match the USB meaning of IN and OUT viewed from the host, so that "in" ports send data to the host and "out" ports receive data from it. Cc: stable Fixes: c8933c3f79568 ("USB: gadget: f_midi: allow a dynamic number of input and output ports") Signed-off-by: John Keeping Reviewed-by: Takashi Iwai Link: https://lore.kernel.org/r/20250130195035.3883857-1-jkeeping@inmusicbrands.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_midi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/usb/gadget/function/f_midi.c b/drivers/usb/gadget/function/f_midi.c index 9b991cf5b0f8b..47260d65066a8 100644 --- a/drivers/usb/gadget/function/f_midi.c +++ b/drivers/usb/gadget/function/f_midi.c @@ -1009,11 +1009,11 @@ static int f_midi_bind(struct usb_configuration *c, struct usb_function *f) } /* configure the endpoint descriptors ... */ - ms_out_desc.bLength = USB_DT_MS_ENDPOINT_SIZE(midi->in_ports); - ms_out_desc.bNumEmbMIDIJack = midi->in_ports; + ms_out_desc.bLength = USB_DT_MS_ENDPOINT_SIZE(midi->out_ports); + ms_out_desc.bNumEmbMIDIJack = midi->out_ports; - ms_in_desc.bLength = USB_DT_MS_ENDPOINT_SIZE(midi->out_ports); - ms_in_desc.bNumEmbMIDIJack = midi->out_ports; + ms_in_desc.bLength = USB_DT_MS_ENDPOINT_SIZE(midi->in_ports); + ms_in_desc.bNumEmbMIDIJack = midi->in_ports; /* ...
and add them to the list */ endpoint_descriptor_index = i; -- GitLab From 2255b40cacc2e5ef1b127770fc1808c60de4a2fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 24 Jan 2025 09:43:45 -0500 Subject: [PATCH 170/989] drm/amdgpu: add a BO metadata flag to disable write compression for Vulkan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vulkan can't support DCC and Z/S compression on GFX12 without WRITE_COMPRESS_DISABLE in this commit or a completely different DCC interface. AMDGPU_TILING_GFX12_SCANOUT is added because it's already used by userspace. Cc: stable@vger.kernel.org # 6.12.x Signed-off-by: Marek Olšák Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 8 ++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 2 ++ drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c | 5 +++-- include/uapi/drm/amdgpu_drm.h | 9 ++++++++- 5 files changed, 21 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 817116e53d440..dce9323fb410c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -119,9 +119,10 @@ * - 3.57.0 - Compute tunneling on GFX10+ * - 3.58.0 - Add GFX12 DCC support * - 3.59.0 - Cleared VRAM + * - 3.60.0 - Add AMDGPU_TILING_GFX12_DCC_WRITE_COMPRESS_DISABLE (Vulkan requirement) */ #define KMS_DRIVER_MAJOR 3 -#define KMS_DRIVER_MINOR 59 +#define KMS_DRIVER_MINOR 60 #define KMS_DRIVER_PATCHLEVEL 0 /* diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index ff286940ab430..01ae2f88dec8c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -309,7 +309,7 @@ int amdgpu_ttm_copy_mem_to_mem(struct amdgpu_device *adev, mutex_lock(&adev->mman.gtt_window_lock); while (src_mm.remaining) { uint64_t from, to, cur_size, tiling_flags; - uint32_t num_type, data_format, max_com; + uint32_t num_type, data_format, max_com, write_compress_disable; struct dma_fence *next; /* Never copy more than 256MiB at once to avoid a timeout */ @@ -340,9 +340,13 @@ int amdgpu_ttm_copy_mem_to_mem(struct amdgpu_device *adev, max_com = AMDGPU_TILING_GET(tiling_flags, GFX12_DCC_MAX_COMPRESSED_BLOCK); num_type = AMDGPU_TILING_GET(tiling_flags, GFX12_DCC_NUMBER_TYPE); data_format = AMDGPU_TILING_GET(tiling_flags, GFX12_DCC_DATA_FORMAT); + write_compress_disable = + AMDGPU_TILING_GET(tiling_flags, GFX12_DCC_WRITE_COMPRESS_DISABLE); copy_flags |= (AMDGPU_COPY_FLAGS_SET(MAX_COMPRESSED, max_com) | AMDGPU_COPY_FLAGS_SET(NUMBER_TYPE, num_type) | - AMDGPU_COPY_FLAGS_SET(DATA_FORMAT, data_format)); + AMDGPU_COPY_FLAGS_SET(DATA_FORMAT, data_format) | + AMDGPU_COPY_FLAGS_SET(WRITE_COMPRESS_DISABLE, + write_compress_disable)); } r = amdgpu_copy_buffer(ring, from, to, cur_size, resv, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h index 461fb8090ae04..208b7d1d8a277 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h @@ -119,6 +119,8 @@ struct amdgpu_copy_mem { #define AMDGPU_COPY_FLAGS_NUMBER_TYPE_MASK 0x07 #define AMDGPU_COPY_FLAGS_DATA_FORMAT_SHIFT 8 #define AMDGPU_COPY_FLAGS_DATA_FORMAT_MASK 0x3f +#define AMDGPU_COPY_FLAGS_WRITE_COMPRESS_DISABLE_SHIFT 14 +#define AMDGPU_COPY_FLAGS_WRITE_COMPRESS_DISABLE_MASK 0x1 #define AMDGPU_COPY_FLAGS_SET(field, value) \ (((__u32)(value) & 
AMDGPU_COPY_FLAGS_##field##_MASK) << AMDGPU_COPY_FLAGS_##field##_SHIFT) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c index 9c17df2cf37b8..7e10e94624e34 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c @@ -1741,11 +1741,12 @@ static void sdma_v7_0_emit_copy_buffer(struct amdgpu_ib *ib, uint32_t byte_count, uint32_t copy_flags) { - uint32_t num_type, data_format, max_com; + uint32_t num_type, data_format, max_com, write_cm; max_com = AMDGPU_COPY_FLAGS_GET(copy_flags, MAX_COMPRESSED); data_format = AMDGPU_COPY_FLAGS_GET(copy_flags, DATA_FORMAT); num_type = AMDGPU_COPY_FLAGS_GET(copy_flags, NUMBER_TYPE); + write_cm = AMDGPU_COPY_FLAGS_GET(copy_flags, WRITE_COMPRESS_DISABLE) ? 2 : 1; ib->ptr[ib->length_dw++] = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_COPY) | SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR) | @@ -1762,7 +1763,7 @@ static void sdma_v7_0_emit_copy_buffer(struct amdgpu_ib *ib, if ((copy_flags & (AMDGPU_COPY_FLAGS_READ_DECOMPRESSED | AMDGPU_COPY_FLAGS_WRITE_COMPRESSED))) ib->ptr[ib->length_dw++] = SDMA_DCC_DATA_FORMAT(data_format) | SDMA_DCC_NUM_TYPE(num_type) | ((copy_flags & AMDGPU_COPY_FLAGS_READ_DECOMPRESSED) ? SDMA_DCC_READ_CM(2) : 0) | - ((copy_flags & AMDGPU_COPY_FLAGS_WRITE_COMPRESSED) ? SDMA_DCC_WRITE_CM(1) : 0) | + ((copy_flags & AMDGPU_COPY_FLAGS_WRITE_COMPRESSED) ? SDMA_DCC_WRITE_CM(write_cm) : 0) | SDMA_DCC_MAX_COM(max_com) | SDMA_DCC_MAX_UCOM(1); else ib->ptr[ib->length_dw++] = 0; diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index efe5de6ce208a..aaa4f3bc688b5 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -411,13 +411,20 @@ struct drm_amdgpu_gem_userptr { /* GFX12 and later: */ #define AMDGPU_TILING_GFX12_SWIZZLE_MODE_SHIFT 0 #define AMDGPU_TILING_GFX12_SWIZZLE_MODE_MASK 0x7 -/* These are DCC recompression setting for memory management: */ +/* These are DCC recompression settings for memory management: */ #define AMDGPU_TILING_GFX12_DCC_MAX_COMPRESSED_BLOCK_SHIFT 3 #define AMDGPU_TILING_GFX12_DCC_MAX_COMPRESSED_BLOCK_MASK 0x3 /* 0:64B, 1:128B, 2:256B */ #define AMDGPU_TILING_GFX12_DCC_NUMBER_TYPE_SHIFT 5 #define AMDGPU_TILING_GFX12_DCC_NUMBER_TYPE_MASK 0x7 /* CB_COLOR0_INFO.NUMBER_TYPE */ #define AMDGPU_TILING_GFX12_DCC_DATA_FORMAT_SHIFT 8 #define AMDGPU_TILING_GFX12_DCC_DATA_FORMAT_MASK 0x3f /* [0:4]:CB_COLOR0_INFO.FORMAT, [5]:MM */ +/* When clearing the buffer or moving it from VRAM to GTT, don't compress and set DCC metadata + * to uncompressed. Set when parts of an allocation bypass DCC and read raw data. */ +#define AMDGPU_TILING_GFX12_DCC_WRITE_COMPRESS_DISABLE_SHIFT 14 +#define AMDGPU_TILING_GFX12_DCC_WRITE_COMPRESS_DISABLE_MASK 0x1 +/* bit gap */ +#define AMDGPU_TILING_GFX12_SCANOUT_SHIFT 63 +#define AMDGPU_TILING_GFX12_SCANOUT_MASK 0x1 /* Set/Get helpers for tiling flags. */ #define AMDGPU_TILING_SET(field, value) \ -- GitLab From 8adbb2a98b00926315fd513b5fe2596b5716b82d Mon Sep 17 00:00:00 2001 From: Alex Hung Date: Fri, 17 Jan 2025 12:37:11 -0700 Subject: [PATCH 171/989] drm/amd/display: Fix out-of-bound accesses [WHAT & HOW] hpo_stream_to_link_encoder_mapping has size MAX_HPO_DP2_ENCODERS(=4), but location can be as large as 6. As a result, it is necessary to check location against MAX_HPO_DP2_ENCODERS. Similarly, disp_cfg_stream_location can be used as an array index which should be 0..5, so the ASSERT conditions should use 'less than' rather than 'less than or equal'.
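As a short, hedged illustration of the intended bound (not the exact driver code): an array sized MAX_HPO_DP2_ENCODERS (4) only has valid indices 0..3, so an index derived from location must first pass the range check, and an index into an array of __DML2_WRAPPER_MAX_STREAMS_PLANES__ entries is already out of bounds once it equals that size.

	/* illustration only */
	if (location < MAX_HPO_DP2_ENCODERS &&
	    hpo_stream_to_link_encoder_mapping[location] != -1)
		; /* safe to use the mapping entry */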
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3904 Reviewed-by: Austin Zheng Reviewed-by: Rodrigo Siqueira Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../amd/display/dc/dml2/dml21/dml21_translation_helper.c | 4 ++-- .../gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c b/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c index b9c6b45f6872d..0c8ec30ea6726 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c +++ b/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c @@ -1017,7 +1017,7 @@ bool dml21_map_dc_state_into_dml_display_cfg(const struct dc *in_dc, struct dc_s if (disp_cfg_stream_location < 0) disp_cfg_stream_location = dml_dispcfg->num_streams++; - ASSERT(disp_cfg_stream_location >= 0 && disp_cfg_stream_location <= __DML2_WRAPPER_MAX_STREAMS_PLANES__); + ASSERT(disp_cfg_stream_location >= 0 && disp_cfg_stream_location < __DML2_WRAPPER_MAX_STREAMS_PLANES__); populate_dml21_timing_config_from_stream_state(&dml_dispcfg->stream_descriptors[disp_cfg_stream_location].timing, context->streams[stream_index], dml_ctx); adjust_dml21_hblank_timing_config_from_pipe_ctx(&dml_dispcfg->stream_descriptors[disp_cfg_stream_location].timing, &context->res_ctx.pipe_ctx[stream_index]); populate_dml21_output_config_from_stream_state(&dml_dispcfg->stream_descriptors[disp_cfg_stream_location].output, context->streams[stream_index], &context->res_ctx.pipe_ctx[stream_index]); @@ -1042,7 +1042,7 @@ bool dml21_map_dc_state_into_dml_display_cfg(const struct dc *in_dc, struct dc_s if (disp_cfg_plane_location < 0) disp_cfg_plane_location = dml_dispcfg->num_planes++; - ASSERT(disp_cfg_plane_location >= 0 && disp_cfg_plane_location <= __DML2_WRAPPER_MAX_STREAMS_PLANES__); + ASSERT(disp_cfg_plane_location >= 0 && disp_cfg_plane_location < __DML2_WRAPPER_MAX_STREAMS_PLANES__); populate_dml21_surface_config_from_plane_state(in_dc, &dml_dispcfg->plane_descriptors[disp_cfg_plane_location].surface, context->stream_status[stream_index].plane_states[plane_index]); populate_dml21_plane_config_from_plane_state(dml_ctx, &dml_dispcfg->plane_descriptors[disp_cfg_plane_location], context->stream_status[stream_index].plane_states[plane_index], context, stream_index); diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c b/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c index b416320873e11..b8a34abaf519a 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c +++ b/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c @@ -786,7 +786,7 @@ static void populate_dml_output_cfg_from_stream_state(struct dml_output_cfg_st * case SIGNAL_TYPE_DISPLAY_PORT_MST: case SIGNAL_TYPE_DISPLAY_PORT: out->OutputEncoder[location] = dml_dp; - if (dml2->v20.scratch.hpo_stream_to_link_encoder_mapping[location] != -1) + if (location < MAX_HPO_DP2_ENCODERS && dml2->v20.scratch.hpo_stream_to_link_encoder_mapping[location] != -1) out->OutputEncoder[dml2->v20.scratch.hpo_stream_to_link_encoder_mapping[location]] = dml_dp2p0; break; case SIGNAL_TYPE_EDP: @@ -1343,7 +1343,7 @@ void map_dc_state_into_dml_display_cfg(struct dml2_context *dml2, struct dc_stat if (disp_cfg_stream_location < 0) disp_cfg_stream_location = dml_dispcfg->num_timings++; - ASSERT(disp_cfg_stream_location >= 0 && disp_cfg_stream_location <= 
__DML2_WRAPPER_MAX_STREAMS_PLANES__); + ASSERT(disp_cfg_stream_location >= 0 && disp_cfg_stream_location < __DML2_WRAPPER_MAX_STREAMS_PLANES__); populate_dml_timing_cfg_from_stream_state(&dml_dispcfg->timing, disp_cfg_stream_location, context->streams[i]); populate_dml_output_cfg_from_stream_state(&dml_dispcfg->output, disp_cfg_stream_location, context->streams[i], current_pipe_context, dml2); @@ -1383,7 +1383,7 @@ void map_dc_state_into_dml_display_cfg(struct dml2_context *dml2, struct dc_stat if (disp_cfg_plane_location < 0) disp_cfg_plane_location = dml_dispcfg->num_surfaces++; - ASSERT(disp_cfg_plane_location >= 0 && disp_cfg_plane_location <= __DML2_WRAPPER_MAX_STREAMS_PLANES__); + ASSERT(disp_cfg_plane_location >= 0 && disp_cfg_plane_location < __DML2_WRAPPER_MAX_STREAMS_PLANES__); populate_dml_surface_cfg_from_plane_state(dml2->v20.dml_core_ctx.project, &dml_dispcfg->surface, disp_cfg_plane_location, context->stream_status[i].plane_states[j]); populate_dml_plane_cfg_from_plane_state( -- GitLab From e01f07cb92513ca4b9b219ab9caa34d607bc1e2d Mon Sep 17 00:00:00 2001 From: Lo-an Chen Date: Fri, 17 Jan 2025 17:56:25 +0800 Subject: [PATCH 172/989] drm/amd/display: Fix seamless boot sequence [WHY] When the system powers up eDP with external monitors in the seamless boot sequence, stutter gets enabled before the TTU and HUBP registers are programmed, which results in underflow. [HOW] Enable TTU in hubp_init. Change the sequence so that prepare_bandwidth and optimize_bandwidth are not called while there are seamless boot streams. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Nicholas Kazlauskas Signed-off-by: Lo-an Chen Signed-off-by: Paul Hsieh Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc.c | 2 +- drivers/gpu/drm/amd/display/dc/hubbub/dcn30/dcn30_hubbub.c | 3 ++- drivers/gpu/drm/amd/display/dc/hubbub/dcn31/dcn31_hubbub.c | 3 ++- drivers/gpu/drm/amd/display/dc/hubbub/dcn32/dcn32_hubbub.c | 3 ++- drivers/gpu/drm/amd/display/dc/hubbub/dcn35/dcn35_hubbub.c | 3 ++- drivers/gpu/drm/amd/display/dc/hubp/dcn30/dcn30_hubp.c | 2 ++ drivers/gpu/drm/amd/display/dc/hubp/dcn32/dcn32_hubp.c | 2 ++ drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c | 3 ++- 8 files changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index cecaadf741ad0..f84e795e35f58 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -2133,7 +2133,7 @@ static enum dc_status dc_commit_state_no_check(struct dc *dc, struct dc_state *c dc_enable_stereo(dc, context, dc_streams, context->stream_count); - if (context->stream_count > get_seamless_boot_stream_count(context) || + if (get_seamless_boot_stream_count(context) == 0 || context->stream_count == 0) { /* Must wait for no flips to be pending before doing optimize bw */ hwss_wait_for_no_pipes_pending(dc, context); diff --git a/drivers/gpu/drm/amd/display/dc/hubbub/dcn30/dcn30_hubbub.c b/drivers/gpu/drm/amd/display/dc/hubbub/dcn30/dcn30_hubbub.c index fe741100c0f88..d347bb06577ac 100644 --- a/drivers/gpu/drm/amd/display/dc/hubbub/dcn30/dcn30_hubbub.c +++ b/drivers/gpu/drm/amd/display/dc/hubbub/dcn30/dcn30_hubbub.c @@ -129,7 +129,8 @@ bool hubbub3_program_watermarks( REG_UPDATE(DCHUBBUB_ARB_DF_REQ_OUTSTAND, DCHUBBUB_ARB_MIN_REQ_OUTSTAND, 0x1FF); - hubbub1_allow_self_refresh_control(hubbub, !hubbub->ctx->dc->debug.disable_stutter); + if (safe_to_lower ||
hubbub->ctx->dc->debug.disable_stutter) + hubbub1_allow_self_refresh_control(hubbub, !hubbub->ctx->dc->debug.disable_stutter); return wm_pending; } diff --git a/drivers/gpu/drm/amd/display/dc/hubbub/dcn31/dcn31_hubbub.c b/drivers/gpu/drm/amd/display/dc/hubbub/dcn31/dcn31_hubbub.c index 7fb5523f97224..b98505b240a79 100644 --- a/drivers/gpu/drm/amd/display/dc/hubbub/dcn31/dcn31_hubbub.c +++ b/drivers/gpu/drm/amd/display/dc/hubbub/dcn31/dcn31_hubbub.c @@ -750,7 +750,8 @@ static bool hubbub31_program_watermarks( REG_UPDATE(DCHUBBUB_ARB_DF_REQ_OUTSTAND, DCHUBBUB_ARB_MIN_REQ_OUTSTAND, 0x1FF);*/ - hubbub1_allow_self_refresh_control(hubbub, !hubbub->ctx->dc->debug.disable_stutter); + if (safe_to_lower || hubbub->ctx->dc->debug.disable_stutter) + hubbub1_allow_self_refresh_control(hubbub, !hubbub->ctx->dc->debug.disable_stutter); return wm_pending; } diff --git a/drivers/gpu/drm/amd/display/dc/hubbub/dcn32/dcn32_hubbub.c b/drivers/gpu/drm/amd/display/dc/hubbub/dcn32/dcn32_hubbub.c index 5264dc26cce1f..32a6be543105c 100644 --- a/drivers/gpu/drm/amd/display/dc/hubbub/dcn32/dcn32_hubbub.c +++ b/drivers/gpu/drm/amd/display/dc/hubbub/dcn32/dcn32_hubbub.c @@ -786,7 +786,8 @@ static bool hubbub32_program_watermarks( REG_UPDATE(DCHUBBUB_ARB_DF_REQ_OUTSTAND, DCHUBBUB_ARB_MIN_REQ_OUTSTAND, 0x1FF);*/ - hubbub1_allow_self_refresh_control(hubbub, !hubbub->ctx->dc->debug.disable_stutter); + if (safe_to_lower || hubbub->ctx->dc->debug.disable_stutter) + hubbub1_allow_self_refresh_control(hubbub, !hubbub->ctx->dc->debug.disable_stutter); hubbub32_force_usr_retraining_allow(hubbub, hubbub->ctx->dc->debug.force_usr_allow); diff --git a/drivers/gpu/drm/amd/display/dc/hubbub/dcn35/dcn35_hubbub.c b/drivers/gpu/drm/amd/display/dc/hubbub/dcn35/dcn35_hubbub.c index 5eb3da8d5206e..dce7269959ce7 100644 --- a/drivers/gpu/drm/amd/display/dc/hubbub/dcn35/dcn35_hubbub.c +++ b/drivers/gpu/drm/amd/display/dc/hubbub/dcn35/dcn35_hubbub.c @@ -326,7 +326,8 @@ static bool hubbub35_program_watermarks( DCHUBBUB_ARB_MIN_REQ_OUTSTAND_COMMIT_THRESHOLD, 0xA);/*hw delta*/ REG_UPDATE(DCHUBBUB_ARB_HOSTVM_CNTL, DCHUBBUB_ARB_MAX_QOS_COMMIT_THRESHOLD, 0xF); - hubbub1_allow_self_refresh_control(hubbub, !hubbub->ctx->dc->debug.disable_stutter); + if (safe_to_lower || hubbub->ctx->dc->debug.disable_stutter) + hubbub1_allow_self_refresh_control(hubbub, !hubbub->ctx->dc->debug.disable_stutter); hubbub32_force_usr_retraining_allow(hubbub, hubbub->ctx->dc->debug.force_usr_allow); diff --git a/drivers/gpu/drm/amd/display/dc/hubp/dcn30/dcn30_hubp.c b/drivers/gpu/drm/amd/display/dc/hubp/dcn30/dcn30_hubp.c index be0ac613675a2..0da70b50e86d4 100644 --- a/drivers/gpu/drm/amd/display/dc/hubp/dcn30/dcn30_hubp.c +++ b/drivers/gpu/drm/amd/display/dc/hubp/dcn30/dcn30_hubp.c @@ -500,6 +500,8 @@ void hubp3_init(struct hubp *hubp) //hubp[i].HUBPREQ_DEBUG.HUBPREQ_DEBUG[26] = 1; REG_WRITE(HUBPREQ_DEBUG, 1 << 26); + REG_UPDATE(DCHUBP_CNTL, HUBP_TTU_DISABLE, 0); + hubp_reset(hubp); } diff --git a/drivers/gpu/drm/amd/display/dc/hubp/dcn32/dcn32_hubp.c b/drivers/gpu/drm/amd/display/dc/hubp/dcn32/dcn32_hubp.c index edd37898d5500..f3a21c623f441 100644 --- a/drivers/gpu/drm/amd/display/dc/hubp/dcn32/dcn32_hubp.c +++ b/drivers/gpu/drm/amd/display/dc/hubp/dcn32/dcn32_hubp.c @@ -168,6 +168,8 @@ void hubp32_init(struct hubp *hubp) { struct dcn20_hubp *hubp2 = TO_DCN20_HUBP(hubp); REG_WRITE(HUBPREQ_DEBUG_DB, 1 << 8); + + REG_UPDATE(DCHUBP_CNTL, HUBP_TTU_DISABLE, 0); } static struct hubp_funcs dcn32_hubp_funcs = { .hubp_enable_tripleBuffer = hubp2_enable_triplebuffer, diff --git 
a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c index 623cde76debfb..b907ad1acedd9 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c @@ -236,7 +236,8 @@ void dcn35_init_hw(struct dc *dc) } hws->funcs.init_pipes(dc, dc->current_state); - if (dc->res_pool->hubbub->funcs->allow_self_refresh_control) + if (dc->res_pool->hubbub->funcs->allow_self_refresh_control && + !dc->res_pool->hubbub->ctx->dc->debug.disable_stutter) dc->res_pool->hubbub->funcs->allow_self_refresh_control(dc->res_pool->hubbub, !dc->res_pool->hubbub->ctx->dc->debug.disable_stutter); } -- GitLab From 588c20079e17dae9e1f49ba42981a05de1c9136e Mon Sep 17 00:00:00 2001 From: Ashutosh Dixit Date: Thu, 16 Jan 2025 19:21:55 -0800 Subject: [PATCH 173/989] drm/xe/oa: Preserve oa_ctrl unused bits UMD's have interest in setting unused bits of the oa_ctrl register "out of band" for certain experiments. To facilitate this, don't clobber previous oa_ctrl unused bits, i.e. rmw the values rather than simply write them. Fixes: e936f885f1e9 ("drm/xe/oa/uapi: Expose OA stream fd") Signed-off-by: Ashutosh Dixit Reviewed-by: Umesh Nerlige Ramappa Link: https://patchwork.freedesktop.org/patch/msgid/20250117032155.3048063-1-ashutosh.dixit@intel.com (cherry picked from commit cfa9d40db8c30d894171010fe765d96e9bc6a47e) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/regs/xe_oa_regs.h | 6 ++++++ drivers/gpu/drm/xe/xe_oa.c | 12 ++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/regs/xe_oa_regs.h b/drivers/gpu/drm/xe/regs/xe_oa_regs.h index a49561e9f3c31..a79ad2da070c2 100644 --- a/drivers/gpu/drm/xe/regs/xe_oa_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_oa_regs.h @@ -51,6 +51,10 @@ /* Common to all OA units */ #define OA_OACONTROL_REPORT_BC_MASK REG_GENMASK(9, 9) #define OA_OACONTROL_COUNTER_SIZE_MASK REG_GENMASK(8, 8) +#define OAG_OACONTROL_USED_BITS \ + (OAG_OACONTROL_OA_PES_DISAG_EN | OAG_OACONTROL_OA_CCS_SELECT_MASK | \ + OAG_OACONTROL_OA_COUNTER_SEL_MASK | OAG_OACONTROL_OA_COUNTER_ENABLE | \ + OA_OACONTROL_REPORT_BC_MASK | OA_OACONTROL_COUNTER_SIZE_MASK) #define OAG_OA_DEBUG XE_REG(0xdaf8, XE_REG_OPTION_MASKED) #define OAG_OA_DEBUG_DISABLE_MMIO_TRG REG_BIT(14) @@ -78,6 +82,8 @@ #define OAM_CONTEXT_CONTROL_OFFSET (0x1bc) #define OAM_CONTROL_OFFSET (0x194) #define OAM_CONTROL_COUNTER_SEL_MASK REG_GENMASK(3, 1) +#define OAM_OACONTROL_USED_BITS \ + (OAM_CONTROL_COUNTER_SEL_MASK | OAG_OACONTROL_OA_COUNTER_ENABLE) #define OAM_DEBUG_OFFSET (0x198) #define OAM_STATUS_OFFSET (0x19c) #define OAM_MMIO_TRG_OFFSET (0x1d0) diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c index eeb96b5f49e2a..6a08e6c92835f 100644 --- a/drivers/gpu/drm/xe/xe_oa.c +++ b/drivers/gpu/drm/xe/xe_oa.c @@ -452,6 +452,12 @@ static u32 __oa_ccs_select(struct xe_oa_stream *stream) return val; } +static u32 __oactrl_used_bits(struct xe_oa_stream *stream) +{ + return stream->hwe->oa_unit->type == DRM_XE_OA_UNIT_TYPE_OAG ? 
+ OAG_OACONTROL_USED_BITS : OAM_OACONTROL_USED_BITS; +} + static void xe_oa_enable(struct xe_oa_stream *stream) { const struct xe_oa_format *format = stream->oa_buffer.format; @@ -472,14 +478,14 @@ static void xe_oa_enable(struct xe_oa_stream *stream) stream->hwe->oa_unit->type == DRM_XE_OA_UNIT_TYPE_OAG) val |= OAG_OACONTROL_OA_PES_DISAG_EN; - xe_mmio_write32(&stream->gt->mmio, regs->oa_ctrl, val); + xe_mmio_rmw32(&stream->gt->mmio, regs->oa_ctrl, __oactrl_used_bits(stream), val); } static void xe_oa_disable(struct xe_oa_stream *stream) { struct xe_mmio *mmio = &stream->gt->mmio; - xe_mmio_write32(mmio, __oa_regs(stream)->oa_ctrl, 0); + xe_mmio_rmw32(mmio, __oa_regs(stream)->oa_ctrl, __oactrl_used_bits(stream), 0); if (xe_mmio_wait32(mmio, __oa_regs(stream)->oa_ctrl, OAG_OACONTROL_OA_COUNTER_ENABLE, 0, 50000, NULL, false)) drm_err(&stream->oa->xe->drm, @@ -2534,6 +2540,8 @@ static void __xe_oa_init_oa_units(struct xe_gt *gt) u->type = DRM_XE_OA_UNIT_TYPE_OAM; } + xe_mmio_write32(>->mmio, u->regs.oa_ctrl, 0); + /* Ensure MMIO trigger remains disabled till there is a stream */ xe_mmio_write32(>->mmio, u->regs.oa_debug, oag_configure_mmio_trigger(NULL, false)); -- GitLab From 9f706fd8024208b0686bb8ec68589d758f765672 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Tue, 21 Jan 2025 00:24:43 +0100 Subject: [PATCH 174/989] drm/xe/pf: Fix migration initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The migration support only needs to be initialized once, but it was incorrectly called from the xe_gt_sriov_pf_init_hw(), which is part of the reset flow and may be called multiple times. Fixes: d86e3737c7ab ("drm/xe/pf: Add functions to save and restore VF GuC state") Signed-off-by: Michal Wajdeczko Cc: Michał Winiarski Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20250120232443.544-1-michal.wajdeczko@intel.com (cherry picked from commit 9ebb5846e1a3b1705f8a7cbc528888a1aa0b163e) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_gt.c | 4 +++- drivers/gpu/drm/xe/xe_gt_sriov_pf.c | 14 +++++++++++++- drivers/gpu/drm/xe/xe_gt_sriov_pf.h | 6 ++++++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index 26e64530ada27..5d6fb79957b63 100644 --- a/drivers/gpu/drm/xe/xe_gt.c +++ b/drivers/gpu/drm/xe/xe_gt.c @@ -532,8 +532,10 @@ static int all_fw_domain_init(struct xe_gt *gt) if (IS_SRIOV_PF(gt_to_xe(gt)) && !xe_gt_is_media_type(gt)) xe_lmtt_init_hw(>_to_tile(gt)->sriov.pf.lmtt); - if (IS_SRIOV_PF(gt_to_xe(gt))) + if (IS_SRIOV_PF(gt_to_xe(gt))) { + xe_gt_sriov_pf_init(gt); xe_gt_sriov_pf_init_hw(gt); + } xe_force_wake_put(gt_to_fw(gt), fw_ref); diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf.c index e71fc3d2bda22..6f906c8e8108b 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf.c @@ -68,6 +68,19 @@ int xe_gt_sriov_pf_init_early(struct xe_gt *gt) return 0; } +/** + * xe_gt_sriov_pf_init - Prepare SR-IOV PF data structures on PF. + * @gt: the &xe_gt to initialize + * + * Late one-time initialization of the PF data. + * + * Return: 0 on success or a negative error code on failure. 
+ */ +int xe_gt_sriov_pf_init(struct xe_gt *gt) +{ + return xe_gt_sriov_pf_migration_init(gt); +} + static bool pf_needs_enable_ggtt_guest_update(struct xe_device *xe) { return GRAPHICS_VERx100(xe) == 1200; @@ -90,7 +103,6 @@ void xe_gt_sriov_pf_init_hw(struct xe_gt *gt) pf_enable_ggtt_guest_update(gt); xe_gt_sriov_pf_service_update(gt); - xe_gt_sriov_pf_migration_init(gt); } static u32 pf_get_vf_regs_stride(struct xe_device *xe) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf.h b/drivers/gpu/drm/xe/xe_gt_sriov_pf.h index 96fab779a906f..f474509411c0c 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf.h +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf.h @@ -10,6 +10,7 @@ struct xe_gt; #ifdef CONFIG_PCI_IOV int xe_gt_sriov_pf_init_early(struct xe_gt *gt); +int xe_gt_sriov_pf_init(struct xe_gt *gt); void xe_gt_sriov_pf_init_hw(struct xe_gt *gt); void xe_gt_sriov_pf_sanitize_hw(struct xe_gt *gt, unsigned int vfid); void xe_gt_sriov_pf_restart(struct xe_gt *gt); @@ -19,6 +20,11 @@ static inline int xe_gt_sriov_pf_init_early(struct xe_gt *gt) return 0; } +static inline int xe_gt_sriov_pf_init(struct xe_gt *gt) +{ + return 0; +} + static inline void xe_gt_sriov_pf_init_hw(struct xe_gt *gt) { } -- GitLab From 990d35edc5d333ca6cd3acfdfc13683dc5bb105f Mon Sep 17 00:00:00 2001 From: Ashutosh Dixit Date: Wed, 15 Jan 2025 14:20:29 -0800 Subject: [PATCH 175/989] drm/xe/oa: Set stream->pollin in xe_oa_buffer_check_unlocked We rely on stream->pollin to decide whether or not to block during poll/read calls. However, currently there are blocking read code paths which don't even set stream->pollin. The best place to consistently set stream->pollin for all code paths is therefore to set it in xe_oa_buffer_check_unlocked. Fixes: e936f885f1e9 ("drm/xe/oa/uapi: Expose OA stream fd") Signed-off-by: Ashutosh Dixit Acked-by: Rodrigo Vivi Reviewed-by: Jonathan Cavitt Reviewed-by: Umesh Nerlige Ramappa Link: https://patchwork.freedesktop.org/patch/msgid/20250115222029.3002103-1-ashutosh.dixit@intel.com (cherry picked from commit d3fedff828bb7e4a422c42caeafd5d974e24ee43) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_oa.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c index 6a08e6c92835f..fa873f3d0a9d1 100644 --- a/drivers/gpu/drm/xe/xe_oa.c +++ b/drivers/gpu/drm/xe/xe_oa.c @@ -237,7 +237,6 @@ static bool xe_oa_buffer_check_unlocked(struct xe_oa_stream *stream) u32 tail, hw_tail, partial_report_size, available; int report_size = stream->oa_buffer.format->size; unsigned long flags; - bool pollin; spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); @@ -282,11 +281,11 @@ static bool xe_oa_buffer_check_unlocked(struct xe_oa_stream *stream) stream->oa_buffer.tail = tail; available = xe_oa_circ_diff(stream, stream->oa_buffer.tail, stream->oa_buffer.head); - pollin = available >= stream->wait_num_reports * report_size; + stream->pollin = available >= stream->wait_num_reports * report_size; spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags); - return pollin; + return stream->pollin; } static enum hrtimer_restart xe_oa_poll_check_timer_cb(struct hrtimer *hrtimer) @@ -294,10 +293,8 @@ static enum hrtimer_restart xe_oa_poll_check_timer_cb(struct hrtimer *hrtimer) struct xe_oa_stream *stream = container_of(hrtimer, typeof(*stream), poll_check_timer); - if (xe_oa_buffer_check_unlocked(stream)) { - stream->pollin = true; + if (xe_oa_buffer_check_unlocked(stream)) wake_up(&stream->poll_wq); - } hrtimer_forward_now(hrtimer, 
ns_to_ktime(stream->poll_period_ns)); -- GitLab From 042c48b73699c47d84b6ace73036e5a31a0d4cfc Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 22 Jan 2025 21:11:11 -0800 Subject: [PATCH 176/989] drm/xe/devcoredump: Move exec queue snapshot to Contexts section MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Having the exec queue snapshot inside a "GuC CT" section was always wrong. Commit c28fd6c358db ("drm/xe/devcoredump: Improve section headings and add tile info") tried to fix that bug, but with that also broke the mesa tool that parses the devcoredump, hence it was reverted in commit a53da2fb25a3 ("drm/xe: Revert some changes that break a mesa debug tool"). With the mesa tool also fixed, this can propagate as a fix on both kernel and userspace side to avoid unnecessary headache for a debug feature. Cc: John Harrison Cc: Julia Filipchuk Cc: José Roberto de Souza Cc: stable@vger.kernel.org Fixes: a53da2fb25a3 ("drm/xe: Revert some changes that break a mesa debug tool") Reviewed-by: José Roberto de Souza Link: https://patchwork.freedesktop.org/patch/msgid/20250123051112.1938193-2-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit a37934ea75d331fafa7fe80b6180642ba5193422) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_devcoredump.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c index 81dc7795c0651..a7946a76777e7 100644 --- a/drivers/gpu/drm/xe/xe_devcoredump.c +++ b/drivers/gpu/drm/xe/xe_devcoredump.c @@ -119,11 +119,7 @@ static ssize_t __xe_devcoredump_read(char *buffer, size_t count, drm_puts(&p, "\n**** GuC CT ****\n"); xe_guc_ct_snapshot_print(ss->guc.ct, &p); - /* - * Don't add a new section header here because the mesa debug decoder - * tool expects the context information to be in the 'GuC CT' section. - */ - /* drm_puts(&p, "\n**** Contexts ****\n"); */ + drm_puts(&p, "\n**** Contexts ****\n"); xe_guc_exec_queue_snapshot_print(ss->ge, &p); drm_puts(&p, "\n**** Job ****\n"); -- GitLab From a9ab6591b45258b79af1cb66112fd9f83c8855da Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Thu, 23 Jan 2025 12:22:03 -0800 Subject: [PATCH 177/989] drm/xe: Fix and re-enable xe_print_blob_ascii85() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 70fb86a85dc9 ("drm/xe: Revert some changes that break a mesa debug tool") partially reverted some changes to workaround breakage caused to mesa tools. However, in doing so it also broke fetching the GuC log via debugfs since xe_print_blob_ascii85() simply bails out. The fix is to avoid the extra newlines: the devcoredump interface is line-oriented and adding random newlines in the middle breaks it. If a tool is able to parse it by looking at the data and checking for chars that are out of the ascii85 space, it can still do so. A format change that breaks the line-oriented output on devcoredump however needs better coordination with existing tools. 
v2: Add suffix description comment v3: Reword explanation of xe_print_blob_ascii85() calling drm_puts() in a loop Reviewed-by: José Roberto de Souza Cc: John Harrison Cc: Julia Filipchuk Cc: José Roberto de Souza Cc: stable@vger.kernel.org Fixes: 70fb86a85dc9 ("drm/xe: Revert some changes that break a mesa debug tool") Fixes: ec1455ce7e35 ("drm/xe/devcoredump: Add ASCII85 dump helper function") Link: https://patchwork.freedesktop.org/patch/msgid/20250123202307.95103-2-jose.souza@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 2c95bbf5002776117a69caed3b31c10bf7341bec) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_devcoredump.c | 34 +++++++++++------------------ drivers/gpu/drm/xe/xe_devcoredump.h | 2 +- drivers/gpu/drm/xe/xe_guc_ct.c | 3 ++- drivers/gpu/drm/xe/xe_guc_log.c | 4 +++- 4 files changed, 19 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c index a7946a76777e7..39fe485d20858 100644 --- a/drivers/gpu/drm/xe/xe_devcoredump.c +++ b/drivers/gpu/drm/xe/xe_devcoredump.c @@ -391,42 +391,34 @@ int xe_devcoredump_init(struct xe_device *xe) /** * xe_print_blob_ascii85 - print a BLOB to some useful location in ASCII85 * - * The output is split to multiple lines because some print targets, e.g. dmesg - * cannot handle arbitrarily long lines. Note also that printing to dmesg in - * piece-meal fashion is not possible, each separate call to drm_puts() has a - * line-feed automatically added! Therefore, the entire output line must be - * constructed in a local buffer first, then printed in one atomic output call. + * The output is split into multiple calls to drm_puts() because some print + * targets, e.g. dmesg, cannot handle arbitrarily long lines. These targets may + * add newlines, as is the case with dmesg: each drm_puts() call creates a + * separate line. * * There is also a scheduler yield call to prevent the 'task has been stuck for * 120s' kernel hang check feature from firing when printing to a slow target * such as dmesg over a serial port. * - * TODO: Add compression prior to the ASCII85 encoding to shrink huge buffers down. - * * @p: the printer object to output to * @prefix: optional prefix to add to output string + * @suffix: optional suffix to add at the end. 0 disables it and is + * not added to the output, which is useful when using multiple calls + * to dump data to @p * @blob: the Binary Large OBject to dump out * @offset: offset in bytes to skip from the front of the BLOB, must be a multiple of sizeof(u32) * @size: the size in bytes of the BLOB, must be a multiple of sizeof(u32) */ -void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, +void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffix, const void *blob, size_t offset, size_t size) { const u32 *blob32 = (const u32 *)blob; char buff[ASCII85_BUFSZ], *line_buff; size_t line_pos = 0; - /* - * Splitting blobs across multiple lines is not compatible with the mesa - * debug decoder tool. Note that even dropping the explicit '\n' below - * doesn't help because the GuC log is so big some underlying implementation - * still splits the lines at 512K characters. So just bail completely for - * the moment. 
- */ - return; - #define DMESG_MAX_LINE_LEN 800 -#define MIN_SPACE (ASCII85_BUFSZ + 2) /* 85 + "\n\0" */ + /* Always leave space for the suffix char and the \0 */ +#define MIN_SPACE (ASCII85_BUFSZ + 2) /* 85 + "\0" */ if (size & 3) drm_printf(p, "Size not word aligned: %zu", size); @@ -458,7 +450,6 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, line_pos += strlen(line_buff + line_pos); if ((line_pos + MIN_SPACE) >= DMESG_MAX_LINE_LEN) { - line_buff[line_pos++] = '\n'; line_buff[line_pos++] = 0; drm_puts(p, line_buff); @@ -470,10 +461,11 @@ void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, } } + if (suffix) + line_buff[line_pos++] = suffix; + if (line_pos) { - line_buff[line_pos++] = '\n'; line_buff[line_pos++] = 0; - drm_puts(p, line_buff); } diff --git a/drivers/gpu/drm/xe/xe_devcoredump.h b/drivers/gpu/drm/xe/xe_devcoredump.h index 6a17e6d601022..5391a80a4d1ba 100644 --- a/drivers/gpu/drm/xe/xe_devcoredump.h +++ b/drivers/gpu/drm/xe/xe_devcoredump.h @@ -29,7 +29,7 @@ static inline int xe_devcoredump_init(struct xe_device *xe) } #endif -void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, +void xe_print_blob_ascii85(struct drm_printer *p, const char *prefix, char suffix, const void *blob, size_t offset, size_t size); #endif diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c index 8b65c5e959cc2..50c8076b51585 100644 --- a/drivers/gpu/drm/xe/xe_guc_ct.c +++ b/drivers/gpu/drm/xe/xe_guc_ct.c @@ -1724,7 +1724,8 @@ void xe_guc_ct_snapshot_print(struct xe_guc_ct_snapshot *snapshot, snapshot->g2h_outstanding); if (snapshot->ctb) - xe_print_blob_ascii85(p, "CTB data", snapshot->ctb, 0, snapshot->ctb_size); + xe_print_blob_ascii85(p, "CTB data", '\n', + snapshot->ctb, 0, snapshot->ctb_size); } else { drm_puts(p, "CT disabled\n"); } diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c index df4cfb698cdbc..2baa4d95571fb 100644 --- a/drivers/gpu/drm/xe/xe_guc_log.c +++ b/drivers/gpu/drm/xe/xe_guc_log.c @@ -211,8 +211,10 @@ void xe_guc_log_snapshot_print(struct xe_guc_log_snapshot *snapshot, struct drm_ remain = snapshot->size; for (i = 0; i < snapshot->num_chunks; i++) { size_t size = min(GUC_LOG_CHUNK_SIZE, remain); + const char *prefix = i ? NULL : "Log data"; + char suffix = i == snapshot->num_chunks - 1 ? '\n' : 0; - xe_print_blob_ascii85(p, i ? NULL : "Log data", snapshot->copy[i], 0, size); + xe_print_blob_ascii85(p, prefix, suffix, snapshot->copy[i], 0, size); remain -= size; } } -- GitLab From d275a5e0c5f528b4b877ec683b8cd8bfced96af5 Mon Sep 17 00:00:00 2001 From: Devarsh Thakkar Date: Mon, 3 Feb 2025 21:24:31 +0530 Subject: [PATCH 178/989] dt-bindings: display: ti: Fix compatible for am62a7 dss Fix incorrect format of compatible string (comma instead of hyphen) for TI's AM62A7 SoC. 
s/ti,am62a7,dss/ti,am62a7-dss Fixes: 7959ceb767e4 ("dt-bindings: display: ti: Add support for am62a7 dss") Reviewed-by: Krzysztof Kozlowski Signed-off-by: Devarsh Thakkar Link: https://lore.kernel.org/r/20250203155431.2174170-1-devarsht@ti.com Signed-off-by: Rob Herring (Arm) --- Documentation/devicetree/bindings/display/ti/ti,am65x-dss.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/display/ti/ti,am65x-dss.yaml b/Documentation/devicetree/bindings/display/ti/ti,am65x-dss.yaml index 55e3e490d0e61..31c4ffcb599cd 100644 --- a/Documentation/devicetree/bindings/display/ti/ti,am65x-dss.yaml +++ b/Documentation/devicetree/bindings/display/ti/ti,am65x-dss.yaml @@ -23,7 +23,7 @@ properties: compatible: enum: - ti,am625-dss - - ti,am62a7,dss + - ti,am62a7-dss - ti,am65x-dss reg: -- GitLab From 1ddee69108d305bbc059cbf31c0b47626796be77 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 17 Jan 2025 16:21:45 +0200 Subject: [PATCH 179/989] pinctrl: cy8c95x0: Respect IRQ trigger settings from firmware Some of the platforms may connect the INT pin via inversion logic effectively make the triggering to be active-low. Remove explicit trigger flag to respect the settings from firmware. Without this change even idling chip produces spurious interrupts and kernel disables the line in the result: irq 33: nobody cared (try booting with the "irqpoll" option) CPU: 0 UID: 0 PID: 125 Comm: irq/33-i2c-INT3 Not tainted 6.12.0-00236-g8b874ed11dae #64 Hardware name: Intel Corp. QUARK/Galileo, BIOS 0x01000900 01/01/2014 ... handlers: [<86e86bea>] irq_default_primary_handler threaded [] cy8c95x0_irq_handler [pinctrl_cy8c95x0] Disabling IRQ #33 Fixes: e6cbbe42944d ("pinctrl: Add Cypress cy8c95x0 support") Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/20250117142304.596106-2-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-cy8c95x0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index 75100a9fb8e4c..d73004b4a45e7 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -1355,7 +1355,7 @@ static int cy8c95x0_irq_setup(struct cy8c95x0_pinctrl *chip, int irq) ret = devm_request_threaded_irq(chip->dev, irq, NULL, cy8c95x0_irq_handler, - IRQF_ONESHOT | IRQF_SHARED | IRQF_TRIGGER_HIGH, + IRQF_ONESHOT | IRQF_SHARED, dev_name(chip->dev), chip); if (ret) { dev_err(chip->dev, "failed to request irq %d\n", irq); -- GitLab From 902e09c8acde117b00369521f54df817a983d4ab Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 3 Feb 2025 16:16:09 -0500 Subject: [PATCH 180/989] fix braino in "9p: fix ->rename_sem exclusion" ->d_op can bloody well be NULL Fucked-up-by: Al Viro Fixes: 30d61efe118c "9p: fix ->rename_sem exclusion" Signed-off-by: Al Viro --- fs/dcache.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 903142b324e98..8a605681b26ff 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2967,11 +2967,11 @@ static int __d_unalias(struct dentry *dentry, struct dentry *alias) goto out_err; m2 = &alias->d_parent->d_inode->i_rwsem; out_unalias: - if (alias->d_op->d_unalias_trylock && + if (alias->d_op && alias->d_op->d_unalias_trylock && !alias->d_op->d_unalias_trylock(alias)) goto out_err; __d_move(alias, dentry, false); - if (alias->d_op->d_unalias_unlock) + if (alias->d_op && alias->d_op->d_unalias_unlock) alias->d_op->d_unalias_unlock(alias); ret = 0; 
out_err: -- GitLab From 3a4e7193ec37ee2476ce726589de4495a066b565 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 1 Feb 2025 16:50:24 -0800 Subject: [PATCH 181/989] MAINTAINERS: list openvswitch docs under its entry Submissions to the docs seem to not get properly CCed. Acked-by: Ilya Maximets Link: https://patch.msgid.link/20250202005024.964262-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index d1086e53a3176..c7b8c6535a1e2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17706,6 +17706,7 @@ L: netdev@vger.kernel.org L: dev@openvswitch.org S: Maintained W: http://openvswitch.org +F: Documentation/networking/openvswitch.rst F: include/uapi/linux/openvswitch.h F: net/openvswitch/ F: tools/testing/selftests/net/openvswitch/ -- GitLab From 4d896b35394144c246daaeb5280a015a630958e7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 1 Feb 2025 17:47:26 -0800 Subject: [PATCH 182/989] MAINTAINERS: add Kuniyuki Iwashima to TCP reviewers List Kuniyuki as an official TCP reviewer. Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250202014728.1005003-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index c7b8c6535a1e2..48677d61c97bd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16614,6 +16614,7 @@ F: tools/testing/selftests/net/mptcp/ NETWORKING [TCP] M: Eric Dumazet M: Neal Cardwell +R: Kuniyuki Iwashima L: netdev@vger.kernel.org S: Maintained F: Documentation/networking/net_cachelines/tcp_sock.rst -- GitLab From ae0585b04ab741b536b0db20c12baf24bf7118d2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 1 Feb 2025 17:47:27 -0800 Subject: [PATCH 183/989] MAINTAINERS: add a general entry for BSD sockets Create a MAINTAINERS entry for BSD sockets. List the top 3 reviewers as maintainers. The entry is meant to cover core socket code (of which there isn't much) but also reviews of any new socket families. Reviewed-by: Simon Horman Acked-by: Willem de Bruijn Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250202014728.1005003-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 48677d61c97bd..438d85bb97a22 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16642,6 +16642,22 @@ F: include/net/tls.h F: include/uapi/linux/tls.h F: net/tls/* +NETWORKING [SOCKETS] +M: Eric Dumazet +M: Kuniyuki Iwashima +M: Paolo Abeni +M: Willem de Bruijn +S: Maintained +F: include/linux/sock_diag.h +F: include/linux/socket.h +F: include/linux/sockptr.h +F: include/net/sock.h +F: include/net/sock_reuseport.h +F: include/uapi/linux/socket.h +F: net/core/*sock* +F: net/core/scm.c +F: net/socket.c + NETXEN (1/10) GbE SUPPORT M: Manish Chopra M: Rahul Verma -- GitLab From 8a2e22f665a0b5c212057031e94b75cfdc11a4a6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 1 Feb 2025 17:47:28 -0800 Subject: [PATCH 184/989] MAINTAINERS: add entry for UNIX sockets Add a MAINTAINERS entry for UNIX socket, Kuniyuki has been the de-facto maintainer of this code for a while. 
Reviewed-by: Simon Horman Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250202014728.1005003-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 438d85bb97a22..74b09dad46626 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16658,6 +16658,15 @@ F: net/core/*sock* F: net/core/scm.c F: net/socket.c +NETWORKING [UNIX SOCKETS] +M: Kuniyuki Iwashima +S: Maintained +F: include/net/af_unix.h +F: include/net/netns/unix.h +F: include/uapi/linux/unix_diag.h +F: net/unix/ +F: tools/testing/selftests/net/af_unix/ + NETXEN (1/10) GbE SUPPORT M: Manish Chopra M: Rahul Verma -- GitLab From 45ab5166a82d038c898985b0ad43ead69c1f9573 Mon Sep 17 00:00:00 2001 From: Charles Han Date: Thu, 14 Nov 2024 17:19:47 +0800 Subject: [PATCH 185/989] HID: winwing: Add NULL check in winwing_init_led() devm_kasprintf() can return a NULL pointer on failure, but this returned value in winwing_init_led() is not checked. Add a NULL check in winwing_init_led() to handle a kernel NULL pointer dereference error. Fixes: 266c990debad ("HID: Add WinWing Orion2 throttle support") Signed-off-by: Charles Han Signed-off-by: Jiri Kosina --- drivers/hid/hid-winwing.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/hid/hid-winwing.c b/drivers/hid/hid-winwing.c index 831b760c66ea7..d4afbbd278079 100644 --- a/drivers/hid/hid-winwing.c +++ b/drivers/hid/hid-winwing.c @@ -106,6 +106,8 @@ static int winwing_init_led(struct hid_device *hdev, "%s::%s", dev_name(&input->dev), info->led_name); + if (!led->cdev.name) + return -ENOMEM; ret = devm_led_classdev_register(&hdev->dev, &led->cdev); if (ret) -- GitLab From 9b8e2220d3a052a690b1d1b23019673e612494c5 Mon Sep 17 00:00:00 2001 From: Charles Han Date: Fri, 15 Nov 2024 14:26:21 +0800 Subject: [PATCH 186/989] HID: multitouch: Add NULL check in mt_input_configured devm_kasprintf() can return a NULL pointer on failure, but this returned value in mt_input_configured() is not checked. Add a NULL check in mt_input_configured() to handle a kernel NULL pointer dereference error. Fixes: 479439463529 ("HID: multitouch: Correct devm device reference for hidinput input_dev name") Signed-off-by: Charles Han Signed-off-by: Jiri Kosina --- drivers/hid/hid-multitouch.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index 82900857bfd87..e50887a6d22c2 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -1679,9 +1679,12 @@ static int mt_input_configured(struct hid_device *hdev, struct hid_input *hi) break; } - if (suffix) + if (suffix) { hi->input->name = devm_kasprintf(&hdev->dev, GFP_KERNEL, "%s %s", hdev->name, suffix); + if (!hi->input->name) + return -ENOMEM; + } return 0; } -- GitLab From 3d4114a1d34413dfffa0094c2eb7b95e61087abd Mon Sep 17 00:00:00 2001 From: Avri Altman Date: Tue, 28 Jan 2025 09:12:06 +0200 Subject: [PATCH 187/989] scsi: ufs: core: Ensure clk_gating.lock is used only after initialization Address a lockdep warning triggered by the use of the clk_gating.lock before it is properly initialized. The warning is as follows: [ 4.388838] INFO: trying to register non-static key. [ 4.395673] The code is fine but needs lockdep annotation, or maybe [ 4.402118] you didn't initialize this object before use? [ 4.407673] turning off the locking correctness validator. 
[ 4.413334] CPU: 5 UID: 0 PID: 58 Comm: kworker/u32:1 Not tainted 6.12-rc1 #185 [ 4.413343] Hardware name: Qualcomm Technologies, Inc. Robotics RB5 (DT) [ 4.413362] Call trace: [ 4.413364] show_stack+0x18/0x24 (C) [ 4.413374] dump_stack_lvl+0x90/0xd0 [ 4.413384] dump_stack+0x18/0x24 [ 4.413392] register_lock_class+0x498/0x4a8 [ 4.413400] __lock_acquire+0xb4/0x1b90 [ 4.413406] lock_acquire+0x114/0x310 [ 4.413413] _raw_spin_lock_irqsave+0x60/0x88 [ 4.413423] ufshcd_setup_clocks+0x2c0/0x490 [ 4.413433] ufshcd_init+0x198/0x10ec [ 4.413437] ufshcd_pltfrm_init+0x600/0x7c0 [ 4.413444] ufs_qcom_probe+0x20/0x58 [ 4.413449] platform_probe+0x68/0xd8 [ 4.413459] really_probe+0xbc/0x268 [ 4.413466] __driver_probe_device+0x78/0x12c [ 4.413473] driver_probe_device+0x40/0x11c [ 4.413481] __device_attach_driver+0xb8/0xf8 [ 4.413489] bus_for_each_drv+0x84/0xe4 [ 4.413495] __device_attach+0xfc/0x18c [ 4.413502] device_initial_probe+0x14/0x20 [ 4.413510] bus_probe_device+0xb0/0xb4 [ 4.413517] deferred_probe_work_func+0x8c/0xc8 [ 4.413524] process_scheduled_works+0x250/0x658 [ 4.413534] worker_thread+0x15c/0x2c8 [ 4.413542] kthread+0x134/0x200 [ 4.413550] ret_from_fork+0x10/0x20 To fix this issue, ensure that the spinlock is only used after it has been properly initialized before using it in ufshcd_setup_clocks(). Do that unconditionally as initializing a spinlock is a fast operation. Fixes: 209f4e43b806 ("scsi: ufs: core: Introduce a new clock_gating lock") Reported-by: Dmitry Baryshkov Tested-by: Geert Uytterhoeven Signed-off-by: Avri Altman Link: https://lore.kernel.org/r/20250128071207.75494-2-avri.altman@wdc.com Reviewed-by: Bean Huo Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 356e1be33f534..db20b1d505b79 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -2120,8 +2120,6 @@ static void ufshcd_init_clk_gating(struct ufs_hba *hba) INIT_DELAYED_WORK(&hba->clk_gating.gate_work, ufshcd_gate_work); INIT_WORK(&hba->clk_gating.ungate_work, ufshcd_ungate_work); - spin_lock_init(&hba->clk_gating.lock); - hba->clk_gating.clk_gating_workq = alloc_ordered_workqueue( "ufs_clk_gating_%d", WQ_MEM_RECLAIM | WQ_HIGHPRI, hba->host->host_no); @@ -10411,6 +10409,12 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) hba->irq = irq; hba->vps = &ufs_hba_vps; + /* + * Initialize clk_gating.lock early since it is being used in + * ufshcd_setup_clocks() + */ + spin_lock_init(&hba->clk_gating.lock); + err = ufshcd_hba_init(hba); if (err) goto out_error; -- GitLab From 839a74b5649c9f41d939a05059b5ca6b17156d03 Mon Sep 17 00:00:00 2001 From: Avri Altman Date: Tue, 28 Jan 2025 09:12:07 +0200 Subject: [PATCH 188/989] scsi: ufs: Fix toggling of clk_gating.state when clock gating is not allowed This commit addresses an issue where clk_gating.state is being toggled in ufshcd_setup_clocks() even if clock gating is not allowed. The fix is to add a check for hba->clk_gating.is_initialized before toggling clk_gating.state in ufshcd_setup_clocks(). Since clk_gating.lock is now initialized unconditionally, it can no longer lead to the spinlock being used before it is properly initialized, but instead it is mostly for documentation purposes. 
Fixes: 1ab27c9cf8b6 ("ufs: Add support for clock gating") Reported-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Signed-off-by: Avri Altman Link: https://lore.kernel.org/r/20250128071207.75494-3-avri.altman@wdc.com Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index db20b1d505b79..d3741b1f43821 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -9140,7 +9140,7 @@ static int ufshcd_setup_clocks(struct ufs_hba *hba, bool on) if (!IS_ERR_OR_NULL(clki->clk) && clki->enabled) clk_disable_unprepare(clki->clk); } - } else if (!ret && on) { + } else if (!ret && on && hba->clk_gating.is_initialized) { scoped_guard(spinlock_irqsave, &hba->clk_gating.lock) hba->clk_gating.state = CLKS_ON; trace_ufshcd_clk_gating(dev_name(hba->dev), -- GitLab From 18c966b62819b9d3b99eac8fb8cdc8950826e0c2 Mon Sep 17 00:00:00 2001 From: Zhang Lixu Date: Thu, 23 Jan 2025 09:30:44 +0800 Subject: [PATCH 189/989] HID: intel-ish-hid: ipc: Add Panther Lake PCI device IDs Add device IDs of Panther Lake-H and Panther Lake-P into ishtp support list. Signed-off-by: Zhang Lixu Acked-by: Srinivas Pandruvada Signed-off-by: Jiri Kosina --- drivers/hid/intel-ish-hid/ipc/hw-ish.h | 2 ++ drivers/hid/intel-ish-hid/ipc/pci-ish.c | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/drivers/hid/intel-ish-hid/ipc/hw-ish.h b/drivers/hid/intel-ish-hid/ipc/hw-ish.h index cdd80c653918b..07e90d51f073c 100644 --- a/drivers/hid/intel-ish-hid/ipc/hw-ish.h +++ b/drivers/hid/intel-ish-hid/ipc/hw-ish.h @@ -36,6 +36,8 @@ #define PCI_DEVICE_ID_INTEL_ISH_ARL_H 0x7745 #define PCI_DEVICE_ID_INTEL_ISH_ARL_S 0x7F78 #define PCI_DEVICE_ID_INTEL_ISH_LNL_M 0xA845 +#define PCI_DEVICE_ID_INTEL_ISH_PTL_H 0xE345 +#define PCI_DEVICE_ID_INTEL_ISH_PTL_P 0xE445 #define REVISION_ID_CHT_A0 0x6 #define REVISION_ID_CHT_Ax_SI 0x0 diff --git a/drivers/hid/intel-ish-hid/ipc/pci-ish.c b/drivers/hid/intel-ish-hid/ipc/pci-ish.c index 9e2401291a2f6..ff0fc80100728 100644 --- a/drivers/hid/intel-ish-hid/ipc/pci-ish.c +++ b/drivers/hid/intel-ish-hid/ipc/pci-ish.c @@ -26,9 +26,11 @@ enum ishtp_driver_data_index { ISHTP_DRIVER_DATA_NONE, ISHTP_DRIVER_DATA_LNL_M, + ISHTP_DRIVER_DATA_PTL, }; #define ISH_FW_GEN_LNL_M "lnlm" +#define ISH_FW_GEN_PTL "ptl" #define ISH_FIRMWARE_PATH(gen) "intel/ish/ish_" gen ".bin" #define ISH_FIRMWARE_PATH_ALL "intel/ish/ish_*.bin" @@ -37,6 +39,9 @@ static struct ishtp_driver_data ishtp_driver_data[] = { [ISHTP_DRIVER_DATA_LNL_M] = { .fw_generation = ISH_FW_GEN_LNL_M, }, + [ISHTP_DRIVER_DATA_PTL] = { + .fw_generation = ISH_FW_GEN_PTL, + }, }; static const struct pci_device_id ish_pci_tbl[] = { @@ -63,6 +68,8 @@ static const struct pci_device_id ish_pci_tbl[] = { {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_ISH_ARL_H)}, {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_ISH_ARL_S)}, {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_ISH_LNL_M), .driver_data = ISHTP_DRIVER_DATA_LNL_M}, + {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_ISH_PTL_H), .driver_data = ISHTP_DRIVER_DATA_PTL}, + {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_ISH_PTL_P), .driver_data = ISHTP_DRIVER_DATA_PTL}, {} }; MODULE_DEVICE_TABLE(pci, ish_pci_tbl); -- GitLab From 9271af9d846c7e49c8709b58d5853cb73c00b193 Mon Sep 17 00:00:00 2001 From: Daniel Brackenbury Date: Tue, 28 Jan 2025 20:08:49 -0500 Subject: [PATCH 190/989] HID: topre: Fix n-key rollover on Realforce R3S TKL boards Newer model R3* Topre Realforce keyboards share an 
issue with their older R2 cousins where a report descriptor fixup is needed in order for n-key rollover to work correctly, otherwise only 6-key rollover is available. This patch adds some new hardware IDs for the R3S 87-key keyboard and makes amendments to the existing hid-topre driver in order to change the correct byte in the new model. Signed-off-by: Daniel Brackenbury Signed-off-by: Jiri Kosina --- drivers/hid/Kconfig | 3 ++- drivers/hid/hid-ids.h | 1 + drivers/hid/hid-topre.c | 7 +++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index ed657ef7281c8..dfc245867a46a 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -1169,7 +1169,8 @@ config HID_TOPRE tristate "Topre REALFORCE keyboards" depends on HID help - Say Y for N-key rollover support on Topre REALFORCE R2 108/87 key keyboards. + Say Y for N-key rollover support on Topre REALFORCE R2 108/87 key and + Topre REALFORCE R3S 87 key keyboards. config HID_THINGM tristate "ThingM blink(1) USB RGB LED" diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 7debfe0c5cb98..ed1d7f9e8caf4 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -1302,6 +1302,7 @@ #define USB_VENDOR_ID_TOPRE 0x0853 #define USB_DEVICE_ID_TOPRE_REALFORCE_R2_108 0x0148 #define USB_DEVICE_ID_TOPRE_REALFORCE_R2_87 0x0146 +#define USB_DEVICE_ID_TOPRE_REALFORCE_R3S_87 0x0313 #define USB_VENDOR_ID_TOPSEED 0x0766 #define USB_DEVICE_ID_TOPSEED_CYBERLINK 0x0204 diff --git a/drivers/hid/hid-topre.c b/drivers/hid/hid-topre.c index 848361f6225df..ccedf8721722e 100644 --- a/drivers/hid/hid-topre.c +++ b/drivers/hid/hid-topre.c @@ -29,6 +29,11 @@ static const __u8 *topre_report_fixup(struct hid_device *hdev, __u8 *rdesc, hid_info(hdev, "fixing up Topre REALFORCE keyboard report descriptor\n"); rdesc[72] = 0x02; + } else if (*rsize >= 106 && rdesc[28] == 0x29 && rdesc[29] == 0xe7 && + rdesc[30] == 0x81 && rdesc[31] == 0x00) { + hid_info(hdev, + "fixing up Topre REALFORCE keyboard report descriptor\n"); + rdesc[31] = 0x02; } return rdesc; } @@ -38,6 +43,8 @@ static const struct hid_device_id topre_id_table[] = { USB_DEVICE_ID_TOPRE_REALFORCE_R2_108) }, { HID_USB_DEVICE(USB_VENDOR_ID_TOPRE, USB_DEVICE_ID_TOPRE_REALFORCE_R2_87) }, + { HID_USB_DEVICE(USB_VENDOR_ID_TOPRE, + USB_DEVICE_ID_TOPRE_REALFORCE_R3S_87) }, { } }; MODULE_DEVICE_TABLE(hid, topre_id_table); -- GitLab From 5363ee9d110e139584c2d92a0b640bc210588506 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 28 Jan 2025 16:35:39 -0500 Subject: [PATCH 191/989] scsi: core: Use GFP_NOIO to avoid circular locking dependency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Filesystems can write to disk from page reclaim with __GFP_FS set. Marc found a case where scsi_realloc_sdev_budget_map() ends up in page reclaim with GFP_KERNEL, where it could try to take filesystem locks again, leading to a deadlock. WARNING: possible circular locking dependency detected 6.13.0 #1 Not tainted ------------------------------------------------------ kswapd0/70 is trying to acquire lock: ffff8881025d5d78 (&q->q_usage_counter(io)){++++}-{0:0}, at: blk_mq_submit_bio+0x461/0x6e0 but task is already holding lock: ffffffff81ef5f40 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x9f/0x760 The full lockdep splat can be found in Marc's report: https://lkml.org/lkml/2025/1/24/1101 Avoid the potential deadlock by doing the allocation with GFP_NOIO, which prevents both filesystem and block layer recursion. 
Reported-by: Marc Aurèle La France Signed-off-by: Rik van Riel Link: https://lore.kernel.org/r/20250129104525.0ae8421e@fangorn Reviewed-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_scan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 087fcbfc9aaa3..96d7e1a9a7c7a 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -246,7 +246,7 @@ static int scsi_realloc_sdev_budget_map(struct scsi_device *sdev, } ret = sbitmap_init_node(&sdev->budget_map, scsi_device_max_queue_depth(sdev), - new_shift, GFP_KERNEL, + new_shift, GFP_NOIO, sdev->request_queue->node, false, true); if (!ret) sbitmap_resize(&sdev->budget_map, depth); -- GitLab From 9ff7c383b8ac0c482a1da7989f703406d78445c6 Mon Sep 17 00:00:00 2001 From: Igor Pylypiv Date: Fri, 31 Jan 2025 10:44:07 -0800 Subject: [PATCH 192/989] scsi: core: Do not retry I/Os during depopulation Fail I/Os instead of retry to prevent user space processes from being blocked on the I/O completion for several minutes. Retrying I/Os during "depopulation in progress" or "depopulation restore in progress" results in a continuous retry loop until the depopulation completes or until the I/O retry loop is aborted due to a timeout by the scsi_cmd_runtime_exceeced(). Depopulation is slow and can take 24+ hours to complete on 20+ TB HDDs. Most I/Os in the depopulation retry loop end up taking several minutes before returning the failure to user space. Cc: stable@vger.kernel.org # 4.18.x: 2bbeb8d scsi: core: Handle depopulation and restoration in progress Cc: stable@vger.kernel.org # 4.18.x Fixes: e37c7d9a0341 ("scsi: core: sanitize++ in progress") Signed-off-by: Igor Pylypiv Link: https://lore.kernel.org/r/20250131184408.859579-1-ipylypiv@google.com Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index d776f13cd160b..be0890e4e7062 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -872,13 +872,18 @@ static void scsi_io_completion_action(struct scsi_cmnd *cmd, int result) case 0x1a: /* start stop unit in progress */ case 0x1b: /* sanitize in progress */ case 0x1d: /* configuration in progress */ - case 0x24: /* depopulation in progress */ - case 0x25: /* depopulation restore in progress */ action = ACTION_DELAYED_RETRY; break; case 0x0a: /* ALUA state transition */ action = ACTION_DELAYED_REPREP; break; + /* + * Depopulation might take many hours, + * thus it is not worthwhile to retry. + */ + case 0x24: /* depopulation in progress */ + case 0x25: /* depopulation restore in progress */ + fallthrough; default: action = ACTION_FAIL; break; -- GitLab From f8fb2403ddebb5eea0033d90d9daae4c88749ada Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Draszik?= Date: Fri, 24 Jan 2025 15:09:00 +0000 Subject: [PATCH 193/989] scsi: ufs: core: Fix use-after free in init error and remove paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit devm_blk_crypto_profile_init() registers a cleanup handler to run when the associated (platform-) device is being released. For UFS, the crypto private data and pointers are stored as part of the ufs_hba's data structure 'struct ufs_hba::crypto_profile'. This structure is allocated as part of the underlying ufshcd and therefore Scsi_host allocation. 
During driver release or during error handling in ufshcd_pltfrm_init(), this structure is released as part of ufshcd_dealloc_host() before the (platform-) device associated with the crypto call above is released. Once this device is released, the crypto cleanup code will run, using the just-released 'struct ufs_hba::crypto_profile'. This causes a use-after-free situation: Call trace: kfree+0x60/0x2d8 (P) kvfree+0x44/0x60 blk_crypto_profile_destroy_callback+0x28/0x70 devm_action_release+0x1c/0x30 release_nodes+0x6c/0x108 devres_release_all+0x98/0x100 device_unbind_cleanup+0x20/0x70 really_probe+0x218/0x2d0 In other words, the initialisation code flow is: platform-device probe ufshcd_pltfrm_init() ufshcd_alloc_host() scsi_host_alloc() allocation of struct ufs_hba creation of scsi-host devices devm_blk_crypto_profile_init() devm registration of cleanup handler using platform-device and during error handling of ufshcd_pltfrm_init() or during driver removal: ufshcd_dealloc_host() scsi_host_put() put_device(scsi-host) release of struct ufs_hba put_device(platform-device) crypto cleanup handler To fix this use-after free, change ufshcd_alloc_host() to register a devres action to automatically cleanup the underlying SCSI device on ufshcd destruction, without requiring explicit calls to ufshcd_dealloc_host(). This way: * the crypto profile and all other ufs_hba-owned resources are destroyed before SCSI (as they've been registered after) * a memleak is plugged in tc-dwc-g210-pci.c remove() as a side-effect * EXPORT_SYMBOL_GPL(ufshcd_dealloc_host) can be removed fully as it's not needed anymore * no future drivers using ufshcd_alloc_host() could ever forget adding the cleanup Fixes: cb77cb5abe1f ("blk-crypto: rename blk_keyslot_manager to blk_crypto_profile") Fixes: d76d9d7d1009 ("scsi: ufs: use devm_blk_ksm_init()") Cc: stable@vger.kernel.org Signed-off-by: André Draszik Link: https://lore.kernel.org/r/20250124-ufshcd-fix-v4-1-c5d0144aae59@linaro.org Reviewed-by: Bean Huo Reviewed-by: Manivannan Sadhasivam Acked-by: Eric Biggers Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 31 +++++++++++++++++++++---------- drivers/ufs/host/ufshcd-pci.c | 2 -- drivers/ufs/host/ufshcd-pltfrm.c | 28 +++++++++------------------- include/ufs/ufshcd.h | 1 - 4 files changed, 30 insertions(+), 32 deletions(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index d3741b1f43821..d2de80b2bba46 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -10226,16 +10226,6 @@ int ufshcd_system_thaw(struct device *dev) EXPORT_SYMBOL_GPL(ufshcd_system_thaw); #endif /* CONFIG_PM_SLEEP */ -/** - * ufshcd_dealloc_host - deallocate Host Bus Adapter (HBA) - * @hba: pointer to Host Bus Adapter (HBA) - */ -void ufshcd_dealloc_host(struct ufs_hba *hba) -{ - scsi_host_put(hba->host); -} -EXPORT_SYMBOL_GPL(ufshcd_dealloc_host); - /** * ufshcd_set_dma_mask - Set dma mask based on the controller * addressing capability @@ -10254,12 +10244,26 @@ static int ufshcd_set_dma_mask(struct ufs_hba *hba) return dma_set_mask_and_coherent(hba->dev, DMA_BIT_MASK(32)); } +/** + * ufshcd_devres_release - devres cleanup handler, invoked during release of + * hba->dev + * @host: pointer to SCSI host + */ +static void ufshcd_devres_release(void *host) +{ + scsi_host_put(host); +} + /** * ufshcd_alloc_host - allocate Host Bus Adapter (HBA) * @dev: pointer to device handle * @hba_handle: driver private handle * * Return: 0 on success, non-zero value on failure. 
+ * + * NOTE: There is no corresponding ufshcd_dealloc_host() because this function + * keeps track of its allocations using devres and deallocates everything on + * device removal automatically. */ int ufshcd_alloc_host(struct device *dev, struct ufs_hba **hba_handle) { @@ -10281,6 +10285,13 @@ int ufshcd_alloc_host(struct device *dev, struct ufs_hba **hba_handle) err = -ENOMEM; goto out_error; } + + err = devm_add_action_or_reset(dev, ufshcd_devres_release, + host); + if (err) + return dev_err_probe(dev, err, + "failed to add ufshcd dealloc action\n"); + host->nr_maps = HCTX_TYPE_POLL + 1; hba = shost_priv(host); hba->host = host; diff --git a/drivers/ufs/host/ufshcd-pci.c b/drivers/ufs/host/ufshcd-pci.c index ea39c5d5b8cf1..9cfcaad23cf92 100644 --- a/drivers/ufs/host/ufshcd-pci.c +++ b/drivers/ufs/host/ufshcd-pci.c @@ -562,7 +562,6 @@ static void ufshcd_pci_remove(struct pci_dev *pdev) pm_runtime_forbid(&pdev->dev); pm_runtime_get_noresume(&pdev->dev); ufshcd_remove(hba); - ufshcd_dealloc_host(hba); } /** @@ -605,7 +604,6 @@ ufshcd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) err = ufshcd_init(hba, mmio_base, pdev->irq); if (err) { dev_err(&pdev->dev, "Initialization failed\n"); - ufshcd_dealloc_host(hba); return err; } diff --git a/drivers/ufs/host/ufshcd-pltfrm.c b/drivers/ufs/host/ufshcd-pltfrm.c index 505572d4fa878..ffe5d1d2b2158 100644 --- a/drivers/ufs/host/ufshcd-pltfrm.c +++ b/drivers/ufs/host/ufshcd-pltfrm.c @@ -465,21 +465,17 @@ int ufshcd_pltfrm_init(struct platform_device *pdev, struct device *dev = &pdev->dev; mmio_base = devm_platform_ioremap_resource(pdev, 0); - if (IS_ERR(mmio_base)) { - err = PTR_ERR(mmio_base); - goto out; - } + if (IS_ERR(mmio_base)) + return PTR_ERR(mmio_base); irq = platform_get_irq(pdev, 0); - if (irq < 0) { - err = irq; - goto out; - } + if (irq < 0) + return irq; err = ufshcd_alloc_host(dev, &hba); if (err) { dev_err(dev, "Allocation failed\n"); - goto out; + return err; } hba->vops = vops; @@ -488,13 +484,13 @@ int ufshcd_pltfrm_init(struct platform_device *pdev, if (err) { dev_err(dev, "%s: clock parse failed %d\n", __func__, err); - goto dealloc_host; + return err; } err = ufshcd_parse_regulator_info(hba); if (err) { dev_err(dev, "%s: regulator init failed %d\n", __func__, err); - goto dealloc_host; + return err; } ufshcd_init_lanes_per_dir(hba); @@ -502,25 +498,20 @@ int ufshcd_pltfrm_init(struct platform_device *pdev, err = ufshcd_parse_operating_points(hba); if (err) { dev_err(dev, "%s: OPP parse failed %d\n", __func__, err); - goto dealloc_host; + return err; } err = ufshcd_init(hba, mmio_base, irq); if (err) { dev_err_probe(dev, err, "Initialization failed with error %d\n", err); - goto dealloc_host; + return err; } pm_runtime_set_active(dev); pm_runtime_enable(dev); return 0; - -dealloc_host: - ufshcd_dealloc_host(hba); -out: - return err; } EXPORT_SYMBOL_GPL(ufshcd_pltfrm_init); @@ -534,7 +525,6 @@ void ufshcd_pltfrm_remove(struct platform_device *pdev) pm_runtime_get_sync(&pdev->dev); ufshcd_remove(hba); - ufshcd_dealloc_host(hba); pm_runtime_disable(&pdev->dev); pm_runtime_put_noidle(&pdev->dev); } diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index 650ff238cd74e..8bf31e6ca4e51 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -1309,7 +1309,6 @@ static inline void ufshcd_rmwl(struct ufs_hba *hba, u32 mask, u32 val, u32 reg) void ufshcd_enable_irq(struct ufs_hba *hba); void ufshcd_disable_irq(struct ufs_hba *hba); int ufshcd_alloc_host(struct device *, struct ufs_hba **); -void 
ufshcd_dealloc_host(struct ufs_hba *); int ufshcd_hba_enable(struct ufs_hba *hba); int ufshcd_init(struct ufs_hba *, void __iomem *, unsigned int); int ufshcd_link_recovery(struct ufs_hba *hba); -- GitLab From 87c4b5e8a6b65189abd9ea5010ab308941f964a4 Mon Sep 17 00:00:00 2001 From: Long Li Date: Wed, 22 Jan 2025 19:07:22 -0800 Subject: [PATCH 194/989] scsi: storvsc: Set correct data length for sending SCSI command without payload In StorVSC, payload->range.len is used to indicate if this SCSI command carries payload. This data is allocated as part of the private driver data by the upper layer and may get passed to lower driver uninitialized. For example, the SCSI error handling mid layer may send TEST_UNIT_READY or REQUEST_SENSE while reusing the buffer from a failed command. The private data section may have stale data from the previous command. If the SCSI command doesn't carry payload, the driver may use this value as is for communicating with host, resulting in possible corruption. Fix this by always initializing this value. Fixes: be0cf6ca301c ("scsi: storvsc: Set the tablesize based on the information given by the host") Cc: stable@kernel.org Tested-by: Roman Kisel Reviewed-by: Roman Kisel Reviewed-by: Michael Kelley Signed-off-by: Long Li Link: https://lore.kernel.org/r/1737601642-7759-1-git-send-email-longli@linuxonhyperv.com Signed-off-by: Martin K. Petersen --- drivers/scsi/storvsc_drv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 5a101ac06c478..a8614e54544e5 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1800,6 +1800,7 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd) length = scsi_bufflen(scmnd); payload = (struct vmbus_packet_mpb_array *)&cmd_request->mpb; + payload->range.len = 0; payload_sz = 0; if (scsi_sg_count(scmnd)) { -- GitLab From 1a78a56ea65252bb089e0daace989167227f2d31 Mon Sep 17 00:00:00 2001 From: Seunghui Lee Date: Sat, 18 Jan 2025 11:38:08 +0900 Subject: [PATCH 195/989] scsi: ufs: core: Fix error return with query response There is currently no mechanism to return error from query responses. Return the error and print the corresponding error message with it. Signed-off-by: Seunghui Lee Link: https://lore.kernel.org/r/20250118023808.24726-1-sh043.lee@samsung.com Reviewed-by: Bean Huo Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index d2de80b2bba46..1893a7ad95316 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -3104,8 +3104,13 @@ ufshcd_dev_cmd_completion(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) case UPIU_TRANSACTION_QUERY_RSP: { u8 response = lrbp->ucd_rsp_ptr->header.response; - if (response == 0) + if (response == 0) { err = ufshcd_copy_query_response(hba, lrbp); + } else { + err = -EINVAL; + dev_err(hba->dev, "%s: unexpected response in Query RSP: %x\n", + __func__, response); + } break; } case UPIU_TRANSACTION_REJECT_UPIU: -- GitLab From 5233e3235dec3065ccc632729675575dbe3c6b8a Mon Sep 17 00:00:00 2001 From: Magnus Lindholm Date: Sat, 25 Jan 2025 10:49:22 +0100 Subject: [PATCH 196/989] scsi: qla1280: Fix kernel oops when debug level > 2 A null dereference or oops exception will eventually occur when qla1280.c driver is compiled with DEBUG_QLA1280 enabled and ql_debug_level > 2. 
I think it's clear from the code that the intention here is sg_dma_len(s), not the length of sg_next(s), when printing the debug info. Signed-off-by: Magnus Lindholm Link: https://lore.kernel.org/r/20250125095033.26188-1-linmag7@gmail.com Signed-off-by: Martin K. Petersen --- drivers/scsi/qla1280.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c index 1fd2da0264e38..47d74f881948f 100644 --- a/drivers/scsi/qla1280.c +++ b/drivers/scsi/qla1280.c @@ -2867,7 +2867,7 @@ qla1280_64bit_start_scsi(struct scsi_qla_host *ha, struct srb * sp) dprintk(3, "S/G Segment phys_addr=%x %x, len=0x%x\n", cpu_to_le32(upper_32_bits(dma_handle)), cpu_to_le32(lower_32_bits(dma_handle)), - cpu_to_le32(sg_dma_len(sg_next(s)))); + cpu_to_le32(sg_dma_len(s))); remseg--; } dprintk(5, "qla1280_64bit_start_scsi: Scatter/gather " -- GitLab From 1b0332a42656b798bea867631d739de023633ec6 Mon Sep 17 00:00:00 2001 From: Yu-Chun Lin Date: Thu, 30 Jan 2025 22:48:49 +0800 Subject: [PATCH 197/989] kthread: Fix return value on kzalloc() failure in kthread_affine_preferred() kthread_affine_preferred() incorrectly returns 0 instead of -ENOMEM when kzalloc() fails. Return 'ret' to ensure the correct error code is propagated. Fixes: 4d13f4304fa4 ("kthread: Implement preferred affinity") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501301528.t0cZVbnq-lkp@intel.com/ Signed-off-by: Yu-Chun Lin Signed-off-by: Frederic Weisbecker --- kernel/kthread.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/kthread.c b/kernel/kthread.c index 4005b13ebd7ff..5dc5b0d7238e8 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -859,7 +859,7 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) struct kthread *kthread = to_kthread(p); cpumask_var_t affinity; unsigned long flags; - int ret; + int ret = 0; if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) { WARN_ON(1); @@ -892,7 +892,7 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) out: free_cpumask_var(affinity); - return 0; + return ret; } /* -- GitLab From 244f8aa46fa9e2f4ea5fe0e04988b395d5e30fc7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 31 Jan 2025 17:30:37 -0800 Subject: [PATCH 198/989] ethtool: rss: fix hiding unsupported fields in dumps Commit ec6e57beaf8b ("ethtool: rss: don't report key if device doesn't support it") intended to stop reporting key fields for additional rss contexts if the device has a global hashing key. Later we added dump support and the filtering wasn't properly added there. So we end up reporting the key fields in dumps but not in dos: 
The drivers/net/hw/rss_ctx.py selftest catches this when run on a device with single key, already: # Check| At /root/./ksft-net-drv/drivers/net/hw/rss_ctx.py, line 381, in test_rss_context_dump: # Check| ksft_ne(set(data.get('hkey', [1])), {0}, "key is all zero") # Check failed {0} == {0} key is all zero not ok 8 rss_ctx.test_rss_context_dump Fixes: f6122900f4e2 ("ethtool: rss: support dumping RSS contexts") Reviewed-by: Gal Pressman Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250201013040.725123-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/rss.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 7cb106b590aba..58df9ad02ce8a 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -107,6 +107,8 @@ rss_prepare_ctx(const struct rss_req_info *request, struct net_device *dev, u32 total_size, indir_bytes; u8 *rss_config; + data->no_key_fields = !dev->ethtool_ops->rxfh_per_ctx_key; + ctx = xa_load(&dev->ethtool->rss_ctx, request->rss_context); if (!ctx) return -ENOENT; @@ -153,7 +155,6 @@ rss_prepare_data(const struct ethnl_req_info *req_base, if (!ops->cap_rss_ctx_supported && !ops->create_rxfh_context) return -EOPNOTSUPP; - data->no_key_fields = !ops->rxfh_per_ctx_key; return rss_prepare_ctx(request, dev, data, info); } -- GitLab From 2b91cc1214b165c25ac9b0885db89a0d3224028a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 31 Jan 2025 17:30:38 -0800 Subject: [PATCH 199/989] ethtool: ntuple: fix rss + ring_cookie check The info.flow_type is for RXFH commands, ntuple flow_type is inside the flow spec. The check currently does nothing, as info.flow_type is 0 (or even uninitialized by user space) for ETHTOOL_SRXCLSRLINS. Fixes: 9e43ad7a1ede ("net: ethtool: only allow set_rxnfc with rss + ring_cookie if driver opts in") Reviewed-by: Gal Pressman Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250201013040.725123-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 34bee42e12470..7609ce2b2c5e2 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -993,7 +993,7 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, return rc; /* Nonzero ring with RSS only makes sense if NIC adds them together */ - if (cmd == ETHTOOL_SRXCLSRLINS && info.flow_type & FLOW_RSS && + if (cmd == ETHTOOL_SRXCLSRLINS && info.fs.flow_type & FLOW_RSS && !ops->cap_rss_rxnfc_adds && ethtool_get_flow_spec_ring(info.fs.ring_cookie)) return -EINVAL; -- GitLab From de379dfd9ada2995699052f4a1ecebe5d8f8d70f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 31 Jan 2025 17:30:39 -0800 Subject: [PATCH 200/989] selftests: drv-net: rss_ctx: add missing cleanup in queue reconfigure Commit under Fixes adds ntuple rules but never deletes them. 
Fixes: 29a4bc1fe961 ("selftest: extend test_rss_context_queue_reconfigure for action addition") Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250201013040.725123-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/rss_ctx.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/drivers/net/hw/rss_ctx.py b/tools/testing/selftests/drivers/net/hw/rss_ctx.py index ca8a7edff3dda..27e24e20749ff 100755 --- a/tools/testing/selftests/drivers/net/hw/rss_ctx.py +++ b/tools/testing/selftests/drivers/net/hw/rss_ctx.py @@ -252,6 +252,7 @@ def test_rss_queue_reconfigure(cfg, main_ctx=True): try: # this targets queue 4, which doesn't exist ntuple2 = ethtool_create(cfg, "-N", flow) + defer(ethtool, f"-N {cfg.ifname} delete {ntuple2}") except CmdExitFailure: pass else: @@ -260,6 +261,7 @@ def test_rss_queue_reconfigure(cfg, main_ctx=True): ethtool(f"-X {cfg.ifname} {ctx_ref} weight 1 0 1 0") # ntuple rule therefore targets queues 1 and 3 ntuple2 = ethtool_create(cfg, "-N", flow) + defer(ethtool, f"-N {cfg.ifname} delete {ntuple2}") # should replace existing filter ksft_eq(ntuple, ntuple2) _send_traffic_check(cfg, port, ctx_ref, { 'target': (1, 3), -- GitLab From c3da585509aeb8476886adf75a266c81a9b0df6c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 31 Jan 2025 17:30:40 -0800 Subject: [PATCH 201/989] selftests: drv-net: rss_ctx: don't fail reconfigure test if queue offset not supported Vast majority of drivers does not support queue offset. Simply return if the rss context + queue ntuple fails. Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250201013040.725123-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/rss_ctx.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/hw/rss_ctx.py b/tools/testing/selftests/drivers/net/hw/rss_ctx.py index 27e24e20749ff..319aaa004c407 100755 --- a/tools/testing/selftests/drivers/net/hw/rss_ctx.py +++ b/tools/testing/selftests/drivers/net/hw/rss_ctx.py @@ -260,7 +260,12 @@ def test_rss_queue_reconfigure(cfg, main_ctx=True): # change the table to target queues 0 and 2 ethtool(f"-X {cfg.ifname} {ctx_ref} weight 1 0 1 0") # ntuple rule therefore targets queues 1 and 3 - ntuple2 = ethtool_create(cfg, "-N", flow) + try: + ntuple2 = ethtool_create(cfg, "-N", flow) + except CmdExitFailure: + ksft_pr("Driver does not support rss + queue offset") + return + defer(ethtool, f"-N {cfg.ifname} delete {ntuple2}") # should replace existing filter ksft_eq(ntuple, ntuple2) -- GitLab From d3ed6dee73c560fad0a8e152c8e233b3fb3a2e44 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sat, 1 Feb 2025 19:02:51 +0100 Subject: [PATCH 202/989] net: harmonize tstats and dstats After the blamed commits below, some UDP tunnel use dstats for accounting. On the xmit path, all the UDP-base tunnels ends up using iptunnel_xmit_stats() for stats accounting, and the latter assumes the relevant (tunnel) network device uses tstats. The end result is some 'funny' stat report for the mentioned UDP tunnel, e.g. 
when no packet is actually dropped and a bunch of packets are transmitted: gnv2: mtu 1450 qdisc noqueue \ state UNKNOWN mode DEFAULT group default qlen 1000 link/ether ee:7d:09:87:90:ea brd ff:ff:ff:ff:ff:ff RX: bytes packets errors dropped missed mcast 14916 23 0 15 0 0 TX: bytes packets errors dropped carrier collsns 0 1566 0 0 0 0 Address the issue ensuring the same binary layout for the overlapping fields of dstats and tstats. While this solution is a bit hackish, is smaller and with no performance pitfall compared to other alternatives i.e. supporting both dstat and tstat in iptunnel_xmit_stats() or reverting the blamed commit. With time we should possibly move all the IP-based tunnel (and virtual devices) to dstats. Fixes: c77200c07491 ("bareudp: Handle stats using NETDEV_PCPU_STAT_DSTATS.") Fixes: 6fa6de302246 ("geneve: Handle stats using NETDEV_PCPU_STAT_DSTATS.") Fixes: be226352e8dc ("vxlan: Handle stats using NETDEV_PCPU_STAT_DSTATS.") Signed-off-by: Paolo Abeni Reviewed-by: Guillaume Nault Link: https://patch.msgid.link/2e1c444cf0f63ae472baff29862c4c869be17031.1738432804.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- net/core/dev.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2a59034a5fa2f..03bb584c62cf8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2904,9 +2904,9 @@ struct pcpu_sw_netstats { struct pcpu_dstats { u64_stats_t rx_packets; u64_stats_t rx_bytes; - u64_stats_t rx_drops; u64_stats_t tx_packets; u64_stats_t tx_bytes; + u64_stats_t rx_drops; u64_stats_t tx_drops; struct u64_stats_sync syncp; } __aligned(8 * sizeof(u64)); diff --git a/net/core/dev.c b/net/core/dev.c index c0021cbd28fc1..b91658e8aedb4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11286,6 +11286,20 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, const struct net_device_ops *ops = dev->netdev_ops; const struct net_device_core_stats __percpu *p; + /* + * IPv{4,6} and udp tunnels share common stat helpers and use + * different stat type (NETDEV_PCPU_STAT_TSTATS vs + * NETDEV_PCPU_STAT_DSTATS). Ensure the accounting is consistent. 
+ */ + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) != + offsetof(struct pcpu_dstats, rx_bytes)); + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) != + offsetof(struct pcpu_dstats, rx_packets)); + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) != + offsetof(struct pcpu_dstats, tx_bytes)); + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) != + offsetof(struct pcpu_dstats, tx_packets)); + if (ops->ndo_get_stats64) { memset(storage, 0, sizeof(*storage)); ops->ndo_get_stats64(dev, storage); -- GitLab From fcf5d353b09b3fc212ab24b89ef23a7a8f7b308e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 22 Jan 2025 07:52:44 +0100 Subject: [PATCH 203/989] phy: rockchip: fix Kconfig dependency more A previous patch ensured that USB Type C connector support is enabled, but it is still possible to build the phy driver without enabling CONFIG_USB (host support) or CONFIG_USB_GADGET (device support), and in that case the common helper functions are unavailable: aarch64-linux-ld: drivers/phy/rockchip/phy-rockchip-usbdp.o: in function `rk_udphy_probe': phy-rockchip-usbdp.c:(.text+0xe74): undefined reference to `usb_get_maximum_speed' Select CONFIG_USB_COMMON directly here, like we do in some other phy drivers, to make sure this is available even when actual USB support is disabled or in a loadable module that cannot be reached from a built-in phy driver. Fixes: 9c79b779643e ("phy: rockchip: fix CONFIG_TYPEC dependency") Signed-off-by: Arnd Bergmann Reviewed-by: Sebastian Reichel Reviewed-by: Heiko Stuebner Link: https://lore.kernel.org/r/20250122065249.1390081-1-arnd@kernel.org Signed-off-by: Vinod Koul --- drivers/phy/rockchip/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/phy/rockchip/Kconfig b/drivers/phy/rockchip/Kconfig index 2f7a05f21dc59..dcb8e1628632e 100644 --- a/drivers/phy/rockchip/Kconfig +++ b/drivers/phy/rockchip/Kconfig @@ -125,6 +125,7 @@ config PHY_ROCKCHIP_USBDP depends on ARCH_ROCKCHIP && OF depends on TYPEC select GENERIC_PHY + select USB_COMMON help Enable this to support the Rockchip USB3.0/DP combo PHY with Samsung IP block. This is required for USB3 support on RK3588. -- GitLab From 3126ea9be66b53e607f87f067641ba724be24181 Mon Sep 17 00:00:00 2001 From: Chukun Pan Date: Mon, 6 Jan 2025 18:00:01 +0800 Subject: [PATCH 204/989] phy: rockchip: naneng-combphy: compatible reset with old DT The device tree of RK3568 did not specify reset-names before. So add fallback to old behaviour to be compatible with old DT. 
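The fallback relies on the usual optional-lookup convention: -ENOENT specifically means the named entry is absent (an old DT without reset-names), so only that case falls back to the unnamed lookup while every other error is still reported. Sketched with the names used in the patch:

	priv->phy_rst = devm_reset_control_get_exclusive(dev, "phy");
	/* old device trees describe the reset without reset-names */
	if (PTR_ERR(priv->phy_rst) == -ENOENT)
		priv->phy_rst = devm_reset_control_array_get_exclusive(dev);
	if (IS_ERR(priv->phy_rst))
		return dev_err_probe(dev, PTR_ERR(priv->phy_rst),
				     "failed to get phy reset\n");
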
Fixes: fbcbffbac994 ("phy: rockchip: naneng-combphy: fix phy reset") Cc: Jianfeng Liu Signed-off-by: Chukun Pan Reviewed-by: Jonas Karlman Link: https://lore.kernel.org/r/20250106100001.1344418-2-amadeus@jmu.edu.cn Signed-off-by: Vinod Koul --- drivers/phy/rockchip/phy-rockchip-naneng-combphy.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/phy/rockchip/phy-rockchip-naneng-combphy.c b/drivers/phy/rockchip/phy-rockchip-naneng-combphy.c index a1532ef8bbe9d..8c3ce57f89151 100644 --- a/drivers/phy/rockchip/phy-rockchip-naneng-combphy.c +++ b/drivers/phy/rockchip/phy-rockchip-naneng-combphy.c @@ -324,7 +324,10 @@ static int rockchip_combphy_parse_dt(struct device *dev, struct rockchip_combphy priv->ext_refclk = device_property_present(dev, "rockchip,ext-refclk"); - priv->phy_rst = devm_reset_control_get(dev, "phy"); + priv->phy_rst = devm_reset_control_get_exclusive(dev, "phy"); + /* fallback to old behaviour */ + if (PTR_ERR(priv->phy_rst) == -ENOENT) + priv->phy_rst = devm_reset_control_array_get_exclusive(dev); if (IS_ERR(priv->phy_rst)) return dev_err_probe(dev, PTR_ERR(priv->phy_rst), "failed to get phy reset\n"); -- GitLab From a787ab73e2e43c0a3df10bc8d9b9b7a679129d49 Mon Sep 17 00:00:00 2001 From: Jithu Joseph Date: Fri, 31 Jan 2025 12:53:15 -0800 Subject: [PATCH 205/989] platform/x86/intel/ifs: Update documentation with image download path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The documentation previously listed the path to download In Field Scan (IFS) test images as "TBD". Update the documentation to include the correct image download location. Also move the download link to the appropriate section within the documentation. Reported-by: Anisse Astier Signed-off-by: Jithu Joseph Link: https://lore.kernel.org/r/20250131205315.1585663-1-jithu.joseph@intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/ifs/ifs.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/platform/x86/intel/ifs/ifs.h b/drivers/platform/x86/intel/ifs/ifs.h index 5c3c0dfa1bf83..f369fb0d3d82f 100644 --- a/drivers/platform/x86/intel/ifs/ifs.h +++ b/drivers/platform/x86/intel/ifs/ifs.h @@ -23,12 +23,14 @@ * IFS Image * --------- * - * Intel provides a firmware file containing the scan tests via - * github [#f1]_. Similar to microcode there is a separate file for each + * Intel provides firmware files containing the scan tests via the webpage [#f1]_. + * Look under "In-Field Scan Test Images Download" section towards the + * end of the page. Similar to microcode, there are separate files for each * family-model-stepping. IFS Images are not applicable for some test types. * Wherever applicable the sysfs directory would provide a "current_batch" file * (see below) for loading the image. * + * .. [#f1] https://intel.com/InFieldScan * * IFS Image Loading * ----------------- @@ -125,9 +127,6 @@ * 2) Hardware allows for some number of cores to be tested in parallel. * The driver does not make use of this, it only tests one core at a time. * - * .. 
[#f1] https://github.com/intel/TBD - * - * * Structural Based Functional Test at Field (SBAF): * ------------------------------------------------- * -- GitLab From 1739cafdb8decad538410b05a4640055408826de Mon Sep 17 00:00:00 2001 From: Jinghao Jia Date: Mon, 3 Feb 2025 02:55:04 -0600 Subject: [PATCH 206/989] samples/hid: remove unnecessary -I flags from libbpf EXTRA_CFLAGS Commit 5a6ea7022ff4 ("samples/bpf: Remove unnecessary -I flags from libbpf EXTRA_CFLAGS") fixed the build error caused by redundant include path for samples/bpf, but not samples/hid. Apply the same fix on samples/hid as well. Fixes: 13b25489b6f8 ("kbuild: change working directory to external module directory with M=") Tested-by: Ruowen Qin Signed-off-by: Jinghao Jia Link: https://patch.msgid.link/20250203085506.220297-2-jinghao7@illinois.edu Signed-off-by: Benjamin Tissoires --- samples/hid/Makefile | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/samples/hid/Makefile b/samples/hid/Makefile index 8ea59e9631a33..69159c81d0457 100644 --- a/samples/hid/Makefile +++ b/samples/hid/Makefile @@ -40,16 +40,17 @@ BPF_EXTRA_CFLAGS += -I$(srctree)/arch/mips/include/asm/mach-generic endif endif -TPROGS_CFLAGS += -Wall -O2 -TPROGS_CFLAGS += -Wmissing-prototypes -TPROGS_CFLAGS += -Wstrict-prototypes +COMMON_CFLAGS += -Wall -O2 +COMMON_CFLAGS += -Wmissing-prototypes +COMMON_CFLAGS += -Wstrict-prototypes +TPROGS_CFLAGS += $(COMMON_CFLAGS) TPROGS_CFLAGS += -I$(objtree)/usr/include TPROGS_CFLAGS += -I$(LIBBPF_INCLUDE) TPROGS_CFLAGS += -I$(srctree)/tools/include ifdef SYSROOT -TPROGS_CFLAGS += --sysroot=$(SYSROOT) +COMMON_CFLAGS += --sysroot=$(SYSROOT) TPROGS_LDFLAGS := -L$(SYSROOT)/usr/lib endif @@ -112,7 +113,7 @@ clean: $(LIBBPF): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OUTPUT) # Fix up variables inherited from Kbuild that tools/ build system won't like - $(MAKE) -C $(LIBBPF_SRC) RM='rm -rf' EXTRA_CFLAGS="$(TPROGS_CFLAGS)" \ + $(MAKE) -C $(LIBBPF_SRC) RM='rm -rf' EXTRA_CFLAGS="$(COMMON_CFLAGS)" \ LDFLAGS=$(TPROGS_LDFLAGS) srctree=$(HID_SAMPLES_PATH)/../../ \ O= OUTPUT=$(LIBBPF_OUTPUT)/ DESTDIR=$(LIBBPF_DESTDIR) prefix= \ $@ install_headers -- GitLab From 8b125949df58a00e8797c6e6d3f3d3dc08f4d939 Mon Sep 17 00:00:00 2001 From: Jinghao Jia Date: Mon, 3 Feb 2025 02:55:06 -0600 Subject: [PATCH 207/989] samples/hid: fix broken vmlinux path for VMLINUX_BTF Commit 13b25489b6f8 ("kbuild: change working directory to external module directory with M=") changed kbuild working directory of hid-bpf sample programs to samples/hid, which broke the vmlinux path for VMLINUX_BTF, as the Makefiles assume the current work directory to be the kernel output directory and use a relative path (i.e., ./vmlinux): Makefile:173: *** Cannot find a vmlinux for VMLINUX_BTF at any of " /path/to/linux/samples/hid/vmlinux", build the kernel or set VMLINUX_BTF or VMLINUX_H variable. Stop. Correctly refer to the kernel output directory using $(objtree). 
Fixes: 13b25489b6f8 ("kbuild: change working directory to external module directory with M=") Tested-by: Ruowen Qin Suggested-by: Daniel Borkmann Suggested-by: Andrii Nakryiko Signed-off-by: Jinghao Jia Link: https://patch.msgid.link/20250203085506.220297-4-jinghao7@illinois.edu Signed-off-by: Benjamin Tissoires --- samples/hid/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/hid/Makefile b/samples/hid/Makefile index 69159c81d0457..db5a077c77fc8 100644 --- a/samples/hid/Makefile +++ b/samples/hid/Makefile @@ -164,7 +164,7 @@ $(obj)/hid_surface_dial.o: $(obj)/hid_surface_dial.skel.h VMLINUX_BTF_PATHS ?= $(abspath $(if $(O),$(O)/vmlinux)) \ $(abspath $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux)) \ - $(abspath ./vmlinux) + $(abspath $(objtree)/vmlinux) VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) $(obj)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) -- GitLab From 32392e04cb50d87bb7a6a7d9213f44a1a0961820 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Mon, 3 Feb 2025 15:15:43 -0800 Subject: [PATCH 208/989] KVM: arm64: Fail protected mode init if no vgic hardware is present Protected mode assumes that at minimum vgic-v3 is present, however KVM fails to actually enforce this at the time of initialization. As such, when running protected mode in a half-baked state on GICv2 hardware we see the hyp go belly up at vcpu_load() when it tries to restore the vgic-v3 cpuif: $ ./arch_timer_edge_cases [ 130.599140] kvm [4518]: nVHE hyp panic at: [] __kvm_nvhe___vgic_v3_restore_vmcr_aprs+0x8/0x84! [ 130.603685] kvm [4518]: Cannot dump pKVM nVHE stacktrace: !CONFIG_PROTECTED_NVHE_STACKTRACE [ 130.611962] kvm [4518]: Hyp Offset: 0xfffeca95ed000000 [ 130.617053] Kernel panic - not syncing: HYP panic: [ 130.617053] PS:800003c9 PC:0000b56a94102b58 ESR:0000000002000000 [ 130.617053] FAR:ffff00007b98d4d0 HPFAR:00000000007b98d0 PAR:0000000000000000 [ 130.617053] VCPU:0000000000000000 [ 130.638013] CPU: 0 UID: 0 PID: 4518 Comm: arch_timer_edge Tainted: G C 6.13.0-rc3-00009-gf7d03fcbf1f4 #1 [ 130.648790] Tainted: [C]=CRAP [ 130.651721] Hardware name: Libre Computer AML-S905X-CC (DT) [ 130.657242] Call trace: [ 130.659656] show_stack+0x18/0x24 (C) [ 130.663279] dump_stack_lvl+0x38/0x90 [ 130.666900] dump_stack+0x18/0x24 [ 130.670178] panic+0x388/0x3e8 [ 130.673196] nvhe_hyp_panic_handler+0x104/0x208 [ 130.677681] kvm_arch_vcpu_load+0x290/0x548 [ 130.681821] vcpu_load+0x50/0x80 [ 130.685013] kvm_arch_vcpu_ioctl_run+0x30/0x868 [ 130.689498] kvm_vcpu_ioctl+0x2e0/0x974 [ 130.693293] __arm64_sys_ioctl+0xb4/0xec [ 130.697174] invoke_syscall+0x48/0x110 [ 130.700883] el0_svc_common.constprop.0+0x40/0xe0 [ 130.705540] do_el0_svc+0x1c/0x28 [ 130.708818] el0_svc+0x30/0xd0 [ 130.711837] el0t_64_sync_handler+0x10c/0x138 [ 130.716149] el0t_64_sync+0x198/0x19c [ 130.719774] SMP: stopping secondary CPUs [ 130.723660] Kernel Offset: disabled [ 130.727103] CPU features: 0x000,00000800,02800000,0200421b [ 130.732537] Memory Limit: none [ 130.735561] ---[ end Kernel panic - not syncing: HYP panic: [ 130.735561] PS:800003c9 PC:0000b56a94102b58 ESR:0000000002000000 [ 130.735561] FAR:ffff00007b98d4d0 HPFAR:00000000007b98d0 PAR:0000000000000000 [ 130.735561] VCPU:0000000000000000 ]--- Fix it by failing KVM initialization if the system doesn't implement vgic-v3, as protected mode will never do anything useful on such hardware. 
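The resulting behaviour is that a vgic probe failure (-ENODEV/-ENXIO) now aborts KVM initialisation outright when protected mode was requested, while non-protected KVM keeps the existing fallback of letting userspace emulate a GIC. Schematically (condensed from the hunk below):

	case -ENODEV:
	case -ENXIO:
		/* pKVM assumes VGICv3; refuse to limp along without one */
		if (is_protected_kvm_enabled())
			goto out;
		vgic_present = false;	/* plain KVM may still use a userspace GIC */
		err = 0;
		break;
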
Reported-by: Mark Brown Closes: https://lore.kernel.org/kvmarm/5ca7588c-7bf2-4352-8661-e4a56a9cd9aa@sirena.org.uk/ Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20250203231543.233511-1-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arm.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 0725a0b50a3e9..62c650c2f7b67 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2290,6 +2290,19 @@ static int __init init_subsystems(void) break; case -ENODEV: case -ENXIO: + /* + * No VGIC? No pKVM for you. + * + * Protected mode assumes that VGICv3 is present, so no point + * in trying to hobble along if vgic initialization fails. + */ + if (is_protected_kvm_enabled()) + goto out; + + /* + * Otherwise, userspace could choose to implement a GIC for its + * guest on non-cooperative hardware. + */ vgic_present = false; err = 0; break; -- GitLab From 3648027de1fa91a0c80cffd3ecff263d06e62605 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 10 Jan 2025 18:51:46 +0100 Subject: [PATCH 209/989] arm64: Fix 5-level paging support in kexec/hibernate trampoline Add the missing code to allocate P4D level page tables when cloning the the kernel page tables. This fixes a crash that may be observed when attempting to resume from hibernation on an LPA2 capable system with 4k pages, which therefore uses 5 levels of paging. Presumably, kexec is equally affected. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250110175145.785702-2-ardb+git@google.com Signed-off-by: Will Deacon --- arch/arm64/mm/trans_pgd.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index 19c67ed1a21fe..18543b603c77b 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -162,6 +162,13 @@ static int copy_p4d(struct trans_pgd_info *info, pgd_t *dst_pgdp, unsigned long next; unsigned long addr = start; + if (pgd_none(READ_ONCE(*dst_pgdp))) { + dst_p4dp = trans_alloc(info); + if (!dst_p4dp) + return -ENOMEM; + pgd_populate(NULL, dst_pgdp, dst_p4dp); + } + dst_p4dp = p4d_offset(dst_pgdp, start); src_p4dp = p4d_offset(src_pgdp, start); do { -- GitLab From f458b2165d7ac0f2401fff48f19c8f864e7e1e38 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 17 Jan 2025 07:55:22 -0500 Subject: [PATCH 210/989] arm64: Kconfig: Remove selecting replaced HAVE_FUNCTION_GRAPH_RETVAL Commit a3ed4157b7d8 ("fgraph: Replace fgraph_ret_regs with ftrace_regs") replaces the config HAVE_FUNCTION_GRAPH_RETVAL with the config HAVE_FUNCTION_GRAPH_FREGS, and it replaces all the select commands in the various architecture Kconfig files. In the arm64 architecture, the commit adds the 'select HAVE_FUNCTION_GRAPH_FREGS', but misses to remove the 'select HAVE_FUNCTION_GRAPH_RETVAL', i.e., the select on the replaced config. Remove selecting the replaced config. No functional change, just cleanup. 
Fixes: a3ed4157b7d8 ("fgraph: Replace fgraph_ret_regs with ftrace_regs") Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20250117125522.99071-1-lukas.bulwahn@redhat.com Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index fcdd0ed3eca89..940343beb3d4c 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -225,7 +225,6 @@ config ARM64 select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_FREGS select HAVE_FUNCTION_GRAPH_TRACER - select HAVE_FUNCTION_GRAPH_RETVAL select HAVE_GCC_PLUGINS select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && \ HW_PERF_EVENTS && HAVE_PERF_EVENTS_NMI -- GitLab From f64f9dddd1f58c41c140034f7d2b0beeef1bc548 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 24 Jan 2025 17:33:22 +0000 Subject: [PATCH 211/989] arm64/gcs: Fix documentation for HWCAP In one of the renumberings of the GCS hwcap a stray reference to HWCAP2 was left, fix it. Reported-by: David Spickett Fixes: 7058bf87cd59 ("arm64/gcs: Document the ABI for Guarded Control Stacks") Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20250124-arm64-gcs-hwcap-doc-v1-1-fa9368b01ca6@kernel.org Signed-off-by: Will Deacon --- Documentation/arch/arm64/gcs.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/arch/arm64/gcs.rst b/Documentation/arch/arm64/gcs.rst index 1f65a3193e776..226c0b008456f 100644 --- a/Documentation/arch/arm64/gcs.rst +++ b/Documentation/arch/arm64/gcs.rst @@ -37,7 +37,7 @@ intended to be exhaustive. shadow stacks rather than GCS. * Support for GCS is reported to userspace via HWCAP_GCS in the aux vector - AT_HWCAP2 entry. + AT_HWCAP entry. * GCS is enabled per thread. While there is support for disabling GCS at runtime this should be done with great care. -- GitLab From 21fed7c223e20e694b91dbf25936d922a50c8b19 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 3 Feb 2025 20:11:04 +0000 Subject: [PATCH 212/989] arm64/hwcap: Remove stray references to SF8MMx Due to SME currently being disabled when removing the SF8MMx support it wasn't noticed that there were some stray references in the hwcap table, delete them. 
Fixes: 819935464cb2 ("arm64/hwcap: Describe 2024 dpISA extensions to userspace") Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20250203-arm64-remove-sf8mmx-v1-1-6f1da3dbff82@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/cpufeature.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 4eb7c6698ae43..f0910f20fbf8c 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -3180,8 +3180,6 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64SMFR0_EL1, SF8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8FMA), HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP4), HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP2), - HWCAP_CAP(ID_AA64SMFR0_EL1, SF8MM8, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8MM8), - HWCAP_CAP(ID_AA64SMFR0_EL1, SF8MM4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8MM4), HWCAP_CAP(ID_AA64SMFR0_EL1, SBitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SBITPERM), HWCAP_CAP(ID_AA64SMFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_AES), HWCAP_CAP(ID_AA64SMFR0_EL1, SFEXPA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SFEXPA), -- GitLab From ba69e0750b0362870294adab09339a0c39c3beaf Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 1 Feb 2025 18:21:35 +0100 Subject: [PATCH 213/989] efi: Avoid cold plugged memory for placing the kernel UEFI 2.11 introduced EFI_MEMORY_HOT_PLUGGABLE to annotate system memory regions that are 'cold plugged' at boot, i.e., hot pluggable memory that is available from early boot, and described as system RAM by the firmware. Existing loaders and EFI applications running in the boot context will happily use this memory for allocating data structures that cannot be freed or moved at runtime, and this prevents the memory from being unplugged. Going forward, the new EFI_MEMORY_HOT_PLUGGABLE attribute should be tested, and memory annotated as such should be avoided for such allocations. In the EFI stub, there are a couple of occurrences where, instead of the high-level AllocatePages() UEFI boot service, a low-level code sequence is used that traverses the EFI memory map and carves out the requested number of pages from a free region. This is needed, e.g., for allocating as low as possible, or for allocating pages at random. While AllocatePages() should presumably avoid special purpose memory and cold plugged regions, this manual approach needs to incorporate this logic itself, in order to prevent the kernel itself from ending up in a hot unpluggable region, preventing it from being unplugged. So add the EFI_MEMORY_HOTPLUGGABLE macro definition, and check for it where appropriate. 
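For the stub's manual allocators this means hot-pluggable regions are now filtered out in the same way as soft-reserved ones; condensed, the per-descriptor check added below amounts to:

	if (md->type != EFI_CONVENTIONAL_MEMORY)
		return 0;
	if (md->attribute & EFI_MEMORY_HOT_PLUGGABLE)
		return 0;
	if (efi_soft_reserve_enabled() && (md->attribute & EFI_MEMORY_SP))
		return 0;
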
Cc: stable@vger.kernel.org Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/efi.c | 6 ++++-- drivers/firmware/efi/libstub/randomalloc.c | 3 +++ drivers/firmware/efi/libstub/relocate.c | 3 +++ include/linux/efi.h | 1 + 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 8296bf985d1d1..7309394b8fc98 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -934,13 +934,15 @@ char * __init efi_md_typeattr_format(char *buf, size_t size, EFI_MEMORY_WB | EFI_MEMORY_UCE | EFI_MEMORY_RO | EFI_MEMORY_WP | EFI_MEMORY_RP | EFI_MEMORY_XP | EFI_MEMORY_NV | EFI_MEMORY_SP | EFI_MEMORY_CPU_CRYPTO | - EFI_MEMORY_RUNTIME | EFI_MEMORY_MORE_RELIABLE)) + EFI_MEMORY_MORE_RELIABLE | EFI_MEMORY_HOT_PLUGGABLE | + EFI_MEMORY_RUNTIME)) snprintf(pos, size, "|attr=0x%016llx]", (unsigned long long)attr); else snprintf(pos, size, - "|%3s|%2s|%2s|%2s|%2s|%2s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]", + "|%3s|%2s|%2s|%2s|%2s|%2s|%2s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]", attr & EFI_MEMORY_RUNTIME ? "RUN" : "", + attr & EFI_MEMORY_HOT_PLUGGABLE ? "HP" : "", attr & EFI_MEMORY_MORE_RELIABLE ? "MR" : "", attr & EFI_MEMORY_CPU_CRYPTO ? "CC" : "", attr & EFI_MEMORY_SP ? "SP" : "", diff --git a/drivers/firmware/efi/libstub/randomalloc.c b/drivers/firmware/efi/libstub/randomalloc.c index e5872e38d9a46..5a732018be36d 100644 --- a/drivers/firmware/efi/libstub/randomalloc.c +++ b/drivers/firmware/efi/libstub/randomalloc.c @@ -25,6 +25,9 @@ static unsigned long get_entry_num_slots(efi_memory_desc_t *md, if (md->type != EFI_CONVENTIONAL_MEMORY) return 0; + if (md->attribute & EFI_MEMORY_HOT_PLUGGABLE) + return 0; + if (efi_soft_reserve_enabled() && (md->attribute & EFI_MEMORY_SP)) return 0; diff --git a/drivers/firmware/efi/libstub/relocate.c b/drivers/firmware/efi/libstub/relocate.c index 99b45d1cd6246..d4264bfb6dc17 100644 --- a/drivers/firmware/efi/libstub/relocate.c +++ b/drivers/firmware/efi/libstub/relocate.c @@ -53,6 +53,9 @@ efi_status_t efi_low_alloc_above(unsigned long size, unsigned long align, if (desc->type != EFI_CONVENTIONAL_MEMORY) continue; + if (desc->attribute & EFI_MEMORY_HOT_PLUGGABLE) + continue; + if (efi_soft_reserve_enabled() && (desc->attribute & EFI_MEMORY_SP)) continue; diff --git a/include/linux/efi.h b/include/linux/efi.h index 053c57e618698..db293d7de6864 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -128,6 +128,7 @@ typedef struct { #define EFI_MEMORY_RO ((u64)0x0000000000020000ULL) /* read-only */ #define EFI_MEMORY_SP ((u64)0x0000000000040000ULL) /* soft reserved */ #define EFI_MEMORY_CPU_CRYPTO ((u64)0x0000000000080000ULL) /* supports encryption */ +#define EFI_MEMORY_HOT_PLUGGABLE BIT_ULL(20) /* supports unplugging at runtime */ #define EFI_MEMORY_RUNTIME ((u64)0x8000000000000000ULL) /* range requires runtime mapping */ #define EFI_MEMORY_DESCRIPTOR_VERSION 1 -- GitLab From bbc4578537e350d5bf8a7a2c7d054d6b163b3c41 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 1 Feb 2025 18:21:36 +0100 Subject: [PATCH 214/989] efi: Use BIT_ULL() constants for memory attributes For legibility, use the existing BIT_ULL() to generate the u64 type EFI memory attribute macros. 
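Since BIT_ULL(n) is just a 64-bit (1ULL << n), the generated constants are bit-for-bit identical to the old open-coded values; a quick sanity check (using the kernel's static_assert()) would be:

	static_assert(EFI_MEMORY_SP      == 0x0000000000040000ULL);	/* BIT_ULL(18) */
	static_assert(EFI_MEMORY_RUNTIME == 0x8000000000000000ULL);	/* BIT_ULL(63) */
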
Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/include/linux/efi.h b/include/linux/efi.h index db293d7de6864..7d63d1d75f22f 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -114,22 +114,22 @@ typedef struct { #define EFI_MAX_MEMORY_TYPE 16 /* Attribute values: */ -#define EFI_MEMORY_UC ((u64)0x0000000000000001ULL) /* uncached */ -#define EFI_MEMORY_WC ((u64)0x0000000000000002ULL) /* write-coalescing */ -#define EFI_MEMORY_WT ((u64)0x0000000000000004ULL) /* write-through */ -#define EFI_MEMORY_WB ((u64)0x0000000000000008ULL) /* write-back */ -#define EFI_MEMORY_UCE ((u64)0x0000000000000010ULL) /* uncached, exported */ -#define EFI_MEMORY_WP ((u64)0x0000000000001000ULL) /* write-protect */ -#define EFI_MEMORY_RP ((u64)0x0000000000002000ULL) /* read-protect */ -#define EFI_MEMORY_XP ((u64)0x0000000000004000ULL) /* execute-protect */ -#define EFI_MEMORY_NV ((u64)0x0000000000008000ULL) /* non-volatile */ -#define EFI_MEMORY_MORE_RELIABLE \ - ((u64)0x0000000000010000ULL) /* higher reliability */ -#define EFI_MEMORY_RO ((u64)0x0000000000020000ULL) /* read-only */ -#define EFI_MEMORY_SP ((u64)0x0000000000040000ULL) /* soft reserved */ -#define EFI_MEMORY_CPU_CRYPTO ((u64)0x0000000000080000ULL) /* supports encryption */ +#define EFI_MEMORY_UC BIT_ULL(0) /* uncached */ +#define EFI_MEMORY_WC BIT_ULL(1) /* write-coalescing */ +#define EFI_MEMORY_WT BIT_ULL(2) /* write-through */ +#define EFI_MEMORY_WB BIT_ULL(3) /* write-back */ +#define EFI_MEMORY_UCE BIT_ULL(4) /* uncached, exported */ +#define EFI_MEMORY_WP BIT_ULL(12) /* write-protect */ +#define EFI_MEMORY_RP BIT_ULL(13) /* read-protect */ +#define EFI_MEMORY_XP BIT_ULL(14) /* execute-protect */ +#define EFI_MEMORY_NV BIT_ULL(15) /* non-volatile */ +#define EFI_MEMORY_MORE_RELIABLE BIT_ULL(16) /* higher reliability */ +#define EFI_MEMORY_RO BIT_ULL(17) /* read-only */ +#define EFI_MEMORY_SP BIT_ULL(18) /* soft reserved */ +#define EFI_MEMORY_CPU_CRYPTO BIT_ULL(19) /* supports encryption */ #define EFI_MEMORY_HOT_PLUGGABLE BIT_ULL(20) /* supports unplugging at runtime */ -#define EFI_MEMORY_RUNTIME ((u64)0x8000000000000000ULL) /* range requires runtime mapping */ +#define EFI_MEMORY_RUNTIME BIT_ULL(63) /* range requires runtime mapping */ + #define EFI_MEMORY_DESCRIPTOR_VERSION 1 #define EFI_PAGE_SHIFT 12 -- GitLab From e8ed246ded863eb862806c5591afdcf70012ab5e Mon Sep 17 00:00:00 2001 From: Andre Werner Date: Tue, 21 Jan 2025 08:18:19 +0100 Subject: [PATCH 215/989] serial: sc16is7xx: Fix IRQ number check behavior The logical meaning of the previous version is wrong due to a typo. If the IRQ equals 0, no interrupt pin is available and polling mode shall be used. Additionally, this fix adds a check for IRQ < 0 to increase robustness, because documentation still says that negative IRQ values cannot be absolutely ruled-out. 
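To spell the inversion out: the irq argument is a positive number when a usable interrupt line was described, 0 when none is available, and (in theory) negative on error, so the old expression selected polling in exactly the wrong cases:

	s->polling = !!irq;		/* old: irq > 0  -> polling (despite having an IRQ)
					 *      irq == 0 -> IRQ mode (with no IRQ at all) */

	s->polling = (irq <= 0);	/* new: poll only when no usable IRQ was provided */
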
Fixes: 104c1b9dde9d ("serial: sc16is7xx: Add polling mode if no IRQ pin is available") Signed-off-by: Andre Werner Reviewed-by: Jiri Slaby Reviewed-by: Andy Shevchenko Reviewed-by: Maarten Brock Reviewed-by: Hugo Villeneuve Link: https://lore.kernel.org/r/20250121071819.1346672-1-andre.werner@systec-electronic.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/sc16is7xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c index 7b51cdc274fd8..560f45ed19aeb 100644 --- a/drivers/tty/serial/sc16is7xx.c +++ b/drivers/tty/serial/sc16is7xx.c @@ -1561,7 +1561,7 @@ int sc16is7xx_probe(struct device *dev, const struct sc16is7xx_devtype *devtype, /* Always ask for fixed clock rate from a property. */ device_property_read_u32(dev, "clock-frequency", &uartclk); - s->polling = !!irq; + s->polling = (irq <= 0); if (s->polling) dev_dbg(dev, "No interrupt pin definition, falling back to polling mode\n"); -- GitLab From 166ac2bba167d575e7146beaa66093bc7c072f43 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 24 Jan 2025 18:10:46 +0200 Subject: [PATCH 216/989] serial: port: Assign ->iotype correctly when ->iobase is set Currently the ->iotype is always assigned to the UPIO_MEM when the respective property is not found. However, this will not support the cases when user wants to have UPIO_PORT to be set or preserved. Support this scenario by checking ->iobase value and default the ->iotype respectively. Fixes: 1117a6fdc7c1 ("serial: 8250_of: Switch to use uart_read_port_properties()") Fixes: e894b6005dce ("serial: port: Introduce a common helper to read properties") Cc: stable Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250124161530.398361-2-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_port.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/tty/serial/serial_port.c b/drivers/tty/serial/serial_port.c index d35f1d24156c2..f28d0633fe6bd 100644 --- a/drivers/tty/serial/serial_port.c +++ b/drivers/tty/serial/serial_port.c @@ -173,6 +173,7 @@ EXPORT_SYMBOL(uart_remove_one_port); * The caller is responsible to initialize the following fields of the @port * ->dev (must be valid) * ->flags + * ->iobase * ->mapbase * ->mapsize * ->regshift (if @use_defaults is false) @@ -214,7 +215,7 @@ static int __uart_read_properties(struct uart_port *port, bool use_defaults) /* Read the registers I/O access type (default: MMIO 8-bit) */ ret = device_property_read_u32(dev, "reg-io-width", &value); if (ret) { - port->iotype = UPIO_MEM; + port->iotype = port->iobase ? UPIO_PORT : UPIO_MEM; } else { switch (value) { case 1: -- GitLab From e8486bd50ecf63c9a1e25271f258a8d959f2672f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 24 Jan 2025 18:10:47 +0200 Subject: [PATCH 217/989] serial: port: Always update ->iotype in __uart_read_properties() The documentation of the __uart_read_properties() states that ->iotype member is always altered after the function call, but the code doesn't do that in the case when use_defaults == false and the value of reg-io-width is unsupported. Make sure the code follows the documentation. Note, the current users of the uart_read_and_validate_port_properties() will fail and the change doesn't affect their behaviour, neither users of uart_read_port_properties() will be affected since the alteration happens there even in the current code flow. 
Fixes: e894b6005dce ("serial: port: Introduce a common helper to read properties") Cc: stable Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250124161530.398361-3-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_port.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/serial_port.c b/drivers/tty/serial/serial_port.c index f28d0633fe6bd..85285c56fabff 100644 --- a/drivers/tty/serial/serial_port.c +++ b/drivers/tty/serial/serial_port.c @@ -228,11 +228,11 @@ static int __uart_read_properties(struct uart_port *port, bool use_defaults) port->iotype = device_is_big_endian(dev) ? UPIO_MEM32BE : UPIO_MEM32; break; default: + port->iotype = UPIO_UNKNOWN; if (!use_defaults) { dev_err(dev, "Unsupported reg-io-width (%u)\n", value); return -EINVAL; } - port->iotype = UPIO_UNKNOWN; break; } } -- GitLab From 12397549b5014071e1d2b315509f68eb93ef9144 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 24 Jan 2025 18:10:48 +0200 Subject: [PATCH 218/989] serial: port: Make ->iotype validation global in __uart_read_properties() In order to make code robust against potential changes in the future move ->iotype validation outside of switch in __uart_read_properties(). If any code will be added in between that might leave the ->iotype value unknown the validation catches this up. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250124161530.398361-4-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_port.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/tty/serial/serial_port.c b/drivers/tty/serial/serial_port.c index 85285c56fabff..2fc48cd63f6cf 100644 --- a/drivers/tty/serial/serial_port.c +++ b/drivers/tty/serial/serial_port.c @@ -229,14 +229,15 @@ static int __uart_read_properties(struct uart_port *port, bool use_defaults) break; default: port->iotype = UPIO_UNKNOWN; - if (!use_defaults) { - dev_err(dev, "Unsupported reg-io-width (%u)\n", value); - return -EINVAL; - } break; } } + if (!use_defaults && port->iotype == UPIO_UNKNOWN) { + dev_err(dev, "Unsupported reg-io-width (%u)\n", value); + return -EINVAL; + } + /* Read the address mapping base offset (default: no offset) */ ret = device_property_read_u32(dev, "reg-offset", &value); if (ret) -- GitLab From fe310f75327edbc042c7cc0df32c6b9ec29eb93a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 24 Jan 2025 18:10:49 +0200 Subject: [PATCH 219/989] serial: 8250_of: Remove unneeded ->iotype assignment If ->iobase is set the default will be UPIO_PORT for ->iotype after the uart_read_and_validate_port_properties() call. Hence no need to assign that explicitly. 
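For reference, the fallback now applied by __uart_read_properties() when the "reg-io-width" property is absent (changed earlier in this series) already picks the right access type from the resource itself:

	port->iotype = port->iobase ? UPIO_PORT : UPIO_MEM;
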
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250124161530.398361-5-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_of.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_of.c b/drivers/tty/serial/8250/8250_of.c index 64aed7efc5697..11c860ea80f60 100644 --- a/drivers/tty/serial/8250/8250_of.c +++ b/drivers/tty/serial/8250/8250_of.c @@ -110,7 +110,6 @@ static int of_platform_serial_setup(struct platform_device *ofdev, spin_lock_init(&port->lock); if (resource_type(&resource) == IORESOURCE_IO) { - port->iotype = UPIO_PORT; port->iobase = resource.start; } else { port->mapbase = resource.start; -- GitLab From 34bbb5d5137f32be3186a995a5ad4c60aaad11a7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 24 Jan 2025 18:10:50 +0200 Subject: [PATCH 220/989] serial: 8250_platform: Remove unneeded ->iotype assignment If ->iobase is set the default will be UPIO_PORT for ->iotype after the uart_read_and_validate_port_properties() call. Hence no need to assign that explicitly. Otherwise it will be UPIO_MEM. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250124161530.398361-6-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_platform.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/tty/serial/8250/8250_platform.c b/drivers/tty/serial/8250/8250_platform.c index 8bdc1879d952b..c0343bfb80647 100644 --- a/drivers/tty/serial/8250/8250_platform.c +++ b/drivers/tty/serial/8250/8250_platform.c @@ -112,7 +112,6 @@ static int serial8250_probe_acpi(struct platform_device *pdev) struct device *dev = &pdev->dev; struct uart_8250_port uart = { }; struct resource *regs; - unsigned char iotype; int ret, line; regs = platform_get_mem_or_io(pdev, 0); @@ -122,13 +121,11 @@ static int serial8250_probe_acpi(struct platform_device *pdev) switch (resource_type(regs)) { case IORESOURCE_IO: uart.port.iobase = regs->start; - iotype = UPIO_PORT; break; case IORESOURCE_MEM: uart.port.mapbase = regs->start; uart.port.mapsize = resource_size(regs); uart.port.flags = UPF_IOREMAP; - iotype = UPIO_MEM; break; default: return -EINVAL; @@ -147,12 +144,6 @@ static int serial8250_probe_acpi(struct platform_device *pdev) if (ret) return ret; - /* - * The previous call may not set iotype correctly when reg-io-width - * property is absent and it doesn't support IO port resource. - */ - uart.port.iotype = iotype; - line = serial8250_register_8250_port(&uart); if (line < 0) return line; -- GitLab From 0f3fd9cf6491f5beecbb65abb41556c56135340c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 24 Jan 2025 18:10:51 +0200 Subject: [PATCH 221/989] serial: 8250_pnp: Remove unneeded ->iotype assignment If ->iobase is set the default will be UPIO_PORT for ->iotype after the uart_read_and_validate_port_properties() call. Hence no need to assign that explicitly. Otherwise it will be UPIO_MEM. 
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250124161530.398361-7-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_pnp.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/tty/serial/8250/8250_pnp.c b/drivers/tty/serial/8250/8250_pnp.c index 7c06ae79d8e23..7a837fdf9df13 100644 --- a/drivers/tty/serial/8250/8250_pnp.c +++ b/drivers/tty/serial/8250/8250_pnp.c @@ -436,7 +436,6 @@ serial_pnp_probe(struct pnp_dev *dev, const struct pnp_device_id *dev_id) { struct uart_8250_port uart, *port; int ret, flags = dev_id->driver_data; - unsigned char iotype; long line; if (flags & UNKNOWN_DEV) { @@ -448,14 +447,11 @@ serial_pnp_probe(struct pnp_dev *dev, const struct pnp_device_id *dev_id) memset(&uart, 0, sizeof(uart)); if ((flags & CIR_PORT) && pnp_port_valid(dev, 2)) { uart.port.iobase = pnp_port_start(dev, 2); - iotype = UPIO_PORT; } else if (pnp_port_valid(dev, 0)) { uart.port.iobase = pnp_port_start(dev, 0); - iotype = UPIO_PORT; } else if (pnp_mem_valid(dev, 0)) { uart.port.mapbase = pnp_mem_start(dev, 0); uart.port.mapsize = pnp_mem_len(dev, 0); - iotype = UPIO_MEM; uart.port.flags = UPF_IOREMAP; } else return -ENODEV; @@ -471,12 +467,6 @@ serial_pnp_probe(struct pnp_dev *dev, const struct pnp_device_id *dev_id) if (ret) return ret; - /* - * The previous call may not set iotype correctly when reg-io-width - * property is absent and it doesn't support IO port resource. - */ - uart.port.iotype = iotype; - if (flags & CIR_PORT) { uart.port.flags |= UPF_FIXED_PORT | UPF_FIXED_TYPE; uart.port.type = PORT_8250_CIR; -- GitLab From 4241a702e0d0c2ca9364cfac08dbf134264962de Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 3 Feb 2025 11:03:04 +0000 Subject: [PATCH 222/989] rxrpc: Fix the rxrpc_connection attend queue handling The rxrpc_connection attend queue is never used because conn::attend_link is never initialised and so is always NULL'd out and thus always appears to be busy. This requires the following fix: (1) Fix this the attend queue problem by initialising conn::attend_link. And, consequently, two further fixes for things masked by the above bug: (2) Fix rxrpc_input_conn_event() to handle being invoked with a NULL sk_buff pointer - something that can now happen with the above change. (3) Fix the RXRPC_SKB_MARK_SERVICE_CONN_SECURED message to carry a pointer to the connection and a ref on it. Signed-off-by: David Howells cc: Marc Dionne cc: Jakub Kicinski cc: "David S. 
Miller" cc: Eric Dumazet cc: Paolo Abeni cc: Simon Horman cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org Fixes: f2cce89a074e ("rxrpc: Implement a mechanism to send an event notification to a connection") Link: https://patch.msgid.link/20250203110307.7265-3-dhowells@redhat.com Signed-off-by: Paolo Abeni --- include/trace/events/rxrpc.h | 1 + net/rxrpc/conn_event.c | 17 ++++++++++------- net/rxrpc/conn_object.c | 1 + 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 2f119d18a061f..cad50d91077ef 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -219,6 +219,7 @@ EM(rxrpc_conn_get_conn_input, "GET inp-conn") \ EM(rxrpc_conn_get_idle, "GET idle ") \ EM(rxrpc_conn_get_poke_abort, "GET pk-abort") \ + EM(rxrpc_conn_get_poke_secured, "GET secured ") \ EM(rxrpc_conn_get_poke_timer, "GET poke ") \ EM(rxrpc_conn_get_service_conn, "GET svc-conn") \ EM(rxrpc_conn_new_client, "NEW client ") \ diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 713e04394ceb7..74bb49b936cd4 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -272,6 +272,7 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, * we've already received the packet, put it on the * front of the queue. */ + sp->conn = rxrpc_get_connection(conn, rxrpc_conn_get_poke_secured); skb->mark = RXRPC_SKB_MARK_SERVICE_CONN_SECURED; rxrpc_get_skb(skb, rxrpc_skb_get_conn_secured); skb_queue_head(&conn->local->rx_queue, skb); @@ -437,14 +438,16 @@ void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb) if (test_and_clear_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events)) rxrpc_abort_calls(conn); - switch (skb->mark) { - case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: - if (conn->state != RXRPC_CONN_SERVICE) - break; + if (skb) { + switch (skb->mark) { + case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: + if (conn->state != RXRPC_CONN_SERVICE) + break; - for (loop = 0; loop < RXRPC_MAXCALLS; loop++) - rxrpc_call_is_secure(conn->channels[loop].call); - break; + for (loop = 0; loop < RXRPC_MAXCALLS; loop++) + rxrpc_call_is_secure(conn->channels[loop].call); + break; + } } /* Process delayed ACKs whose time has come. */ diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 7eba4d7d9a380..2f1fd1e2e7e48 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -67,6 +67,7 @@ struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *rxnet, INIT_WORK(&conn->destructor, rxrpc_clean_up_connection); INIT_LIST_HEAD(&conn->proc_link); INIT_LIST_HEAD(&conn->link); + INIT_LIST_HEAD(&conn->attend_link); mutex_init(&conn->security_lock); mutex_init(&conn->tx_data_alloc_lock); skb_queue_head_init(&conn->rx_queue); -- GitLab From 5417a2e9b130a78bf48cb4cf92630efcee5ccf38 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 4 Feb 2025 14:55:54 +0000 Subject: [PATCH 223/989] KVM: arm64: Fix nested S2 MMU structures reallocation For each vcpu that userspace creates, we allocate a number of s2_mmu structures that will eventually contain our shadow S2 page tables. Since this is a dynamically allocated array, we reallocate the array and initialise the newly allocated elements. Once everything is correctly initialised, we adjust pointer and size in the kvm structure, and move on. But should that initialisation fail *and* the reallocation triggered a copy to another location, we end-up returning early, with the kvm structure still containing the (now stale) old pointer. Weeee! 
Cure it by assigning the pointer early, and use this to perform the initialisation. If everything succeeds, we adjust the size. Otherwise, we just leave the size as it was, no harm done, and the new memory is as good as the ol' one (we hope...). Fixes: 4f128f8e1aaac ("KVM: arm64: nv: Support multiple nested Stage-2 mmu structures") Reported-by: Alexander Potapenko Tested-by: Alexander Potapenko Link: https://lore.kernel.org/r/20250204145554.774427-1-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/nested.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 33d2ace686658..0c9387d2f5070 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -67,26 +67,27 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) if (!tmp) return -ENOMEM; + swap(kvm->arch.nested_mmus, tmp); + /* * If we went through a realocation, adjust the MMU back-pointers in * the previously initialised kvm_pgtable structures. */ if (kvm->arch.nested_mmus != tmp) for (int i = 0; i < kvm->arch.nested_mmus_size; i++) - tmp[i].pgt->mmu = &tmp[i]; + kvm->arch.nested_mmus[i].pgt->mmu = &kvm->arch.nested_mmus[i]; for (int i = kvm->arch.nested_mmus_size; !ret && i < num_mmus; i++) - ret = init_nested_s2_mmu(kvm, &tmp[i]); + ret = init_nested_s2_mmu(kvm, &kvm->arch.nested_mmus[i]); if (ret) { for (int i = kvm->arch.nested_mmus_size; i < num_mmus; i++) - kvm_free_stage2_pgd(&tmp[i]); + kvm_free_stage2_pgd(&kvm->arch.nested_mmus[i]); return ret; } kvm->arch.nested_mmus_size = num_mmus; - kvm->arch.nested_mmus = tmp; return 0; } -- GitLab From b450dcce93bc2cf6d2bfaf5a0de88a94ebad8f89 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 4 Feb 2025 11:00:48 +0000 Subject: [PATCH 224/989] KVM: arm64: timer: Always evaluate the need for a soft timer When updating the interrupt state for an emulated timer, we return early and skip the setup of a soft timer that runs in parallel with the guest. While this is OK if we have set the interrupt pending, it is pretty wrong if the guest moved CVAL into the future. In that case, no timer is armed and the guest can wait for a very long time (it will take a full put/load cycle for the situation to resolve). This is specially visible with EDK2 running at EL2, but still using the EL1 virtual timer, which in that case is fully emulated. Any key-press takes ages to be captured, as there is no UART interrupt and EDK2 relies on polling from a timer... The fix is simply to drop the early return. If the timer interrupt is pending, we will still return early, and otherwise arm the soft timer. 
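Concretely, the level update stays but no longer short-circuits the function; execution now always continues to the status update and, when the timer is programmed for the future, to arming a host hrtimer (sketch of the fixed flow, trimmed from the function below):

	if (should_fire != ctx->irq.level)
		kvm_timer_update_irq(ctx->vcpu, should_fire, ctx);

	kvm_timer_update_status(ctx, should_fire);

	/* falls through: a soft (host hrtimer) timer is armed further down
	 * when the emulated timer has not fired yet */
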
Fixes: 4d74ecfa6458b ("KVM: arm64: Don't arm a hrtimer for an already pending timer") Cc: stable@vger.kernel.org Tested-by: Dmytro Terletskyi Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250204110050.150560-2-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index d3d243366536c..035e43f5d4f9a 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -471,10 +471,8 @@ static void timer_emulate(struct arch_timer_context *ctx) trace_kvm_timer_emulate(ctx, should_fire); - if (should_fire != ctx->irq.level) { + if (should_fire != ctx->irq.level) kvm_timer_update_irq(ctx->vcpu, should_fire, ctx); - return; - } kvm_timer_update_status(ctx, should_fire); -- GitLab From 1b8705ad5365b5333240b46d5cd24e88ef2ddb14 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 4 Feb 2025 11:00:49 +0000 Subject: [PATCH 225/989] KVM: arm64: timer: Correctly handle EL1 timer emulation when !FEAT_ECV Both Wei-Lin Chang and Volodymyr Babchuk report that the way we handle the emulation of EL1 timers with NV is completely wrong, specially in the case of HCR_EL2.E2H==0. There are three problems in about as many lines of code: - With E2H==0, the EL1 timers are overwritten with the EL1 state, while they should actually contain the EL2 state (as per the timer map) - With E2H==1, we run the full EL1 timer emulation even when ECV is present, hiding a bug in timer_emulate() (see previous patch) - The comments are actively misleading, and say all the wrong things. This is only attributable to the code having been initially written for FEAT_NV, hacked up to handle FEAT_NV2 *in parallel*, and vaguely hacked again to be FEAT_NV2 only. Oh, and yours truly being a gold plated idiot. The fix is obvious: just delete most of the E2H==0 code, have a unified handling of the timers (because they really are E2H agnostic), and make sure we don't execute any of that when FEAT_ECV is present. Fixes: 4bad3068cfa9f ("KVM: arm64: nv: Sync nested timer state with FEAT_NV2") Reported-by: Wei-Lin Chang Reported-by: Volodymyr Babchuk Link: https://lore.kernel.org/r/fqiqfjzwpgbzdtouu2pwqlu7llhnf5lmy4hzv5vo6ph4v3vyls@jdcfy3fjjc5k Link: https://lore.kernel.org/r/87frl51tse.fsf@epam.com Tested-by: Dmytro Terletskyi Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250204110050.150560-3-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 035e43f5d4f9a..e59836e0260cf 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -974,31 +974,21 @@ void kvm_timer_sync_nested(struct kvm_vcpu *vcpu) * which allows trapping of the timer registers even with NV2. * Still, this is still worse than FEAT_NV on its own. Meh. */ - if (!vcpu_el2_e2h_is_set(vcpu)) { - if (cpus_have_final_cap(ARM64_HAS_ECV)) - return; - - /* - * A non-VHE guest hypervisor doesn't have any direct access - * to its timers: the EL2 registers trap (and the HW is - * fully emulated), while the EL0 registers access memory - * despite the access being notionally direct. Boo. - * - * We update the hardware timer registers with the - * latest value written by the guest to the VNCR page - * and let the hardware take care of the rest. 
- */ - write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTV_CTL_EL0), SYS_CNTV_CTL); - write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTV_CVAL_EL0), SYS_CNTV_CVAL); - write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTP_CTL_EL0), SYS_CNTP_CTL); - write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTP_CVAL_EL0), SYS_CNTP_CVAL); - } else { + if (!cpus_have_final_cap(ARM64_HAS_ECV)) { /* * For a VHE guest hypervisor, the EL2 state is directly - * stored in the host EL1 timers, while the emulated EL0 + * stored in the host EL1 timers, while the emulated EL1 * state is stored in the VNCR page. The latter could have * been updated behind our back, and we must reset the * emulation of the timers. + * + * A non-VHE guest hypervisor doesn't have any direct access + * to its timers: the EL2 registers trap despite being + * notionally direct (we use the EL1 HW, as for VHE), while + * the EL1 registers access memory. + * + * In both cases, process the emulated timers on each guest + * exit. Boo. */ struct timer_map map; get_timer_map(vcpu, &map); -- GitLab From 0e459810285503fb354537e84049e212c5917c33 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 4 Feb 2025 11:00:50 +0000 Subject: [PATCH 226/989] KVM: arm64: timer: Don't adjust the EL2 virtual timer offset The way we deal with the EL2 virtual timer is a bit odd. We try to cope with E2H being flipped, and adjust which offset applies to that timer depending on the current E2H value. But that's a complexity we shouldn't have to worry about. What we have to deal with is either E2H being RES1, in which case there is no offset, or E2H being RES0, and the virtual timer simply does not exist. Drop the adjusting of the timer offset, which makes things a bit simpler. At the same time, make sure that accessing the HV timer when E2H is RES0 results in an UNDEF in the guest. Suggested-by: Oliver Upton Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250204110050.150560-4-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 15 --------------- arch/arm64/kvm/sys_regs.c | 16 +++++++++++++--- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index e59836e0260cf..231c0cd9c7b4b 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -759,21 +759,6 @@ static void kvm_timer_vcpu_load_nested_switch(struct kvm_vcpu *vcpu, timer_irq(map->direct_ptimer), &arch_timer_irq_ops); WARN_ON_ONCE(ret); - - /* - * The virtual offset behaviour is "interesting", as it - * always applies when HCR_EL2.E2H==0, but only when - * accessed from EL1 when HCR_EL2.E2H==1. So make sure we - * track E2H when putting the HV timer in "direct" mode. 
- */ - if (map->direct_vtimer == vcpu_hvtimer(vcpu)) { - struct arch_timer_offset *offs = &map->direct_vtimer->offset; - - if (vcpu_el2_e2h_is_set(vcpu)) - offs->vcpu_offset = NULL; - else - offs->vcpu_offset = &__vcpu_sys_reg(vcpu, CNTVOFF_EL2); - } } } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 526d66f24e34a..7968bee0d27ea 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1452,6 +1452,16 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, return true; } +static bool access_hv_timer(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (!vcpu_el2_e2h_is_set(vcpu)) + return undef_access(vcpu, p, r); + + return access_arch_timer(vcpu, p, r); +} + static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp, s64 new, s64 cur) { @@ -3099,9 +3109,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(CNTHP_CTL_EL2, access_arch_timer, reset_val, 0), EL2_REG(CNTHP_CVAL_EL2, access_arch_timer, reset_val, 0), - { SYS_DESC(SYS_CNTHV_TVAL_EL2), access_arch_timer }, - EL2_REG(CNTHV_CTL_EL2, access_arch_timer, reset_val, 0), - EL2_REG(CNTHV_CVAL_EL2, access_arch_timer, reset_val, 0), + { SYS_DESC(SYS_CNTHV_TVAL_EL2), access_hv_timer }, + EL2_REG(CNTHV_CTL_EL2, access_hv_timer, reset_val, 0), + EL2_REG(CNTHV_CVAL_EL2, access_hv_timer, reset_val, 0), { SYS_DESC(SYS_CNTKCTL_EL12), access_cntkctl_el12 }, -- GitLab From ee3a66f431d689b796b9cb48aefd3d223540381c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 4 Feb 2025 11:13:24 -0500 Subject: [PATCH 227/989] kvm: x86: SRSO_USER_KERNEL_NO is not synthesized SYNTHESIZED_F() generally is used together with setup_force_cpu_cap(), i.e. when it makes sense to present the feature even if cpuid does not have it *and* the VM is not able to see the difference. For example, it can be used when mitigations on the host automatically protect the guest as well. The "SYNTHESIZED_F(SRSO_USER_KERNEL_NO)" line came in as a conflict resolution between the CPUID overhaul from the KVM tree and support for the feature in the x86 tree. Using it right now does not hurt, or make a difference for that matter, because there is no setup_force_cpu_cap(X86_FEATURE_SRSO_USER_KERNEL_NO). However, it is a little less future proof in case such a setup_force_cpu_cap() appears later, for a case where the kernel somehow is not vulnerable but the guest would have to apply the mitigation. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2cbb3874ad398..8eb3a88707f21 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1180,7 +1180,7 @@ void kvm_set_cpu_caps(void) SYNTHESIZED_F(SBPB), SYNTHESIZED_F(IBPB_BRTYPE), SYNTHESIZED_F(SRSO_NO), - SYNTHESIZED_F(SRSO_USER_KERNEL_NO), + F(SRSO_USER_KERNEL_NO), ); kvm_cpu_cap_init(CPUID_8000_0022_EAX, -- GitLab From 203a53029a9c6935ad38e0343d048e51488797cf Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 4 Feb 2025 10:56:47 +0000 Subject: [PATCH 228/989] KVM: selftests: Fix spelling mistake "initally" -> "initially" There is a spelling mistake in a literal string and in the function test_get_inital_dirty. Fix them. 
Signed-off-by: Colin Ian King Message-ID: <20250204105647.367743-1-colin.i.king@gmail.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/s390/cmma_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/s390/cmma_test.c b/tools/testing/selftests/kvm/s390/cmma_test.c index e32dd59703a08..85cc8c18d6e70 100644 --- a/tools/testing/selftests/kvm/s390/cmma_test.c +++ b/tools/testing/selftests/kvm/s390/cmma_test.c @@ -444,7 +444,7 @@ static void assert_no_pages_cmma_dirty(struct kvm_vm *vm) ); } -static void test_get_inital_dirty(void) +static void test_get_initial_dirty(void) { struct kvm_vm *vm = create_vm_two_memslots(); struct kvm_vcpu *vcpu; @@ -651,7 +651,7 @@ struct testdef { } testlist[] = { { "migration mode and dirty tracking", test_migration_mode }, { "GET_CMMA_BITS: basic calls", test_get_cmma_basic }, - { "GET_CMMA_BITS: all pages are dirty initally", test_get_inital_dirty }, + { "GET_CMMA_BITS: all pages are dirty initially", test_get_initial_dirty }, { "GET_CMMA_BITS: holes are skipped", test_get_skip_holes }, }; -- GitLab From 6f61269495260531e15d84d090ee63618110c470 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 24 Jan 2025 10:26:22 -0500 Subject: [PATCH 229/989] KVM: remove kvm_arch_post_init_vm The only statement in a kvm_arch_post_init_vm implementation can be moved into the x86 kvm_arch_init_vm. Do so and remove all traces from architecture-independent code. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 +------ include/linux/kvm_host.h | 1 - virt/kvm/kvm_main.c | 15 --------------- 3 files changed, 1 insertion(+), 22 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6d4a6734b2d69..8e77e61d4fbd4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12741,6 +12741,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) "does not run without ignore_msrs=1, please report it to kvm@vger.kernel.org.\n"); } + once_init(&kvm->arch.nx_once); return 0; out_uninit_mmu: @@ -12750,12 +12751,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) return ret; } -int kvm_arch_post_init_vm(struct kvm *kvm) -{ - once_init(&kvm->arch.nx_once); - return 0; -} - static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) { vcpu_load(vcpu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3cb9a32a6330e..f34f4cfaa5134 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1615,7 +1615,6 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu); -int kvm_arch_post_init_vm(struct kvm *kvm); void kvm_arch_pre_destroy_vm(struct kvm *kvm); void kvm_arch_create_vm_debugfs(struct kvm *kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3f04cd5e3a8cf..ba0327e2d0d33 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1070,15 +1070,6 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname) return ret; } -/* - * Called after the VM is otherwise initialized, but just before adding it to - * the vm_list. - */ -int __weak kvm_arch_post_init_vm(struct kvm *kvm) -{ - return 0; -} - /* * Called just after removing the VM from the vm_list, but before doing any * other destruction. 
@@ -1199,10 +1190,6 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) if (r) goto out_err_no_debugfs; - r = kvm_arch_post_init_vm(kvm); - if (r) - goto out_err; - mutex_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); mutex_unlock(&kvm_lock); @@ -1212,8 +1199,6 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) return kvm; -out_err: - kvm_destroy_vm_debugfs(kvm); out_err_no_debugfs: kvm_coalesced_mmio_free(kvm); out_no_coalesced_mmio: -- GitLab From 43fb96ae78551d7bfa4ecca956b258f085d67c40 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 24 Jan 2025 15:46:23 -0800 Subject: [PATCH 230/989] KVM: x86/mmu: Ensure NX huge page recovery thread is alive before waking When waking a VM's NX huge page recovery thread, ensure the thread is actually alive before trying to wake it. Now that the thread is spawned on-demand during KVM_RUN, a VM without a recovery thread is reachable via the related module params. BUG: kernel NULL pointer dereference, address: 0000000000000040 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:vhost_task_wake+0x5/0x10 Call Trace: set_nx_huge_pages+0xcc/0x1e0 [kvm] param_attr_store+0x8a/0xd0 module_attr_store+0x1a/0x30 kernfs_fop_write_iter+0x12f/0x1e0 vfs_write+0x233/0x3e0 ksys_write+0x60/0xd0 do_syscall_64+0x5b/0x160 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x7f3b52710104 Modules linked in: kvm_intel kvm CR2: 0000000000000040 Fixes: 931656b9e2ff ("kvm: defer huge page recovery vhost task to later") Cc: stable@vger.kernel.org Cc: Keith Busch Signed-off-by: Sean Christopherson Message-ID: <20250124234623.3609069-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a45ae60e84ab4..74c20dbb92dae 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -7120,6 +7120,19 @@ static void mmu_destroy_caches(void) kmem_cache_destroy(mmu_page_header_cache); } +static void kvm_wake_nx_recovery_thread(struct kvm *kvm) +{ + /* + * The NX recovery thread is spawned on-demand at the first KVM_RUN and + * may not be valid even though the VM is globally visible. Do nothing, + * as such a VM can't have any possible NX huge pages. 
+ */ + struct vhost_task *nx_thread = READ_ONCE(kvm->arch.nx_huge_page_recovery_thread); + + if (nx_thread) + vhost_task_wake(nx_thread); +} + static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp) { if (nx_hugepage_mitigation_hard_disabled) @@ -7180,7 +7193,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) kvm_mmu_zap_all_fast(kvm); mutex_unlock(&kvm->slots_lock); - vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread); + kvm_wake_nx_recovery_thread(kvm); } mutex_unlock(&kvm_lock); } @@ -7315,7 +7328,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) - vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread); + kvm_wake_nx_recovery_thread(kvm); mutex_unlock(&kvm_lock); } @@ -7451,14 +7464,20 @@ static void kvm_mmu_start_lpage_recovery(struct once *once) { struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once); struct kvm *kvm = container_of(ka, struct kvm, arch); + struct vhost_task *nx_thread; kvm->arch.nx_huge_page_last = get_jiffies_64(); - kvm->arch.nx_huge_page_recovery_thread = vhost_task_create( - kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill, - kvm, "kvm-nx-lpage-recovery"); + nx_thread = vhost_task_create(kvm_nx_huge_page_recovery_worker, + kvm_nx_huge_page_recovery_worker_kill, + kvm, "kvm-nx-lpage-recovery"); - if (kvm->arch.nx_huge_page_recovery_thread) - vhost_task_start(kvm->arch.nx_huge_page_recovery_thread); + if (!nx_thread) + return; + + vhost_task_start(nx_thread); + + /* Make the task visible only once it is fully started. */ + WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread); } int kvm_mmu_post_init_vm(struct kvm *kvm) -- GitLab From 2a03d2da55b4cd5c86b360db0e917ee93b3f0cb9 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Thu, 9 Jan 2025 06:35:49 +0200 Subject: [PATCH 231/989] dt-bindings: nvmem: qcom,qfprom: Add SAR2130P compatible Document compatible for the QFPROM on SAR2130P platform. Signed-off-by: Dmitry Baryshkov Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250109-sar2130p-nvmem-v4-5-633739fe5f11@linaro.org Signed-off-by: Rob Herring (Arm) --- Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml b/Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml index d37f544ab8aa3..39c209249c9c0 100644 --- a/Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml +++ b/Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml @@ -36,6 +36,7 @@ properties: - qcom,qcs404-qfprom - qcom,qcs615-qfprom - qcom,qcs8300-qfprom + - qcom,sar2130p-qfprom - qcom,sc7180-qfprom - qcom,sc7280-qfprom - qcom,sc8280xp-qfprom -- GitLab From 230b19bc2bcc5897d0e20b4ce7e9790a469a2db0 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Fri, 31 Jan 2025 14:49:54 +0200 Subject: [PATCH 232/989] drm/i915/dp: Iterate DSC BPP from high to low on all platforms Commit 1c56e9a39833 ("drm/i915/dp: Get optimal link config to have best compressed bpp") tries to find the best compressed bpp for the link. However, it iterates from max to min bpp on display 13+, and from min to max on other platforms. This presumably leads to minimum compressed bpp always being chosen on display 11-12. Iterate from high to low on all platforms to actually use the best possible compressed bpp. 
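The effect of the loop direction can be modelled with a few lines of ordinary C (the table contents and the bounds below are made up for illustration, not the driver's real values): scanning a sorted table from the top returns the largest entry inside the allowed window, while a bottom-up scan that stops at the first hit settles for the smallest one.

#include <stdio.h>

/* Illustrative stand-in for an ascending table of valid compressed bpp values. */
static const int valid_bpp[] = { 6, 8, 10, 12, 15 };

/*
 * Scan from the highest entry down and return the first one inside
 * [min_bpp, max_bpp]; -1 if nothing fits.
 */
static int pick_best_bpp(int min_bpp, int max_bpp)
{
	int i;

	for (i = (int)(sizeof(valid_bpp) / sizeof(valid_bpp[0])) - 1; i >= 0; i--) {
		if (valid_bpp[i] < min_bpp || valid_bpp[i] > max_bpp)
			continue;
		return valid_bpp[i];
	}
	return -1;
}

int main(void)
{
	/*
	 * With bounds [8, 12] this prints 12, the best valid bpp; a low-to-high
	 * scan that stops at the first match would have picked 8 instead.
	 */
	printf("%d\n", pick_best_bpp(8, 12));
	return 0;
}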
Fixes: 1c56e9a39833 ("drm/i915/dp: Get optimal link config to have best compressed bpp") Cc: Ankit Nautiyal Cc: Imre Deak Cc: # v6.7+ Reviewed-by: Imre Deak Reviewed-by: Ankit Nautiyal Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/3bba67923cbcd13a59d26ef5fa4bb042b13c8a9b.1738327620.git.jani.nikula@intel.com (cherry picked from commit 56b0337d429356c3b9ecc36a03023c8cc856b196) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_dp.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index be07034bfcc69..aa77ddcee42c8 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -2072,11 +2072,10 @@ icl_dsc_compute_link_config(struct intel_dp *intel_dp, /* Compressed BPP should be less than the Input DSC bpp */ dsc_max_bpp = min(dsc_max_bpp, pipe_bpp - 1); - for (i = 0; i < ARRAY_SIZE(valid_dsc_bpp); i++) { - if (valid_dsc_bpp[i] < dsc_min_bpp) + for (i = ARRAY_SIZE(valid_dsc_bpp) - 1; i >= 0; i--) { + if (valid_dsc_bpp[i] < dsc_min_bpp || + valid_dsc_bpp[i] > dsc_max_bpp) continue; - if (valid_dsc_bpp[i] > dsc_max_bpp) - break; ret = dsc_compute_link_config(intel_dp, pipe_config, -- GitLab From ecee4d0695067ae04959b121028b42a588e75370 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 4 Feb 2025 11:40:25 -0600 Subject: [PATCH 233/989] accel/amdxdna: Add MODULE_FIRMWARE() declarations Initramfs building tools such as dracut will look for a MODULE_FIRMWARE() declaration to determine which firmware to include in the initramfs when a driver is included in the initramfs. As amdxdna doesn't declare any firmware this causes the driver to fail to load with -ENOENT when in the initramfs. Add the missing declaration for possible firmware. Reported-by: Renjith Pananchikkal Suggested-by: Alexander Deucher Fixes: 8c9ff1b181ba ("accel/amdxdna: Add a new driver for AMD AI Engine") Reviewed-by: Lizhi Hou Link: https://lore.kernel.org/r/20250204174031.3425762-1-superm1@kernel.org Signed-off-by: Mario Limonciello Link: https://patchwork.freedesktop.org/patch/msgid/20250204174031.3425762-1-superm1@kernel.org --- drivers/accel/amdxdna/amdxdna_pci_drv.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.c b/drivers/accel/amdxdna/amdxdna_pci_drv.c index 97d4a032171f1..f5b8497cf5ad6 100644 --- a/drivers/accel/amdxdna/amdxdna_pci_drv.c +++ b/drivers/accel/amdxdna/amdxdna_pci_drv.c @@ -21,6 +21,11 @@ #define AMDXDNA_AUTOSUSPEND_DELAY 5000 /* milliseconds */ +MODULE_FIRMWARE("amdnpu/1502_00/npu.sbin"); +MODULE_FIRMWARE("amdnpu/17f0_10/npu.sbin"); +MODULE_FIRMWARE("amdnpu/17f0_11/npu.sbin"); +MODULE_FIRMWARE("amdnpu/17f0_20/npu.sbin"); + /* * Bind the driver base on (vendor_id, device_id) pair and later use the * (device_id, rev_id) pair as a key to select the devices. The devices with -- GitLab From aff2355d260e47e780cd96af127beaab18a664b1 Mon Sep 17 00:00:00 2001 From: Mark Lord Date: Tue, 4 Feb 2025 19:45:06 +0200 Subject: [PATCH 234/989] spi: pxa2xx: Fix regression when toggling chip select on LPSS devices The commit 78b435c9044a ("spi: pxa2xx: Introduce __lpss_ssp_update_priv() helper") broke speaker output on my ASUS UX5304MA laptop. The problem is in inverted value that got written in the private register. Simple bug, simple fix. 
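A minimal userspace sketch of the polarity issue (the bit name and the register model below are illustrative only, not the actual LPSS register layout): asserting an active-low chip select means clearing the CS-high bit, so the enable path must write 0 into the masked field and the disable path must write the mask back.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CS_HIGH_BIT	(1u << 0)	/* illustrative stand-in for the CS-high control bit */

/* Read-modify-write helper: clear the masked field, then OR in the new value. */
static uint32_t update_reg(uint32_t reg, uint32_t mask, uint32_t value)
{
	return (reg & ~mask) | (value & mask);
}

int main(void)
{
	uint32_t reg = CS_HIGH_BIT;	/* idle: chip select deasserted (line high) */
	bool enable = true;

	/* Asserting the active-low chip select clears the bit... */
	reg = update_reg(reg, CS_HIGH_BIT, enable ? 0 : CS_HIGH_BIT);
	printf("asserted:   %u\n", reg);	/* prints 0 */

	/* ...and deasserting it sets the bit again. */
	enable = false;
	reg = update_reg(reg, CS_HIGH_BIT, enable ? 0 : CS_HIGH_BIT);
	printf("deasserted: %u\n", reg);	/* prints 1 */
	return 0;
}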
Fixes: 78b435c9044a ("spi: pxa2xx: Introduce __lpss_ssp_update_priv() helper") Signed-off-by: Mark Lord Tested-by: Mark Lord Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20250204174506.149978-1-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- drivers/spi/spi-pxa2xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c index 5f9cac41baff6..06711a62fa3dc 100644 --- a/drivers/spi/spi-pxa2xx.c +++ b/drivers/spi/spi-pxa2xx.c @@ -399,7 +399,7 @@ static void lpss_ssp_cs_control(struct spi_device *spi, bool enable) lpss_ssp_select_cs(spi, config); mask = LPSS_CS_CONTROL_CS_HIGH; - __lpss_ssp_update_priv(drv_data, config->reg_cs_ctrl, mask, enable ? mask : 0); + __lpss_ssp_update_priv(drv_data, config->reg_cs_ctrl, mask, enable ? 0 : mask); if (config->cs_clk_stays_gated) { /* * Changing CS alone when dynamic clock gating is on won't -- GitLab From fd079124112c6e11c1bca2e7c71470a2d60bc363 Mon Sep 17 00:00:00 2001 From: Bharadwaj Raju Date: Wed, 5 Feb 2025 00:59:53 +0530 Subject: [PATCH 235/989] selftests/cgroup: use bash in test_cpuset_v1_hp.sh The script uses non-POSIX features like `[[` for conditionals and hence does not work when run with a POSIX /bin/sh. Change the shebang to /bin/bash instead, like the other tests in cgroup. Signed-off-by: Bharadwaj Raju Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh b/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh index 3f45512fb512e..7406c24be1ac9 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_v1_hp.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 # # Test the special cpuset v1 hotplug case where a cpuset become empty of -- GitLab From d6179f6c6204f9932aed3a7a2100b4a295dfed9d Mon Sep 17 00:00:00 2001 From: Mark Tomlinson Date: Thu, 6 Jun 2024 15:31:02 +1200 Subject: [PATCH 236/989] gpio: pca953x: Improve interrupt support The GPIO drivers with latch interrupt support (typically types starting with PCAL) have interrupt status registers to determine which particular inputs have caused an interrupt. Unfortunately there is no atomic operation to read these registers and clear the interrupt. Clearing the interrupt is done by reading the input registers. The code was reading the interrupt status registers, and then reading the input registers. If an input changed between these two events it was lost. The solution in this patch is to revert to the non-latch version of code, i.e. remembering the previous input status, and looking for the changes. This system results in no more I2C transfers, so is no slower. The latch property of the device still means interrupts will still be noticed if the input changes back to its initial state. 
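The retained scheme is easy to model in userspace C (the edge masks and the single 8-bit bank below are invented for illustration): keep the previous input snapshot, diff it against a fresh read, and derive the pending lines from the configured edge masks, so a change occurring between two register accesses cannot be lost.

#include <stdint.h>
#include <stdio.h>

static uint8_t prev_input;			/* remembered snapshot of the input register */
static const uint8_t rise_mask = 0x0f;		/* lines armed for rising edges */
static const uint8_t fall_mask = 0xf0;		/* lines armed for falling edges */

/* Derive pending interrupt lines from one fresh read of the input register. */
static uint8_t pending_from_input(uint8_t cur_input)
{
	uint8_t rising  = cur_input & (uint8_t)~prev_input & rise_mask;
	uint8_t falling = (uint8_t)~cur_input & prev_input & fall_mask;

	prev_input = cur_input;			/* remember for the next pass */
	return rising | falling;
}

int main(void)
{
	prev_input = 0x00;
	printf("%#x\n", pending_from_input(0x01));	/* bit 0 rose and is armed: prints 0x1 */
	printf("%#x\n", pending_from_input(0x00));	/* bit 0 fell but is only rise-armed: prints 0 */
	return 0;
}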
Fixes: 44896beae605 ("gpio: pca953x: add PCAL9535 interrupt support for Galileo Gen2") Signed-off-by: Mark Tomlinson Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240606033102.2271916-1-mark.tomlinson@alliedtelesis.co.nz Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-pca953x.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/drivers/gpio/gpio-pca953x.c b/drivers/gpio/gpio-pca953x.c index be4c9981ebc40..d63c1030e6ac0 100644 --- a/drivers/gpio/gpio-pca953x.c +++ b/drivers/gpio/gpio-pca953x.c @@ -841,25 +841,6 @@ static bool pca953x_irq_pending(struct pca953x_chip *chip, unsigned long *pendin DECLARE_BITMAP(trigger, MAX_LINE); int ret; - if (chip->driver_data & PCA_PCAL) { - /* Read the current interrupt status from the device */ - ret = pca953x_read_regs(chip, PCAL953X_INT_STAT, trigger); - if (ret) - return false; - - /* Check latched inputs and clear interrupt status */ - ret = pca953x_read_regs(chip, chip->regs->input, cur_stat); - if (ret) - return false; - - /* Apply filter for rising/falling edge selection */ - bitmap_replace(new_stat, chip->irq_trig_fall, chip->irq_trig_raise, cur_stat, gc->ngpio); - - bitmap_and(pending, new_stat, trigger, gc->ngpio); - - return !bitmap_empty(pending, gc->ngpio); - } - ret = pca953x_read_regs(chip, chip->regs->input, cur_stat); if (ret) return false; -- GitLab From 028676bb189ed6d1b550a0fc570a9d695b6acfd3 Mon Sep 17 00:00:00 2001 From: Jacob Moroni Date: Mon, 3 Feb 2025 09:36:05 -0500 Subject: [PATCH 237/989] net: atlantic: fix warning during hot unplug Firmware deinitialization performs MMIO accesses which are not necessary if the device has already been removed. In some cases, these accesses happen via readx_poll_timeout_atomic which ends up timing out, resulting in a warning at hw_atl2_utils_fw.c:112: [ 104.595913] Call Trace: [ 104.595915] [ 104.595918] ? show_regs+0x6c/0x80 [ 104.595923] ? __warn+0x8d/0x150 [ 104.595925] ? aq_a2_fw_deinit+0xcf/0xe0 [atlantic] [ 104.595934] ? report_bug+0x182/0x1b0 [ 104.595938] ? handle_bug+0x6e/0xb0 [ 104.595940] ? exc_invalid_op+0x18/0x80 [ 104.595942] ? asm_exc_invalid_op+0x1b/0x20 [ 104.595944] ? aq_a2_fw_deinit+0xcf/0xe0 [atlantic] [ 104.595952] ? aq_a2_fw_deinit+0xcf/0xe0 [atlantic] [ 104.595959] aq_nic_deinit.part.0+0xbd/0xf0 [atlantic] [ 104.595964] aq_nic_deinit+0x17/0x30 [atlantic] [ 104.595970] aq_ndev_close+0x2b/0x40 [atlantic] [ 104.595975] __dev_close_many+0xad/0x160 [ 104.595978] dev_close_many+0x99/0x170 [ 104.595979] unregister_netdevice_many_notify+0x18b/0xb20 [ 104.595981] ? __call_rcu_common+0xcd/0x700 [ 104.595984] unregister_netdevice_queue+0xc6/0x110 [ 104.595986] unregister_netdev+0x1c/0x30 [ 104.595988] aq_pci_remove+0xb1/0xc0 [atlantic] Fix this by skipping firmware deinitialization altogether if the PCI device is no longer present. Tested with an AQC113 attached via Thunderbolt by performing repeated unplug cycles while traffic was running via iperf. 
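The failure mode can be sketched in plain C (the all-ones read, the iteration-bounded timeout and the register value below are simplifications, not the driver's real polling helper): once the device is gone its MMIO reads return all ones, the poll never observes the expected value and times out with a warning, so checking presence first avoids touching the hardware at all.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool device_present;

/* MMIO reads from a surprise-removed PCI device typically return all ones. */
static uint32_t mmio_read(void)
{
	return device_present ? 0x1 : 0xffffffff;
}

/* Poll until the register reads back 'want', giving up after 'tries' attempts. */
static int poll_reg(uint32_t want, int tries)
{
	while (tries--) {
		if (mmio_read() == want)
			return 0;
	}
	return -1;	/* timeout: this is where the warning used to fire */
}

static int fw_deinit(void)
{
	if (!device_present)	/* mirrors the pci_device_is_present() check */
		return 0;
	return poll_reg(0x1, 1000);
}

int main(void)
{
	device_present = false;
	printf("deinit after unplug: %d\n", fw_deinit());	/* 0: skipped cleanly */

	device_present = true;
	printf("deinit while present: %d\n", fw_deinit());	/* 0: poll succeeds */
	return 0;
}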
Fixes: 97bde5c4f909 ("net: ethernet: aquantia: Support for NIC-specific code") Signed-off-by: Jacob Moroni Reviewed-by: Igor Russkikh Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250203143604.24930-3-mail@jakemoroni.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/aquantia/atlantic/aq_nic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index fe0e3e2a81171..71e50fc65c147 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -1441,7 +1441,9 @@ void aq_nic_deinit(struct aq_nic_s *self, bool link_down) aq_ptp_ring_free(self); aq_ptp_free(self); - if (likely(self->aq_fw_ops->deinit) && link_down) { + /* May be invoked during hot unplug. */ + if (pci_device_is_present(self->pdev) && + likely(self->aq_fw_ops->deinit) && link_down) { mutex_lock(&self->fwreq_mutex); self->aq_fw_ops->deinit(self->aq_hw); mutex_unlock(&self->fwreq_mutex); -- GitLab From a1300691aed9ee852b0a9192e29e2bdc2411a7e6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 3 Feb 2025 17:08:38 +0000 Subject: [PATCH 238/989] net: rose: lock the socket in rose_bind() syzbot reported a soft lockup in rose_loopback_timer(), with a repro calling bind() from multiple threads. rose_bind() must lock the socket to avoid this issue. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+7ff41b5215f0c534534e@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/67a0f78d.050a0220.d7c5a.00a0.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Acked-by: Paolo Abeni Link: https://patch.msgid.link/20250203170838.3521361-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/rose/af_rose.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 72c65d938a150..a4a668b88a8f2 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -701,11 +701,9 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct net_device *dev; ax25_address *source; ax25_uid_assoc *user; + int err = -EINVAL; int n; - if (!sock_flag(sk, SOCK_ZAPPED)) - return -EINVAL; - if (addr_len != sizeof(struct sockaddr_rose) && addr_len != sizeof(struct full_sockaddr_rose)) return -EINVAL; @@ -718,8 +716,15 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS) return -EINVAL; - if ((dev = rose_dev_get(&addr->srose_addr)) == NULL) - return -EADDRNOTAVAIL; + lock_sock(sk); + + if (!sock_flag(sk, SOCK_ZAPPED)) + goto out_release; + + err = -EADDRNOTAVAIL; + dev = rose_dev_get(&addr->srose_addr); + if (!dev) + goto out_release; source = &addr->srose_call; @@ -730,7 +735,8 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) } else { if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) { dev_put(dev); - return -EACCES; + err = -EACCES; + goto out_release; } rose->source_call = *source; } @@ -753,8 +759,10 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) rose_insert_socket(sk); sock_reset_flag(sk, SOCK_ZAPPED); - - return 0; + err = 0; +out_release: + release_sock(sk); + return err; } static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) -- GitLab From 820ccf8cb2b145ab9fc12651f7f80339614fa46c Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 31 Jan 2025 
15:31:19 -0700 Subject: [PATCH 239/989] drm/amd/display: Respect user's CONFIG_FRAME_WARN more for dml files Currently, there are several files in drm/amd/display that aim to have a higher -Wframe-larger-than value to avoid instances of that warning with a lower value from the user's configuration. However, with the way that it is currently implemented, it does not respect the user's request via CONFIG_FRAME_WARN for a higher stack frame limit, which can cause pain when new instances of the warning appear and break the build due to CONFIG_WERROR. Adjust the logic to switch from a hard coded -Wframe-larger-than value to only using the value as a minimum clamp and deferring to the requested value from CONFIG_FRAME_WARN if it is higher. Suggested-by: Harry Wentland Reported-by: Greg Kroah-Hartman Closes: https://lore.kernel.org/2025013003-audience-opposing-7f95@gregkh/ Signed-off-by: Nathan Chancellor Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dml/Makefile | 14 ++++++++----- drivers/gpu/drm/amd/display/dc/dml2/Makefile | 22 ++++++++++++-------- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/Makefile b/drivers/gpu/drm/amd/display/dc/dml/Makefile index 46f9c05de16e8..e1d500633dfad 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dml/Makefile @@ -29,11 +29,15 @@ dml_ccflags := $(CC_FLAGS_FPU) dml_rcflags := $(CC_FLAGS_NO_FPU) ifneq ($(CONFIG_FRAME_WARN),0) -ifeq ($(filter y,$(CONFIG_KASAN)$(CONFIG_KCSAN)),y) -frame_warn_flag := -Wframe-larger-than=3072 -else -frame_warn_flag := -Wframe-larger-than=2048 -endif + ifeq ($(filter y,$(CONFIG_KASAN)$(CONFIG_KCSAN)),y) + frame_warn_limit := 3072 + else + frame_warn_limit := 2048 + endif + + ifeq ($(call test-lt, $(CONFIG_FRAME_WARN), $(frame_warn_limit)),y) + frame_warn_flag := -Wframe-larger-than=$(frame_warn_limit) + endif endif CFLAGS_$(AMDDALPATH)/dc/dml/display_mode_lib.o := $(dml_ccflags) diff --git a/drivers/gpu/drm/amd/display/dc/dml2/Makefile b/drivers/gpu/drm/amd/display/dc/dml2/Makefile index 91c4f3b4bd5f4..21fd466dba26e 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dml2/Makefile @@ -28,15 +28,19 @@ dml2_ccflags := $(CC_FLAGS_FPU) dml2_rcflags := $(CC_FLAGS_NO_FPU) ifneq ($(CONFIG_FRAME_WARN),0) -ifeq ($(filter y,$(CONFIG_KASAN)$(CONFIG_KCSAN)),y) -ifeq ($(CONFIG_CC_IS_CLANG)$(CONFIG_COMPILE_TEST),yy) -frame_warn_flag := -Wframe-larger-than=4096 -else -frame_warn_flag := -Wframe-larger-than=3072 -endif -else -frame_warn_flag := -Wframe-larger-than=2048 -endif + ifeq ($(filter y,$(CONFIG_KASAN)$(CONFIG_KCSAN)),y) + ifeq ($(CONFIG_CC_IS_CLANG)$(CONFIG_COMPILE_TEST),yy) + frame_warn_limit := 4096 + else + frame_warn_limit := 3072 + endif + else + frame_warn_limit := 2048 + endif + + ifeq ($(call test-lt, $(CONFIG_FRAME_WARN), $(frame_warn_limit)),y) + frame_warn_flag := -Wframe-larger-than=$(frame_warn_limit) + endif endif subdir-ccflags-y += -I$(FULL_AMD_DISPLAY_PATH)/dc/dml2 -- GitLab From f245b400a223a71d6d5f4c72a2cb9b573a7fc2b6 Mon Sep 17 00:00:00 2001 From: Tom Chung Date: Tue, 4 Feb 2025 15:07:44 +0800 Subject: [PATCH 240/989] Revert "drm/amd/display: Use HW lock mgr for PSR1" This reverts commit a2b5a9956269 ("drm/amd/display: Use HW lock mgr for PSR1") Because it may cause system hang while connect with two edp panel. 
Acked-by: Wayne Lin Signed-off-by: Tom Chung Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c b/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c index 5bb8b78bf250a..bf636b28e3e16 100644 --- a/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c @@ -63,8 +63,7 @@ void dmub_hw_lock_mgr_inbox0_cmd(struct dc_dmub_srv *dmub_srv, bool should_use_dmub_lock(struct dc_link *link) { - if (link->psr_settings.psr_version == DC_PSR_VERSION_SU_1 || - link->psr_settings.psr_version == DC_PSR_VERSION_1) + if (link->psr_settings.psr_version == DC_PSR_VERSION_SU_1) return true; if (link->replay_settings.replay_feature_enabled) -- GitLab From ceb5faef848b2fbb5d1e99617093cc9d4deb2b30 Mon Sep 17 00:00:00 2001 From: Tanya Agarwal Date: Fri, 24 Jan 2025 01:07:44 +0530 Subject: [PATCH 241/989] integrity: fix typos and spelling errors Fix typos and spelling errors in integrity module comments that were identified using the codespell tool. No functional changes - documentation only. Signed-off-by: Tanya Agarwal Reviewed-by: Mimi Zohar Signed-off-by: Mimi Zohar --- security/integrity/evm/evm_crypto.c | 2 +- security/integrity/evm/evm_main.c | 2 +- security/integrity/ima/ima_main.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c index 7c06ffd633d24..a5e730ffda57f 100644 --- a/security/integrity/evm/evm_crypto.c +++ b/security/integrity/evm/evm_crypto.c @@ -180,7 +180,7 @@ static void hmac_add_misc(struct shash_desc *desc, struct inode *inode, } /* - * Dump large security xattr values as a continuous ascii hexademical string. + * Dump large security xattr values as a continuous ascii hexadecimal string. * (pr_debug is limited to 64 bytes.) */ static void dump_security_xattr_l(const char *prefix, const void *src, diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c index 377e57e9084f0..0add782e73ba2 100644 --- a/security/integrity/evm/evm_main.c +++ b/security/integrity/evm/evm_main.c @@ -169,7 +169,7 @@ static int is_unsupported_hmac_fs(struct dentry *dentry) * and compare it against the stored security.evm xattr. * * For performance: - * - use the previoulsy retrieved xattr value and length to calculate the + * - use the previously retrieved xattr value and length to calculate the * HMAC.) * - cache the verification result in the iint, when available. * diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 9f9897a7c217e..f2c9affa0c2ac 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -1011,9 +1011,9 @@ int process_buffer_measurement(struct mnt_idmap *idmap, } /* - * Both LSM hooks and auxilary based buffer measurements are - * based on policy. To avoid code duplication, differentiate - * between the LSM hooks and auxilary buffer measurements, + * Both LSM hooks and auxiliary based buffer measurements are + * based on policy. To avoid code duplication, differentiate + * between the LSM hooks and auxiliary buffer measurements, * retrieving the policy rule information only for the LSM hook * buffer measurements. 
*/ -- GitLab From 57a0ef02fefafc4b9603e33a18b669ba5ce59ba3 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 4 Feb 2025 13:57:20 +0100 Subject: [PATCH 242/989] ima: Reset IMA_NONACTION_RULE_FLAGS after post_setattr Commit 0d73a55208e9 ("ima: re-introduce own integrity cache lock") mistakenly reverted the performance improvement introduced in commit 42a4c603198f0 ("ima: fix ima_inode_post_setattr"). The unused bit mask was subsequently removed by commit 11c60f23ed13 ("integrity: Remove unused macro IMA_ACTION_RULE_FLAGS"). Restore the performance improvement by introducing the new mask IMA_NONACTION_RULE_FLAGS, equal to IMA_NONACTION_FLAGS without IMA_NEW_FILE, which is not a rule-specific flag. Finally, reset IMA_NONACTION_RULE_FLAGS instead of IMA_NONACTION_FLAGS in process_measurement(), if the IMA_CHANGE_ATTR atomic flag is set (after file metadata modification). With this patch, new files for which metadata were modified while they are still open, can be reopened before the last file close (when security.ima is written), since the IMA_NEW_FILE flag is not cleared anymore. Otherwise, appraisal fails because security.ima is missing (files with IMA_NEW_FILE set are an exception). Cc: stable@vger.kernel.org # v4.16.x Fixes: 0d73a55208e9 ("ima: re-introduce own integrity cache lock") Signed-off-by: Roberto Sassu Signed-off-by: Mimi Zohar --- security/integrity/ima/ima.h | 3 +++ security/integrity/ima/ima_main.c | 7 +++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 24d09ea91b877..a4f284bd846c1 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -149,6 +149,9 @@ struct ima_kexec_hdr { #define IMA_CHECK_BLACKLIST 0x40000000 #define IMA_VERITY_REQUIRED 0x80000000 +/* Exclude non-action flags which are not rule-specific. */ +#define IMA_NONACTION_RULE_FLAGS (IMA_NONACTION_FLAGS & ~IMA_NEW_FILE) + #define IMA_DO_MASK (IMA_MEASURE | IMA_APPRAISE | IMA_AUDIT | \ IMA_HASH | IMA_APPRAISE_SUBMASK) #define IMA_DONE_MASK (IMA_MEASURED | IMA_APPRAISED | IMA_AUDITED | \ diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index f2c9affa0c2ac..28b8b0db6f9bb 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -269,10 +269,13 @@ static int process_measurement(struct file *file, const struct cred *cred, mutex_lock(&iint->mutex); if (test_and_clear_bit(IMA_CHANGE_ATTR, &iint->atomic_flags)) - /* reset appraisal flags if ima_inode_post_setattr was called */ + /* + * Reset appraisal flags (action and non-action rule-specific) + * if ima_inode_post_setattr was called. + */ iint->flags &= ~(IMA_APPRAISE | IMA_APPRAISED | IMA_APPRAISE_SUBMASK | IMA_APPRAISED_SUBMASK | - IMA_NONACTION_FLAGS); + IMA_NONACTION_RULE_FLAGS); /* * Re-evaulate the file if either the xattr has changed or the -- GitLab From 32ffed055dcee17f6705f545b069e44a66067808 Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Wed, 5 Feb 2025 00:43:43 +0000 Subject: [PATCH 243/989] regmap-irq: Add missing kfree() Add kfree() for "d->main_status_buf" to the error-handling path to prevent a memory leak. 
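The unwind idiom involved can be shown with a small userspace sketch (buffer names are generic placeholders, not the regmap-irq fields): every buffer allocated before a later failure must be released on the error path, conventionally in reverse order of allocation, otherwise it leaks on each failed setup.

#include <stdio.h>
#include <stdlib.h>

/* Allocate three buffers; optionally fail after the last allocation. */
static int setup(int fail_late)
{
	char *status_buf, *main_status_buf, *mask_buf;

	status_buf = malloc(32);
	if (!status_buf)
		goto err;

	main_status_buf = malloc(32);
	if (!main_status_buf)
		goto err_free_status;

	mask_buf = malloc(32);
	if (!mask_buf || fail_late)
		goto err_free_mask;

	/* Success path of the demo: release everything and report success. */
	free(mask_buf);
	free(main_status_buf);
	free(status_buf);
	return 0;

err_free_mask:
	free(mask_buf);		/* free(NULL) is a safe no-op */
	free(main_status_buf);	/* the release this kind of fix adds: without it, the buffer leaks */
err_free_status:
	free(status_buf);
err:
	return -1;
}

int main(void)
{
	printf("%d\n", setup(1));	/* -1, but nothing is leaked on the way out */
	return 0;
}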
Fixes: a2d21848d921 ("regmap: regmap-irq: Add main status register support") Cc: stable@vger.kernel.org # v5.1+ Signed-off-by: Jiasheng Jiang Link: https://patch.msgid.link/20250205004343.14413-1-jiashengjiangcool@gmail.com Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-irq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c index 0bcd81389a29f..978613407ea3c 100644 --- a/drivers/base/regmap/regmap-irq.c +++ b/drivers/base/regmap/regmap-irq.c @@ -906,6 +906,7 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode, kfree(d->wake_buf); kfree(d->mask_buf_def); kfree(d->mask_buf); + kfree(d->main_status_buf); kfree(d->status_buf); kfree(d->status_reg_buf); if (d->config_buf) { @@ -981,6 +982,7 @@ void regmap_del_irq_chip(int irq, struct regmap_irq_chip_data *d) kfree(d->wake_buf); kfree(d->mask_buf_def); kfree(d->mask_buf); + kfree(d->main_status_buf); kfree(d->status_reg_buf); kfree(d->status_buf); if (d->config_buf) { -- GitLab From b0eddc21900fb44f8c5db95710479865e3700fbd Mon Sep 17 00:00:00 2001 From: Varadarajan Narayanan Date: Wed, 5 Feb 2025 13:16:56 +0530 Subject: [PATCH 244/989] regulator: qcom_smd: Add l2, l5 sub-node to mp5496 regulator Adding l2, l5 sub-node entry to mp5496 regulator node. Cc: stable@vger.kernel.org Acked-by: Rob Herring Signed-off-by: Varadarajan Narayanan Link: https://patch.msgid.link/20250205074657.4142365-2-quic_varada@quicinc.com Signed-off-by: Mark Brown --- .../devicetree/bindings/regulator/qcom,smd-rpm-regulator.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/regulator/qcom,smd-rpm-regulator.yaml b/Documentation/devicetree/bindings/regulator/qcom,smd-rpm-regulator.yaml index f2fd2df68a9ed..b7241ce975b96 100644 --- a/Documentation/devicetree/bindings/regulator/qcom,smd-rpm-regulator.yaml +++ b/Documentation/devicetree/bindings/regulator/qcom,smd-rpm-regulator.yaml @@ -22,7 +22,7 @@ description: Each sub-node is identified using the node's name, with valid values listed for each of the pmics below. - For mp5496, s1, s2 + For mp5496, s1, s2, l2, l5 For pm2250, s1, s2, s3, s4, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15, l16, l17, l18, l19, l20, l21, l22 -- GitLab From c4d3dfd8ccaef2cbd374860e307f1e056854a472 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 5 Feb 2025 14:21:36 +0100 Subject: [PATCH 245/989] Revert "i2c: Replace list-based mechanism for handling userspace-created clients" This reverts commit 3cfe39b3a845593a485ab1c716615979004ef9f6. Mux handling is not sufficiently implemented. It needs more time. 
Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core-base.c | 61 ++++++++++++++++++++++++------------- include/linux/i2c.h | 7 ++++- 2 files changed, 45 insertions(+), 23 deletions(-) diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index 5546184df05f9..ddac2f1557180 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -1300,12 +1300,14 @@ new_device_store(struct device *dev, struct device_attribute *attr, info.flags |= I2C_CLIENT_SLAVE; } - info.flags |= I2C_CLIENT_USER; - client = i2c_new_client_device(adap, &info); if (IS_ERR(client)) return PTR_ERR(client); + /* Keep track of the added device */ + mutex_lock(&adap->userspace_clients_lock); + list_add_tail(&client->detected, &adap->userspace_clients); + mutex_unlock(&adap->userspace_clients_lock); dev_info(dev, "%s: Instantiated device %s at 0x%02hx\n", "new_device", info.type, info.addr); @@ -1313,15 +1315,6 @@ new_device_store(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_WO(new_device); -static int __i2c_find_user_addr(struct device *dev, const void *addrp) -{ - struct i2c_client *client = i2c_verify_client(dev); - unsigned short addr = *(unsigned short *)addrp; - - return client && client->flags & I2C_CLIENT_USER && - i2c_encode_flags_to_addr(client) == addr; -} - /* * And of course let the users delete the devices they instantiated, if * they got it wrong. This interface can only be used to delete devices @@ -1336,7 +1329,7 @@ delete_device_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct i2c_adapter *adap = to_i2c_adapter(dev); - struct device *child_dev; + struct i2c_client *client, *next; unsigned short addr; char end; int res; @@ -1352,19 +1345,28 @@ delete_device_store(struct device *dev, struct device_attribute *attr, return -EINVAL; } - mutex_lock(&core_lock); /* Make sure the device was added through sysfs */ - child_dev = device_find_child(&adap->dev, &addr, __i2c_find_user_addr); - if (child_dev) { - i2c_unregister_device(i2c_verify_client(child_dev)); - put_device(child_dev); - } else { - dev_err(dev, "Can't find userspace-created device at %#x\n", addr); - count = -ENOENT; + res = -ENOENT; + mutex_lock_nested(&adap->userspace_clients_lock, + i2c_adapter_depth(adap)); + list_for_each_entry_safe(client, next, &adap->userspace_clients, + detected) { + if (i2c_encode_flags_to_addr(client) == addr) { + dev_info(dev, "%s: Deleting device %s at 0x%02hx\n", + "delete_device", client->name, client->addr); + + list_del(&client->detected); + i2c_unregister_device(client); + res = count; + break; + } } - mutex_unlock(&core_lock); + mutex_unlock(&adap->userspace_clients_lock); - return count; + if (res < 0) + dev_err(dev, "%s: Can't find device in list\n", + "delete_device"); + return res; } static DEVICE_ATTR_IGNORE_LOCKDEP(delete_device, S_IWUSR, NULL, delete_device_store); @@ -1535,6 +1537,8 @@ static int i2c_register_adapter(struct i2c_adapter *adap) adap->locked_flags = 0; rt_mutex_init(&adap->bus_lock); rt_mutex_init(&adap->mux_lock); + mutex_init(&adap->userspace_clients_lock); + INIT_LIST_HEAD(&adap->userspace_clients); /* Set default timeout to 1 second if not already set */ if (adap->timeout == 0) @@ -1726,6 +1730,7 @@ static int __unregister_dummy(struct device *dev, void *dummy) void i2c_del_adapter(struct i2c_adapter *adap) { struct i2c_adapter *found; + struct i2c_client *client, *next; /* First make sure that this adapter was ever added */ mutex_lock(&core_lock); @@ -1738,6 +1743,18 @@ void 
i2c_del_adapter(struct i2c_adapter *adap) i2c_acpi_remove_space_handler(adap); + /* Remove devices instantiated from sysfs */ + mutex_lock_nested(&adap->userspace_clients_lock, + i2c_adapter_depth(adap)); + list_for_each_entry_safe(client, next, &adap->userspace_clients, + detected) { + dev_dbg(&adap->dev, "Removing %s at 0x%x\n", client->name, + client->addr); + list_del(&client->detected); + i2c_unregister_device(client); + } + mutex_unlock(&adap->userspace_clients_lock); + /* Detach any active clients. This can't fail, thus we do not * check the returned value. This is a two-pass process, because * we can't remove the dummy devices during the first pass: they diff --git a/include/linux/i2c.h b/include/linux/i2c.h index c31fd1dba3bd2..4955d9e76c5fb 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -313,6 +313,8 @@ struct i2c_driver { * @dev: Driver model device node for the slave. * @init_irq: IRQ that was set at initialization * @irq: indicates the IRQ generated by this device (if any) + * @detected: member of an i2c_driver.clients list or i2c-core's + * userspace_devices list * @slave_cb: Callback when I2C slave mode of an adapter is used. The adapter * calls it to pass on slave events to the slave driver. * @devres_group_id: id of the devres group that will be created for resources @@ -333,7 +335,6 @@ struct i2c_client { #define I2C_CLIENT_HOST_NOTIFY 0x40 /* We want to use I2C host notify */ #define I2C_CLIENT_WAKE 0x80 /* for board_info; true iff can wake */ #define I2C_CLIENT_AUTO 0x100 /* client was auto-detected */ -#define I2C_CLIENT_USER 0x200 /* client was userspace-created */ #define I2C_CLIENT_SCCB 0x9000 /* Use Omnivision SCCB protocol */ /* Must match I2C_M_STOP|IGNORE_NAK */ @@ -345,6 +346,7 @@ struct i2c_client { struct device dev; /* the device structure */ int init_irq; /* irq set at initialization */ int irq; /* irq issued by device */ + struct list_head detected; #if IS_ENABLED(CONFIG_I2C_SLAVE) i2c_slave_cb_t slave_cb; /* callback for slave mode */ #endif @@ -751,6 +753,9 @@ struct i2c_adapter { char name[48]; struct completion dev_released; + struct mutex userspace_clients_lock; + struct list_head userspace_clients; + struct i2c_bus_recovery_info *bus_recovery_info; const struct i2c_adapter_quirks *quirks; -- GitLab From 3bfa08fe9ec8dd79e183c88e1275be74191e7bc8 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 5 Feb 2025 14:22:12 +0100 Subject: [PATCH 246/989] Revert "i2c: Replace list-based mechanism for handling auto-detected clients" This reverts commit 56a50667cbcfaf95eea9128d5676af94e54b51a8. Mux handling is not sufficiently implemented. It needs more time. 
Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core-base.c | 52 +++++++++++++++++++++++++------------ include/linux/i2c.h | 3 ++- 2 files changed, 38 insertions(+), 17 deletions(-) diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index ddac2f1557180..35a221e2c11c1 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -1704,6 +1704,23 @@ int i2c_add_numbered_adapter(struct i2c_adapter *adap) } EXPORT_SYMBOL_GPL(i2c_add_numbered_adapter); +static void i2c_do_del_adapter(struct i2c_driver *driver, + struct i2c_adapter *adapter) +{ + struct i2c_client *client, *_n; + + /* Remove the devices we created ourselves as the result of hardware + * probing (using a driver's detect method) */ + list_for_each_entry_safe(client, _n, &driver->clients, detected) { + if (client->adapter == adapter) { + dev_dbg(&adapter->dev, "Removing %s at 0x%x\n", + client->name, client->addr); + list_del(&client->detected); + i2c_unregister_device(client); + } + } +} + static int __unregister_client(struct device *dev, void *dummy) { struct i2c_client *client = i2c_verify_client(dev); @@ -1719,6 +1736,12 @@ static int __unregister_dummy(struct device *dev, void *dummy) return 0; } +static int __process_removed_adapter(struct device_driver *d, void *data) +{ + i2c_do_del_adapter(to_i2c_driver(d), data); + return 0; +} + /** * i2c_del_adapter - unregister I2C adapter * @adap: the adapter being unregistered @@ -1742,6 +1765,11 @@ void i2c_del_adapter(struct i2c_adapter *adap) } i2c_acpi_remove_space_handler(adap); + /* Tell drivers about this removal */ + mutex_lock(&core_lock); + bus_for_each_drv(&i2c_bus_type, NULL, adap, + __process_removed_adapter); + mutex_unlock(&core_lock); /* Remove devices instantiated from sysfs */ mutex_lock_nested(&adap->userspace_clients_lock, @@ -1760,10 +1788,8 @@ void i2c_del_adapter(struct i2c_adapter *adap) * we can't remove the dummy devices during the first pass: they * could have been instantiated by real devices wishing to clean * them up properly, so we give them a chance to do that first. */ - mutex_lock(&core_lock); device_for_each_child(&adap->dev, NULL, __unregister_client); device_for_each_child(&adap->dev, NULL, __unregister_dummy); - mutex_unlock(&core_lock); /* device name is gone after device_unregister */ dev_dbg(&adap->dev, "adapter [%s] unregistered\n", adap->name); @@ -1983,6 +2009,7 @@ int i2c_register_driver(struct module *owner, struct i2c_driver *driver) /* add the driver to the list of i2c drivers in the driver core */ driver->driver.owner = owner; driver->driver.bus = &i2c_bus_type; + INIT_LIST_HEAD(&driver->clients); /* When registration returns, the driver core * will have called probe() for all matching-but-unbound devices. 
@@ -2000,13 +2027,10 @@ int i2c_register_driver(struct module *owner, struct i2c_driver *driver) } EXPORT_SYMBOL(i2c_register_driver); -static int __i2c_unregister_detected_client(struct device *dev, void *argp) +static int __process_removed_driver(struct device *dev, void *data) { - struct i2c_client *client = i2c_verify_client(dev); - - if (client && client->flags & I2C_CLIENT_AUTO) - i2c_unregister_device(client); - + if (dev->type == &i2c_adapter_type) + i2c_do_del_adapter(data, to_i2c_adapter(dev)); return 0; } @@ -2017,12 +2041,7 @@ static int __i2c_unregister_detected_client(struct device *dev, void *argp) */ void i2c_del_driver(struct i2c_driver *driver) { - mutex_lock(&core_lock); - /* Satisfy __must_check, function can't fail */ - if (driver_for_each_device(&driver->driver, NULL, NULL, - __i2c_unregister_detected_client)) { - } - mutex_unlock(&core_lock); + i2c_for_each_dev(driver, __process_removed_driver); driver_unregister(&driver->driver); pr_debug("driver [%s] unregistered\n", driver->driver.name); @@ -2449,7 +2468,6 @@ static int i2c_detect_address(struct i2c_client *temp_client, /* Finally call the custom detection function */ memset(&info, 0, sizeof(struct i2c_board_info)); info.addr = addr; - info.flags = I2C_CLIENT_AUTO; err = driver->detect(temp_client, &info); if (err) { /* -ENODEV is returned if the detection fails. We catch it @@ -2476,7 +2494,9 @@ static int i2c_detect_address(struct i2c_client *temp_client, dev_dbg(&adapter->dev, "Creating %s at 0x%02x\n", info.type, info.addr); client = i2c_new_client_device(adapter, &info); - if (IS_ERR(client)) + if (!IS_ERR(client)) + list_add_tail(&client->detected, &driver->clients); + else dev_err(&adapter->dev, "Failed creating %s at 0x%02x\n", info.type, info.addr); } diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 4955d9e76c5fb..2b2af24d2a436 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -244,6 +244,7 @@ enum i2c_driver_flags { * @id_table: List of I2C devices supported by this driver * @detect: Callback for device detection * @address_list: The I2C addresses to probe (for detect) + * @clients: List of detected clients we created (for i2c-core use only) * @flags: A bitmask of flags defined in &enum i2c_driver_flags * * The driver.owner field should be set to the module owner of this driver. @@ -298,6 +299,7 @@ struct i2c_driver { /* Device detection callback for automatic device creation */ int (*detect)(struct i2c_client *client, struct i2c_board_info *info); const unsigned short *address_list; + struct list_head clients; u32 flags; }; @@ -334,7 +336,6 @@ struct i2c_client { #define I2C_CLIENT_SLAVE 0x20 /* we are the slave */ #define I2C_CLIENT_HOST_NOTIFY 0x40 /* We want to use I2C host notify */ #define I2C_CLIENT_WAKE 0x80 /* for board_info; true iff can wake */ -#define I2C_CLIENT_AUTO 0x100 /* client was auto-detected */ #define I2C_CLIENT_SCCB 0x9000 /* Use Omnivision SCCB protocol */ /* Must match I2C_M_STOP|IGNORE_NAK */ -- GitLab From 015b7dae084fa95465ff89f6cbf15fe49906a370 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 3 Feb 2025 12:01:23 +0100 Subject: [PATCH 247/989] gpio: sim: lock hog configfs items if present Depending on the user config, the leaf entry may be the hog directory, not line. Check it and lock the correct item. 
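The selection the fix makes can be illustrated with a trivial sketch (the structures and names below are simplified placeholders, not the real configfs types): the entry to pin is the deepest one the live device depends on, which is the hog directory when a line has one and the line directory otherwise.

#include <stdio.h>

struct item {
	const char *name;
};

struct line {
	struct item group;	/* the .../lineX directory itself */
	struct item *hog;	/* optional .../lineX/hog child directory */
};

/* Return the leaf entry that must be pinned for this line. */
static struct item *leaf_item(struct line *line)
{
	return line->hog ? line->hog : &line->group;
}

int main(void)
{
	struct item hog = { "line0/hog" };
	struct line hogged = { .group = { "line0" }, .hog = &hog };
	struct line plain  = { .group = { "line1" }, .hog = NULL };

	printf("%s\n", leaf_item(&hogged)->name);	/* line0/hog */
	printf("%s\n", leaf_item(&plain)->name);	/* line1 */
	return 0;
}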
Fixes: 8bd76b3d3f3a ("gpio: sim: lock up configfs that an instantiated device depends on") Tested-by: Koichiro Den Link: https://lore.kernel.org/r/20250203110123.87701-1-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-sim.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/gpio/gpio-sim.c b/drivers/gpio/gpio-sim.c index a086087ada177..b6c230fab8404 100644 --- a/drivers/gpio/gpio-sim.c +++ b/drivers/gpio/gpio-sim.c @@ -1028,20 +1028,23 @@ gpio_sim_device_lockup_configfs(struct gpio_sim_device *dev, bool lock) struct configfs_subsystem *subsys = dev->group.cg_subsys; struct gpio_sim_bank *bank; struct gpio_sim_line *line; + struct config_item *item; /* - * The device only needs to depend on leaf line entries. This is + * The device only needs to depend on leaf entries. This is * sufficient to lock up all the configfs entries that the * instantiated, alive device depends on. */ list_for_each_entry(bank, &dev->bank_list, siblings) { list_for_each_entry(line, &bank->line_list, siblings) { + item = line->hog ? &line->hog->item + : &line->group.cg_item; + if (lock) - WARN_ON(configfs_depend_item_unlocked( - subsys, &line->group.cg_item)); + WARN_ON(configfs_depend_item_unlocked(subsys, + item)); else - configfs_undepend_item_unlocked( - &line->group.cg_item); + configfs_undepend_item_unlocked(item); } } } -- GitLab From 5393f40a640b8c4f716bf87e7b0d4328bf1f22b2 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 5 Feb 2025 14:05:03 +0100 Subject: [PATCH 248/989] gpio: GPIO_GRGPIO should depend on OF While the Aeroflex Gaisler GRGPIO driver has no build-time dependency on gpiolib-of, it supports only DT-based configuration, and is used only on DT systems. Hence add a dependency on OF, to prevent asking the user about this driver when configuring a kernel without DT support. Fixes: bc40668def384256 ("gpio: grgpio: drop Kconfig dependency on OF_GPIO") Signed-off-by: Geert Uytterhoeven Reviewed-by: Andreas Larsson Link: https://lore.kernel.org/r/db6da3d11bf850d89f199e5c740d8f133e38078d.1738760539.git.geert+renesas@glider.be Signed-off-by: Bartosz Golaszewski --- drivers/gpio/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index add5ad29a673c..98b4d1633b258 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -338,6 +338,7 @@ config GPIO_GRANITERAPIDS config GPIO_GRGPIO tristate "Aeroflex Gaisler GRGPIO support" + depends on OF || COMPILE_TEST select GPIO_GENERIC select IRQ_DOMAIN help -- GitLab From 59ff2040f0a58923c787fdba5999100667338230 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 4 Feb 2025 13:45:15 +0200 Subject: [PATCH 249/989] MAINTAINERS: Use my kernel.org address for ACPI GPIO work Switch to use my kernel.org address for ACPI GPIO work. 
Signed-off-by: Mika Westerberg Link: https://lore.kernel.org/r/20250204114515.3971923-1-mika.westerberg@linux.intel.com Signed-off-by: Bartosz Golaszewski --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 896a307fa0654..d1389ca6699df 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9878,7 +9878,7 @@ S: Maintained F: drivers/staging/gpib/ GPIO ACPI SUPPORT -M: Mika Westerberg +M: Mika Westerberg M: Andy Shevchenko L: linux-gpio@vger.kernel.org L: linux-acpi@vger.kernel.org -- GitLab From 738fc998b639407346a9e026514f0562301462cd Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 31 Jan 2025 15:55:28 -0700 Subject: [PATCH 250/989] scripts/Makefile.extrawarn: Do not show clang's non-kprintf warnings at W=1 Clang's -Wformat-overflow and -Wformat-truncation have chosen to check '%p' unlike GCC but it does not know about the kernel's pointer extensions in lib/vsprintf.c, so the developers split that part of the warning out for the kernel to disable because there will always be false positives. Commit 908dd508276d ("kbuild: enable -Wformat-truncation on clang") did disabled these warnings but only in a block that would be called when W=1 was not passed, so they would appear with W=1. Move the disabling of the non-kprintf warnings to a block that always runs so that they are never seen, regardless of warning level. Fixes: 908dd508276d ("kbuild: enable -Wformat-truncation on clang") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501291646.VtwF98qd-lkp@intel.com/ Signed-off-by: Nathan Chancellor Signed-off-by: Masahiro Yamada --- scripts/Makefile.extrawarn | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn index eb719f6d8d536..e976790f84dc6 100644 --- a/scripts/Makefile.extrawarn +++ b/scripts/Makefile.extrawarn @@ -31,6 +31,11 @@ KBUILD_CFLAGS-$(CONFIG_CC_NO_ARRAY_BOUNDS) += -Wno-array-bounds ifdef CONFIG_CC_IS_CLANG # The kernel builds with '-std=gnu11' so use of GNU extensions is acceptable. KBUILD_CFLAGS += -Wno-gnu + +# Clang checks for overflow/truncation with '%p', while GCC does not: +# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111219 +KBUILD_CFLAGS += $(call cc-disable-warning, format-overflow-non-kprintf) +KBUILD_CFLAGS += $(call cc-disable-warning, format-truncation-non-kprintf) else # gcc inanely warns about local variables called 'main' @@ -105,11 +110,6 @@ KBUILD_CFLAGS += $(call cc-disable-warning, packed-not-aligned) KBUILD_CFLAGS += $(call cc-disable-warning, format-overflow) ifdef CONFIG_CC_IS_GCC KBUILD_CFLAGS += $(call cc-disable-warning, format-truncation) -else -# Clang checks for overflow/truncation with '%p', while GCC does not: -# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111219 -KBUILD_CFLAGS += $(call cc-disable-warning, format-overflow-non-kprintf) -KBUILD_CFLAGS += $(call cc-disable-warning, format-truncation-non-kprintf) endif KBUILD_CFLAGS += $(call cc-disable-warning, stringop-truncation) -- GitLab From 4c56eb33e603c3b9eb4bd24efbfdd0283c1c37e4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 2 Feb 2025 03:51:41 +0900 Subject: [PATCH 251/989] kbuild: keep symbols for symbol_get() even with CONFIG_TRIM_UNUSED_KSYMS Linus observed that the symbol_request(utf8_data_table) call fails when CONFIG_UNICODE=y and CONFIG_TRIM_UNUSED_KSYMS=y. symbol_get() relies on the symbol data being present in the ksymtab for symbol lookups. 
However, EXPORT_SYMBOL_GPL(utf8_data_table) is dropped due to CONFIG_TRIM_UNUSED_KSYMS, as no module references it in this case. Probably, this has been broken since commit dbacb0ef670d ("kconfig option for TRIM_UNUSED_KSYMS"). This commit addresses the issue by leveraging modpost. Symbol names passed to symbol_get() are recorded in the special .no_trim_symbol section, which is then parsed by modpost to forcibly keep such symbols. The .no_trim_symbol section is discarded by the linker scripts, so there is no impact on the size of the final vmlinux or modules. This commit cannot resolve the issue for direct calls to __symbol_get() because the symbol name is not known at compile-time. Although symbol_get() may eventually be deprecated, this workaround should be good enough meanwhile. Reported-by: Linus Torvalds Suggested-by: Linus Torvalds Signed-off-by: Masahiro Yamada --- include/asm-generic/vmlinux.lds.h | 1 + include/linux/module.h | 5 ++++- scripts/mod/modpost.c | 35 +++++++++++++++++++++++++++++++ scripts/mod/modpost.h | 6 ++++++ scripts/module.lds.S | 1 + 5 files changed, 47 insertions(+), 1 deletion(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 54504013c7491..02a4adb4a9999 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -1038,6 +1038,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) *(.discard) \ *(.discard.*) \ *(.export_symbol) \ + *(.no_trim_symbol) \ *(.modinfo) \ /* ld.bfd warns about .gnu.version* even when not emitted */ \ *(.gnu.version*) \ diff --git a/include/linux/module.h b/include/linux/module.h index 23792d5d7b74b..30e5b19bafa98 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -306,7 +306,10 @@ extern int modules_disabled; /* for sysctl */ /* Get/put a kernel symbol (calls must be symmetric) */ void *__symbol_get(const char *symbol); void *__symbol_get_gpl(const char *symbol); -#define symbol_get(x) ((typeof(&x))(__symbol_get(__stringify(x)))) +#define symbol_get(x) ({ \ + static const char __notrim[] \ + __used __section(".no_trim_symbol") = __stringify(x); \ + (typeof(&x))(__symbol_get(__stringify(x))); }) /* modules using other modules: kdb wants to see this. */ struct module_use { diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index e18ae7dc8140a..36b28987a2f07 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -507,6 +507,9 @@ static int parse_elf(struct elf_info *info, const char *filename) info->modinfo_len = sechdrs[i].sh_size; } else if (!strcmp(secname, ".export_symbol")) { info->export_symbol_secndx = i; + } else if (!strcmp(secname, ".no_trim_symbol")) { + info->no_trim_symbol = (void *)hdr + sechdrs[i].sh_offset; + info->no_trim_symbol_len = sechdrs[i].sh_size; } if (sechdrs[i].sh_type == SHT_SYMTAB) { @@ -1566,6 +1569,14 @@ static void read_symbols(const char *modname) /* strip trailing .o */ mod = new_module(modname, strlen(modname) - strlen(".o")); + /* save .no_trim_symbol section for later use */ + if (info.no_trim_symbol_len) { + mod->no_trim_symbol = xmalloc(info.no_trim_symbol_len); + memcpy(mod->no_trim_symbol, info.no_trim_symbol, + info.no_trim_symbol_len); + mod->no_trim_symbol_len = info.no_trim_symbol_len; + } + if (!mod->is_vmlinux) { license = get_modinfo(&info, "license"); if (!license) @@ -1728,6 +1739,28 @@ static void handle_white_list_exports(const char *white_list) free(buf); } +/* + * Keep symbols recorded in the .no_trim_symbol section. 
This is necessary to + * prevent CONFIG_TRIM_UNUSED_KSYMS from dropping EXPORT_SYMBOL because + * symbol_get() relies on the symbol being present in the ksymtab for lookups. + */ +static void keep_no_trim_symbols(struct module *mod) +{ + unsigned long size = mod->no_trim_symbol_len; + + for (char *s = mod->no_trim_symbol; s; s = next_string(s , &size)) { + struct symbol *sym; + + /* + * If find_symbol() returns NULL, this symbol is not provided + * by any module, and symbol_get() will fail. + */ + sym = find_symbol(s); + if (sym) + sym->used = true; + } +} + static void check_modname_len(struct module *mod) { const char *mod_name; @@ -2254,6 +2287,8 @@ int main(int argc, char **argv) read_symbols_from_files(files_source); list_for_each_entry(mod, &modules, list) { + keep_no_trim_symbols(mod); + if (mod->dump_file || mod->is_vmlinux) continue; diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index ffd0a52a606ef..59366f456b765 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -111,6 +111,8 @@ struct module_alias { * * @dump_file: path to the .symvers file if loaded from a file * @aliases: list head for module_aliases + * @no_trim_symbol: .no_trim_symbol section data + * @no_trim_symbol_len: length of the .no_trim_symbol section */ struct module { struct list_head list; @@ -128,6 +130,8 @@ struct module { // Actual imported namespaces struct list_head imported_namespaces; struct list_head aliases; + char *no_trim_symbol; + unsigned int no_trim_symbol_len; char name[]; }; @@ -141,6 +145,8 @@ struct elf_info { char *strtab; char *modinfo; unsigned int modinfo_len; + char *no_trim_symbol; + unsigned int no_trim_symbol_len; /* support for 32bit section numbers */ diff --git a/scripts/module.lds.S b/scripts/module.lds.S index c2f80f9141d40..450f1088d5fd3 100644 --- a/scripts/module.lds.S +++ b/scripts/module.lds.S @@ -16,6 +16,7 @@ SECTIONS { *(.discard) *(.discard.*) *(.export_symbol) + *(.no_trim_symbol) } __ksymtab 0 : ALIGN(8) { *(SORT(___ksymtab+*)) } -- GitLab From 98a5cfd2320966f40fe049a9855f8787f0126825 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 5 Feb 2025 09:43:31 +0100 Subject: [PATCH 252/989] x86/xen: fix xen_hypercall_hvm() to not clobber %rbx xen_hypercall_hvm(), which is used when running as a Xen PVH guest at most only once during early boot, is clobbering %rbx. Depending on whether the caller relies on %rbx to be preserved across the call or not, this clobbering might result in an early crash of the system. This can be avoided by using an already saved register instead of %rbx. Fixes: b4845bb63838 ("x86/xen: add central hypercall functions") Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Reviewed-by: Andrew Cooper Signed-off-by: Juergen Gross --- arch/x86/xen/xen-head.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 9252652afe596..72f28d66e0e52 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -117,8 +117,8 @@ SYM_FUNC_START(xen_hypercall_hvm) pop %ebx pop %eax #else - lea xen_hypercall_amd(%rip), %rbx - cmp %rax, %rbx + lea xen_hypercall_amd(%rip), %rcx + cmp %rax, %rcx #ifdef CONFIG_FRAME_POINTER pop %rax /* Dummy pop. */ #endif -- GitLab From 0bd797b801bd8ee06c822844e20d73aaea0878dd Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 5 Feb 2025 10:07:56 +0100 Subject: [PATCH 253/989] x86/xen: add FRAME_END to xen_hypercall_hvm() xen_hypercall_hvm() is missing a FRAME_END at the end, add it. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202502030848.HTNTTuo9-lkp@intel.com/ Fixes: b4845bb63838 ("x86/xen: add central hypercall functions") Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Reviewed-by: Andrew Cooper Signed-off-by: Juergen Gross --- arch/x86/xen/xen-head.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 72f28d66e0e52..4e481b0eefc96 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -132,6 +132,7 @@ SYM_FUNC_START(xen_hypercall_hvm) pop %rcx pop %rax #endif + FRAME_END /* Use correct hypercall function. */ jz xen_hypercall_amd jmp xen_hypercall_intel -- GitLab From aaf5eefd374b6e006e1c224a2b37bac9d3737aa2 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 5 Feb 2025 11:24:47 +0100 Subject: [PATCH 254/989] x86/xen: remove unneeded dummy push from xen_hypercall_hvm() Stack alignment of the kernel in 64-bit mode is 8, not 16, so the dummy push in xen_hypercall_hvm() for aligning the stack to 16 bytes can be removed. Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Reviewed-by: Andrew Cooper Signed-off-by: Juergen Gross --- arch/x86/xen/xen-head.S | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 4e481b0eefc96..894edf8d6d62f 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -100,9 +100,6 @@ SYM_FUNC_START(xen_hypercall_hvm) push %r10 push %r9 push %r8 -#ifdef CONFIG_FRAME_POINTER - pushq $0 /* Dummy push for stack alignment. */ -#endif #endif /* Set the vendor specific function. */ call __xen_hypercall_setfunc @@ -119,9 +116,6 @@ SYM_FUNC_START(xen_hypercall_hvm) #else lea xen_hypercall_amd(%rip), %rcx cmp %rax, %rcx -#ifdef CONFIG_FRAME_POINTER - pop %rax /* Dummy pop. */ -#endif pop %r8 pop %r9 pop %r10 -- GitLab From d364eee14c682b141f4667efc3c65191339d88bd Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Wed, 5 Feb 2025 11:25:12 +0000 Subject: [PATCH 255/989] cpufreq/amd-pstate: Remove the goto label in amd_pstate_update_limits Scope based guard/cleanup macros should not be used together with goto labels. Hence, remove the goto label. 
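[ A rough sketch of the pattern, with hypothetical names, showing why the goto
  label becomes unnecessary once guard() owns the unlock:

	static void update_limits(struct policy *p)
	{
		guard(mutex)(&driver_lock);

		if (get_highest_perf(p))
			return;		/* mutex dropped by the guard */

		/* ... rest of the update ... */
	}				/* mutex dropped here as well */

  With scope-based cleanup every return path runs the unlock automatically, so
  error handling no longer needs to funnel through a goto label. ]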
Fixes: 6c093d5a5b73 ("cpufreq/amd-pstate: convert mutex use to guard()") Signed-off-by: Dhananjay Ugwekar Reviewed-by: Mario Limonciello Link: https://lore.kernel.org/r/20250205112523.201101-2-dhananjay.ugwekar@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 7120f035c0be4..b163c16998218 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -838,8 +838,10 @@ static void amd_pstate_update_limits(unsigned int cpu) guard(mutex)(&amd_pstate_driver_lock); ret = amd_get_highest_perf(cpu, &cur_high); - if (ret) - goto free_cpufreq_put; + if (ret) { + cpufreq_cpu_put(policy); + return; + } prev_high = READ_ONCE(cpudata->prefcore_ranking); highest_perf_changed = (prev_high != cur_high); @@ -849,8 +851,6 @@ static void amd_pstate_update_limits(unsigned int cpu) if (cur_high < CPPC_MAX_PERF) sched_set_itmt_core_prio((int)cur_high, cpu); } - -free_cpufreq_put: cpufreq_cpu_put(policy); if (!highest_perf_changed) -- GitLab From 55db9b73c3a77544efc671d5e796d9674772c330 Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Wed, 5 Feb 2025 11:25:13 +0000 Subject: [PATCH 256/989] cpufreq/amd-pstate: Fix max_perf updation with schedutil In adjust_perf() callback, we are setting the max_perf to highest_perf, as opposed to the correct limit value i.e. max_limit_perf. Fix that. Fixes: 3f7b835fa4d0 ("cpufreq/amd-pstate: Move limit updating code") Signed-off-by: Dhananjay Ugwekar Reviewed-by: Mario Limonciello Link: https://lore.kernel.org/r/20250205112523.201101-3-dhananjay.ugwekar@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index b163c16998218..9dc3933bc3261 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -699,7 +699,7 @@ static void amd_pstate_adjust_perf(unsigned int cpu, if (min_perf < lowest_nonlinear_perf) min_perf = lowest_nonlinear_perf; - max_perf = cap_perf; + max_perf = cpudata->max_limit_perf; if (max_perf < min_perf) max_perf = min_perf; -- GitLab From 038e33fcd40e59b60cdca561c2a39998e6759e08 Mon Sep 17 00:00:00 2001 From: Lukasz Majewski Date: Thu, 9 Jan 2025 16:41:49 +0100 Subject: [PATCH 257/989] dt-bindings: display: Add powertip,{st7272|hx8238a} as DT Schema description This patch provides the DT Schema description of: - powertip,st7272 320 x 240 LCD display - powertip,hx8238a 320 x 240 LCD display Used with the different HW revisions of btt3 devices. 
Signed-off-by: Lukasz Majewski Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20250109154149.1212631-1-lukma@denx.de Signed-off-by: Rob Herring (Arm) --- .../display/panel/powertip,hx8238a.yaml | 29 +++++++++++++++++++ .../display/panel/powertip,st7272.yaml | 29 +++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 Documentation/devicetree/bindings/display/panel/powertip,hx8238a.yaml create mode 100644 Documentation/devicetree/bindings/display/panel/powertip,st7272.yaml diff --git a/Documentation/devicetree/bindings/display/panel/powertip,hx8238a.yaml b/Documentation/devicetree/bindings/display/panel/powertip,hx8238a.yaml new file mode 100644 index 0000000000000..b7d74faeb5d50 --- /dev/null +++ b/Documentation/devicetree/bindings/display/panel/powertip,hx8238a.yaml @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/display/panel/powertip,hx8238a.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Powertip Electronic Technology Co. 320 x 240 LCD panel + +maintainers: + - Lukasz Majewski + +allOf: + - $ref: panel-dpi.yaml# + +properties: + compatible: + items: + - const: powertip,hx8238a + - {} # panel-dpi, but not listed here to avoid false select + + height-mm: true + panel-timing: true + port: true + power-supply: true + width-mm: true + +additionalProperties: false + +... diff --git a/Documentation/devicetree/bindings/display/panel/powertip,st7272.yaml b/Documentation/devicetree/bindings/display/panel/powertip,st7272.yaml new file mode 100644 index 0000000000000..f3622800f13f6 --- /dev/null +++ b/Documentation/devicetree/bindings/display/panel/powertip,st7272.yaml @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/display/panel/powertip,st7272.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Powertip Electronic Technology Co. 320 x 240 LCD panel + +maintainers: + - Lukasz Majewski + +allOf: + - $ref: panel-dpi.yaml# + +properties: + compatible: + items: + - const: powertip,st7272 + - {} # panel-dpi, but not listed here to avoid false select + + height-mm: true + panel-timing: true + port: true + power-supply: true + width-mm: true + +additionalProperties: false + +... -- GitLab From 069504f1fcfa1532e4e221290df428b15bd9d284 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Tue, 4 Feb 2025 17:49:25 +0200 Subject: [PATCH 258/989] drm/i915/dp: Fix potential infinite loop in 128b/132b SST Passing 0 as the step only works when there are other reasons to break out of the BPP loop in intel_dp_mtp_tu_compute_config(). Otherwise, an infinite loop might occur. Fix it by explicitly checking for 0 step. 
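[ Generic illustration of the failure mode, not code from the driver; the loop
  body here is hypothetical:

	/* with step == 0, bpp never changes, so unless some other break
	 * condition fires the loop can spin forever
	 */
	for (bpp = max_bpp; bpp >= min_bpp; bpp -= step) {
		if (try_config(bpp) == 0)
			break;
	}

  Treating step == 0 as "try once" and breaking out explicitly, as done in the
  diff below, avoids the hang. ]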
Fixes: ef0a0757bbea ("drm/i915/dp: compute config for 128b/132b SST w/o DSC") Reported-by: Imre Deak Closes: https://lore.kernel.org/r/Z6I0knh2Kt5T0JrT@ideak-desk.fi.intel.com Reviewed-by: Imre Deak Link: https://patchwork.freedesktop.org/patch/msgid/20250204154925.3001781-1-jani.nikula@intel.com Signed-off-by: Jani Nikula (cherry picked from commit a40e718d34d3d02c781c295466b013415f68c4f1) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_dp_mst.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_dp_mst.c b/drivers/gpu/drm/i915/display/intel_dp_mst.c index 0c44fc7dd86ce..a65cf97ad12df 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_mst.c +++ b/drivers/gpu/drm/i915/display/intel_dp_mst.c @@ -341,6 +341,10 @@ int intel_dp_mtp_tu_compute_config(struct intel_dp *intel_dp, break; } + + /* Allow using zero step to indicate one try */ + if (!step) + break; } if (slots < 0) { -- GitLab From 90508a1bb8f00618fa12cb2ad2276bc783656fc5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 3 Feb 2025 16:24:15 +0530 Subject: [PATCH 259/989] cpufreq: airoha: modify CONFIG_OF dependency Compile-testing without CONFIG_OF leads to a harmless build warning: drivers/cpufreq/airoha-cpufreq.c:109:34: error: 'airoha_cpufreq_match_list' defined but not used [-Werror=unused-const-variable=] 109 | static const struct of_device_id airoha_cpufreq_match_list[] __initconst = { | ^~~~~~~~~~~~~~~~~~~~~~~~~ It would be possible to mark the variable as __maybe_unused to shut up that warning, but a Kconfig dependency seems more appropriate as this still allows build testing in allmodconfig and randconfig builds on all architectures. An earlier commit, b865a8404642 ("cpufreq: airoha: Depends on OF"), tried to fix it incorrectly. ARCH_AIROHA already requires CONFIG_OF, so this change does nothing, and the dependency is still missing for the COMPILE_TEST case. Fix it properly. Fixes: 84cf9e541ccc ("cpufreq: airoha: Add EN7581 CPUFreq SMCCC driver") Fixes: b865a8404642 ("cpufreq: airoha: Depends on OF") Signed-off-by: Arnd Bergmann [ Viresh: updated commit log and fixed rebase conflict ] Signed-off-by: Viresh Kumar Link: https://patch.msgid.link/9d51d2710061dfa7f2568287c6ed125b858b7318.1738580005.git.viresh.kumar@linaro.org Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/Kconfig.arm | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index 0ee5c691fb36b..9e46960f6a862 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -17,7 +17,8 @@ config ARM_ALLWINNER_SUN50I_CPUFREQ_NVMEM config ARM_AIROHA_SOC_CPUFREQ tristate "Airoha EN7581 SoC CPUFreq support" - depends on (ARCH_AIROHA && OF) || COMPILE_TEST + depends on ARCH_AIROHA || COMPILE_TEST + depends on OF select PM_OPP default ARCH_AIROHA help -- GitLab From 0813fd2e14ca6ecd4e6ba005a9766f08e26020d7 Mon Sep 17 00:00:00 2001 From: Aboorva Devarajan Date: Wed, 5 Feb 2025 23:43:47 +0530 Subject: [PATCH 260/989] cpufreq: prevent NULL dereference in cpufreq_online() Ensure cpufreq_driver->set_boost is non-NULL before using it in cpufreq_online() to prevent a potential NULL pointer dereference. 
Reported-by: Gautam Menghani Closes: https://lore.kernel.org/all/c9e56c5f54cc33338762c94e9bed7b5a0d5de812.camel@linux.ibm.com/ Fixes: dd016f379ebc ("cpufreq: Introduce a more generic way to set default per-policy boost flag") Suggested-by: Viresh Kumar Signed-off-by: Aboorva Devarajan Link: https://patch.msgid.link/20250205181347.2079272-1-aboorvad@linux.ibm.com [ rjw: Minor edits in the subject and changelog ] Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index e0048856eceee..30ffbddc7ecec 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1571,7 +1571,8 @@ static int cpufreq_online(unsigned int cpu) policy->cdev = of_cpufreq_cooling_register(policy); /* Let the per-policy boost flag mirror the cpufreq_driver boost during init */ - if (policy->boost_enabled != cpufreq_boost_enabled()) { + if (cpufreq_driver->set_boost && + policy->boost_enabled != cpufreq_boost_enabled()) { policy->boost_enabled = cpufreq_boost_enabled(); ret = cpufreq_driver->set_boost(policy, policy->boost_enabled); if (ret) { -- GitLab From ab930483eca9f3e816c35824b5868599af0c61d7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 3 Feb 2025 21:46:29 +0200 Subject: [PATCH 261/989] ACPI: property: Fix return value for nval == 0 in acpi_data_prop_read() While analysing code for software and OF node for the corner case when caller asks to read zero items in the supposed to be an array of values I found that ACPI behaves differently to what OF does, i.e. 1. It returns -EINVAL when caller asks to read zero items from integer array, while OF returns 0, if no other errors happened. 2. It returns -EINVAL when caller asks to read zero items from string array, while OF returns -ENODATA, if no other errors happened. Amend ACPI implementation to follow what OF does. Fixes: b31384fa5de3 ("Driver core: Unified device properties interface for platform firmware") Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20250203194629.3731895-1-andriy.shevchenko@linux.intel.com [ rjw: Added empty line after a conditional ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/property.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 98d93ed583150..436019d96027b 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -1187,8 +1187,6 @@ static int acpi_data_prop_read(const struct acpi_device_data *data, } break; } - if (nval == 0) - return -EINVAL; if (obj->type == ACPI_TYPE_BUFFER) { if (proptype != DEV_PROP_U8) @@ -1212,9 +1210,11 @@ static int acpi_data_prop_read(const struct acpi_device_data *data, ret = acpi_copy_property_array_uint(items, (u64 *)val, nval); break; case DEV_PROP_STRING: - ret = acpi_copy_property_array_string( - items, (char **)val, - min_t(u32, nval, obj->package.count)); + nval = min_t(u32, nval, obj->package.count); + if (nval == 0) + return -ENODATA; + + ret = acpi_copy_property_array_string(items, (char **)val, nval); break; default: ret = -EINVAL; -- GitLab From 607ab6f85f4194b644ea95ac5fe660ef575db3b4 Mon Sep 17 00:00:00 2001 From: Gannon Kolding Date: Mon, 27 Jan 2025 02:39:02 -0700 Subject: [PATCH 262/989] ACPI: resource: IRQ override for Eluktronics MECH-17 The Eluktronics MECH-17 (GM7RG7N) needs IRQ overriding for the keyboard to work. Adding a DMI_MATCH entry for this laptop model makes the internal keyboard function normally. 
Signed-off-by: Gannon Kolding Link: https://patch.msgid.link/20250127093902.328361-1-gannon.kolding@gmail.com Signed-off-by: Rafael J. Wysocki --- drivers/acpi/resource.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index 90aaec923889c..b4cd14e7fa76c 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -563,6 +563,12 @@ static const struct dmi_system_id irq1_edge_low_force_override[] = { DMI_MATCH(DMI_BOARD_NAME, "RP-15"), }, }, + { + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Eluktronics Inc."), + DMI_MATCH(DMI_BOARD_NAME, "MECH-17"), + }, + }, { /* TongFang GM6XGxX/TUXEDO Stellaris 16 Gen5 AMD */ .matches = { -- GitLab From 7f5704b6a143b8eca640cba820968e798d065e91 Mon Sep 17 00:00:00 2001 From: Aubrey Li Date: Sun, 26 Jan 2025 10:22:50 +0800 Subject: [PATCH 263/989] ACPI: PRM: Remove unnecessary strict handler address checks Commit 088984c8d54c ("ACPI: PRM: Find EFI_MEMORY_RUNTIME block for PRM handler and context") added unnecessary strict handler address checks, causing the PRM module to fail in translating memory error addresses. Both static data buffer address and ACPI parameter buffer address may be NULL if they are not needed, as described in section 4.1.2 PRM Handler Information Structure of Platform Runtime Mechanism specification [1]. Here are two examples from real hardware: ----PRMT.dsl---- - staic data address is not used [10Ch 0268 2] Revision : 0000 [10Eh 0270 2] Length : 002C [110h 0272 16] Handler GUID : F6A58D47-E04F-4F5A-86B8-2A50D4AA109B [120h 0288 8] Handler address : 0000000065CE51F4 [128h 0296 8] Satic Data Address : 0000000000000000 [130h 0304 8] ACPI Parameter Address : 000000006522A718 - ACPI parameter address is not used [1B0h 0432 2] Revision : 0000 [1B2h 0434 2] Length : 002C [1B4h 0436 16] Handler GUID : 657E8AE6-A8FC-4877-BB28-42E7DE1899A5 [1C4h 0452 8] Handler address : 0000000065C567C8 [1CCh 0460 8] Satic Data Address : 000000006113FB98 [1D4h 0468 8] ACPI Parameter Address : 0000000000000000 Fixes: 088984c8d54c ("ACPI: PRM: Find EFI_MEMORY_RUNTIME block for PRM handler and context") Reported-and-tested-by: Shi Liu Cc: All applicable Signed-off-by: Aubrey Li Link: https://uefi.org/sites/default/files/resources/Platform%20Runtime%20Mechanism%20-%20with%20legal%20notice.pdf # [1] Reviewed-by: Koba Ko Acked-by: Ard Biesheuvel Link: https://patch.msgid.link/20250126022250.3014210-1-aubrey.li@linux.intel.com [ rjw: Minor changelog edits ] Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/prmt.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/acpi/prmt.c b/drivers/acpi/prmt.c index 747f83f7114d2..e549914a636c6 100644 --- a/drivers/acpi/prmt.c +++ b/drivers/acpi/prmt.c @@ -287,9 +287,7 @@ static acpi_status acpi_platformrt_space_handler(u32 function, if (!handler || !module) goto invalid_guid; - if (!handler->handler_addr || - !handler->static_data_buffer_addr || - !handler->acpi_param_buffer_addr) { + if (!handler->handler_addr) { buffer->prm_status = PRM_HANDLER_ERROR; return AE_OK; } -- GitLab From b1749432a52d3605151634b000fec0361ad45067 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Sat, 1 Feb 2025 12:40:38 -0500 Subject: [PATCH 264/989] rust: kbuild: use host dylib naming in rusttestlib-kernel There seems to have been merge skew between commit b2c261fa8629 ("rust: kbuild: expand rusttest target for macros") and commit 0730422bced5 ("rust: use host dylib naming convention to support macOS") ; the latter replaced `libmacros.so` with `$(libmacros_name)` and the former added an instance of `libmacros.so`. The former was not yet applied when the latter was sent, resulting in a stray `libmacros.so`. Replace the stray with `$(libmacros_name)` to allow `rusttest` to build on macOS. Fixes: 0730422bced5 ("rust: use host dylib naming convention to support macOS") Signed-off-by: Tamir Duberstein Link: https://lore.kernel.org/r/20250201-fix-mac-build-again-v1-1-ca665f5d7de7@gmail.com [ Slightly reworded title. - Miguel ] Signed-off-by: Miguel Ojeda --- rust/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/Makefile b/rust/Makefile index 8fcfd60447bc8..ff4343ca3f7c4 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -144,7 +144,7 @@ rusttestlib-kernel: private rustc_target_flags = --extern ffi \ --extern bindings --extern uapi rusttestlib-kernel: $(src)/kernel/lib.rs \ rusttestlib-bindings rusttestlib-uapi rusttestlib-build_error \ - $(obj)/libmacros.so $(obj)/bindings.o FORCE + $(obj)/$(libmacros_name) $(obj)/bindings.o FORCE +$(call if_changed,rustc_test_library) rusttestlib-bindings: private rustc_target_flags = --extern ffi -- GitLab From c21bdb3d8a850afdfa4afe77eea39ae9533629b0 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 21 Jan 2025 21:09:34 +0100 Subject: [PATCH 265/989] rust: init: use explicit ABI to clean warning in future compilers Starting with Rust 1.86.0 (currently in nightly, to be released on 2025-04-03), the `missing_abi` lint is warn-by-default [1]: error: extern declarations without an explicit ABI are deprecated --> rust/doctests_kernel_generated.rs:3158:1 | 3158 | extern { | ^^^^^^ help: explicitly specify the C ABI: `extern "C"` | = note: `-D missing-abi` implied by `-D warnings` = help: to override `-D warnings` add `#[allow(missing_abi)]` Thus clean it up. Cc: # Needed in 6.12.y and 6.13.y only (Rust is pinned in older LTSs). Fixes: 7f8977a7fe6d ("rust: init: add `{pin_}chain` functions to `{Pin}Init`") Link: https://github.com/rust-lang/rust/pull/132397 [1] Reviewed-by: Gary Guo Reviewed-by: Alice Ryhl Reviewed-by: Fiona Behrens Link: https://lore.kernel.org/r/20250121200934.222075-1-ojeda@kernel.org [ Added 6.13.y to Cc: stable tag. 
- Miguel ] Signed-off-by: Miguel Ojeda --- rust/kernel/init.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/init.rs b/rust/kernel/init.rs index 3f9236c1c9d56..7fd1ea8265a55 100644 --- a/rust/kernel/init.rs +++ b/rust/kernel/init.rs @@ -870,7 +870,7 @@ pub unsafe trait PinInit: Sized { /// use kernel::{types::Opaque, init::pin_init_from_closure}; /// #[repr(C)] /// struct RawFoo([u8; 16]); - /// extern { + /// extern "C" { /// fn init_foo(_: *mut RawFoo); /// } /// -- GitLab From 5368a67307b3b2c347dc8965ac55b888be665934 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 4 Feb 2025 23:19:53 +0100 Subject: [PATCH 266/989] selftests: mptcp: connect: -f: no reconnect The '-f' parameter is there to force the kernel to emit MPTCP FASTCLOSE by closing the connection with unread bytes in the receive queue. The xdisconnect() helper was used to stop the connection, but it does more than that: it will shut it down, then wait before reconnecting to the same address. This causes the mptcp_join's "fastclose test" to fail all the time. This failure is due to a recent change, with commit 218cc166321f ("selftests: mptcp: avoid spurious errors on disconnect"), but that went unnoticed because the test is currently ignored. The recent modification only shown an existing issue: xdisconnect() doesn't need to be used here, only the shutdown() part is needed. Fixes: 6bf41020b72b ("selftests: mptcp: update and extend fastclose test-cases") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250204-net-mptcp-sft-conn-f-v1-1-6b470c72fffa@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_connect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index 414addef9a451..d240d02fa443a 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -1302,7 +1302,7 @@ int main_loop(void) return ret; if (cfg_truncate > 0) { - xdisconnect(fd); + shutdown(fd, SHUT_WR); } else if (--cfg_repeat > 0) { xdisconnect(fd); -- GitLab From 647cef20e649c576dff271e018d5d15d998b629d Mon Sep 17 00:00:00 2001 From: Quang Le Date: Mon, 3 Feb 2025 16:58:38 -0800 Subject: [PATCH 267/989] pfifo_tail_enqueue: Drop new packet when sch->limit == 0 Expected behaviour: In case we reach scheduler's limit, pfifo_tail_enqueue() will drop a packet in scheduler's queue and decrease scheduler's qlen by one. Then, pfifo_tail_enqueue() enqueue new packet and increase scheduler's qlen by one. Finally, pfifo_tail_enqueue() return `NET_XMIT_CN` status code. Weird behaviour: In case we set `sch->limit == 0` and trigger pfifo_tail_enqueue() on a scheduler that has no packet, the 'drop a packet' step will do nothing. This means the scheduler's qlen still has value equal 0. Then, we continue to enqueue new packet and increase scheduler's qlen by one. In summary, we can leverage pfifo_tail_enqueue() to increase qlen by one and return `NET_XMIT_CN` status code. The problem is: Let's say we have two qdiscs: Qdisc_A and Qdisc_B. - Qdisc_A's type must have '->graft()' function to create parent/child relationship. Let's say Qdisc_A's type is `hfsc`. Enqueue packet to this qdisc will trigger `hfsc_enqueue`. - Qdisc_B's type is pfifo_head_drop. Enqueue packet to this qdisc will trigger `pfifo_tail_enqueue`. 
- Qdisc_B is configured to have `sch->limit == 0`. - Qdisc_A is configured to route the enqueued's packet to Qdisc_B. Enqueue packet through Qdisc_A will lead to: - hfsc_enqueue(Qdisc_A) -> pfifo_tail_enqueue(Qdisc_B) - Qdisc_B->q.qlen += 1 - pfifo_tail_enqueue() return `NET_XMIT_CN` - hfsc_enqueue() check for `NET_XMIT_SUCCESS` and see `NET_XMIT_CN` => hfsc_enqueue() don't increase qlen of Qdisc_A. The whole process lead to a situation where Qdisc_A->q.qlen == 0 and Qdisc_B->q.qlen == 1. Replace 'hfsc' with other type (for example: 'drr') still lead to the same problem. This violate the design where parent's qlen should equal to the sum of its childrens'qlen. Bug impact: This issue can be used for user->kernel privilege escalation when it is reachable. Fixes: 57dbb2d83d10 ("sched: add head drop fifo queue") Reported-by: Quang Le Signed-off-by: Quang Le Signed-off-by: Cong Wang Link: https://patch.msgid.link/20250204005841.223511-2-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/sch_fifo.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index b50b2c2cc09bc..e6bfd39ff3396 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -40,6 +40,9 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch, { unsigned int prev_backlog; + if (unlikely(READ_ONCE(sch->limit) == 0)) + return qdisc_drop(skb, sch, to_free); + if (likely(sch->q.qlen < READ_ONCE(sch->limit))) return qdisc_enqueue_tail(skb, sch); -- GitLab From 3fe5648d1df1798ce14b5464b2ea49f10cd9db31 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Mon, 3 Feb 2025 16:58:39 -0800 Subject: [PATCH 268/989] selftests/tc-testing: Add a test case for pfifo_head_drop qdisc when limit==0 When limit == 0, pfifo_tail_enqueue() must drop new packet and increase dropped packets count of the qdisc. 
All test results: 1..16 ok 1 a519 - Add bfifo qdisc with system default parameters on egress ok 2 585c - Add pfifo qdisc with system default parameters on egress ok 3 a86e - Add bfifo qdisc with system default parameters on egress with handle of maximum value ok 4 9ac8 - Add bfifo qdisc on egress with queue size of 3000 bytes ok 5 f4e6 - Add pfifo qdisc on egress with queue size of 3000 packets ok 6 b1b1 - Add bfifo qdisc with system default parameters on egress with invalid handle exceeding maximum value ok 7 8d5e - Add bfifo qdisc on egress with unsupported argument ok 8 7787 - Add pfifo qdisc on egress with unsupported argument ok 9 c4b6 - Replace bfifo qdisc on egress with new queue size ok 10 3df6 - Replace pfifo qdisc on egress with new queue size ok 11 7a67 - Add bfifo qdisc on egress with queue size in invalid format ok 12 1298 - Add duplicate bfifo qdisc on egress ok 13 45a0 - Delete nonexistent bfifo qdisc ok 14 972b - Add prio qdisc on egress with invalid format for handles ok 15 4d39 - Delete bfifo qdisc twice ok 16 d774 - Check pfifo_head_drop qdisc enqueue behaviour when limit == 0 Signed-off-by: Quang Le Signed-off-by: Cong Wang Link: https://patch.msgid.link/20250204005841.223511-3-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- .../tc-testing/tc-tests/qdiscs/fifo.json | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fifo.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fifo.json index ae3d286a32b2e..6f20d033670d4 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fifo.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fifo.json @@ -313,6 +313,29 @@ "matchPattern": "qdisc bfifo 1: root", "matchCount": "0", "teardown": [ + ] + }, + { + "id": "d774", + "name": "Check pfifo_head_drop qdisc enqueue behaviour when limit == 0", + "category": [ + "qdisc", + "pfifo_head_drop" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: pfifo_head_drop limit 0", + "$IP link set dev $DUMMY up || true" + ], + "cmdUnderTest": "ping -c2 -W0.01 -I $DUMMY 10.10.10.1", + "expExitCode": "1", + "verifyCmd": "$TC -s qdisc show dev $DUMMY", + "matchPattern": "dropped 2", + "matchCount": "1", + "teardown": [ ] } ] -- GitLab From 638ba5089324796c2ee49af10427459c2de35f71 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 3 Feb 2025 16:58:40 -0800 Subject: [PATCH 269/989] netem: Update sch->q.qlen before qdisc_tree_reduce_backlog() qdisc_tree_reduce_backlog() notifies parent qdisc only if child qdisc becomes empty, therefore we need to reduce the backlog of the child qdisc before calling it. Otherwise it would miss the opportunity to call cops->qlen_notify(), in the case of DRR, it resulted in UAF since DRR uses ->qlen_notify() to maintain its active list. 
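[ Simplified sketch of the ordering the patch enforces; it mirrors the new
  order in the diff below, with comments added:

	sch->qstats.backlog -= pkt_len;
	sch->q.qlen--;					/* child may now be empty */
	qdisc_tree_reduce_backlog(sch, 1, pkt_len);	/* notifies the parent only
							 * if the child is empty */

  With the old order the child still looked non-empty when the helper ran, so
  the parent's qlen_notify() was skipped and DRR kept a stale class on its
  active list. ]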
Fixes: f8d4bc455047 ("net/sched: netem: account for backlog updates from child qdisc") Cc: Martin Ottens Reported-by: Mingi Cho Signed-off-by: Cong Wang Link: https://patch.msgid.link/20250204005841.223511-4-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/sch_netem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 71ec9986ed37f..fdd79d3ccd8ce 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -749,9 +749,9 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) if (err != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(err)) qdisc_qstats_drop(sch); - qdisc_tree_reduce_backlog(sch, 1, pkt_len); sch->qstats.backlog -= pkt_len; sch->q.qlen--; + qdisc_tree_reduce_backlog(sch, 1, pkt_len); } goto tfifo_dequeue; } -- GitLab From 91aadc16ee73cf958be6b0896da3caea49b7f414 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 3 Feb 2025 16:58:41 -0800 Subject: [PATCH 270/989] selftests/tc-testing: Add a test case for qdisc_tree_reduce_backlog() Integrate the test case provided by Mingi Cho into TDC. All test results: 1..4 ok 1 ca5e - Check class delete notification for ffff: ok 2 e4b7 - Check class delete notification for root ffff: ok 3 33a9 - Check ingress is not searchable on backlog update ok 4 a4b9 - Test class qlen notification Cc: Mingi Cho Signed-off-by: Cong Wang Link: https://patch.msgid.link/20250204005841.223511-5-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- .../tc-testing/tc-tests/infra/qdiscs.json | 34 ++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index d3dd65b05b5f1..9044ac0541672 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -94,5 +94,37 @@ "$TC qdisc del dev $DUMMY ingress", "$IP addr del 10.10.10.10/24 dev $DUMMY" ] - } + }, + { + "id": "a4b9", + "name": "Test class qlen notification", + "category": [ + "qdisc" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: drr", + "$TC filter add dev $DUMMY parent 1: basic classid 1:1", + "$TC class add dev $DUMMY parent 1: classid 1:1 drr", + "$TC qdisc add dev $DUMMY parent 1:1 handle 2: netem", + "$TC qdisc add dev $DUMMY parent 2: handle 3: drr", + "$TC filter add dev $DUMMY parent 3: basic action drop", + "$TC class add dev $DUMMY parent 3: classid 3:1 drr", + "$TC class del dev $DUMMY classid 1:1", + "$TC class add dev $DUMMY parent 1: classid 1:1 drr" + ], + "cmdUnderTest": "ping -c1 -W0.01 -I $DUMMY 10.10.10.1", + "expExitCode": "1", + "verifyCmd": "$TC qdisc ls dev $DUMMY", + "matchPattern": "drr 1: root", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY root handle 1: drr", + "$IP addr del 10.10.10.10/24 dev $DUMMY" + ] + } ] -- GitLab From a70c7b3cbc0688016810bb2e0b9b8a0d6a530045 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 4 Feb 2025 11:10:06 -0500 Subject: [PATCH 271/989] tun: revert fix group permission check This reverts commit 3ca459eaba1bf96a8c7878de84fa8872259a01e3. The blamed commit caused a regression when neither tun->owner nor tun->group is set. This is intended to be allowed, but now requires CAP_NET_ADMIN. 
Discussion in the referenced thread pointed out that the original issue that prompted this patch can be resolved in userspace. The relaxed access control may also make a device accessible when it previously wasn't, while existing users may depend on it to not be. This is a clean pure git revert, except for fixing the indentation on the gid_valid line that checkpatch correctly flagged. Fixes: 3ca459eaba1b ("tun: fix group permission check") Link: https://lore.kernel.org/netdev/CAFqZXNtkCBT4f+PwyVRmQGoT3p1eVa01fCG_aNtpt6dakXncUg@mail.gmail.com/ Signed-off-by: Willem de Bruijn Cc: Ondrej Mosnacek Cc: Stas Sergeev Link: https://patch.msgid.link/20250204161015.739430-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 28624cca91f8d..acf96f2624887 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -574,18 +574,14 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, return ret; } -static inline bool tun_capable(struct tun_struct *tun) +static inline bool tun_not_capable(struct tun_struct *tun) { const struct cred *cred = current_cred(); struct net *net = dev_net(tun->dev); - if (ns_capable(net->user_ns, CAP_NET_ADMIN)) - return 1; - if (uid_valid(tun->owner) && uid_eq(cred->euid, tun->owner)) - return 1; - if (gid_valid(tun->group) && in_egroup_p(tun->group)) - return 1; - return 0; + return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) || + (gid_valid(tun->group) && !in_egroup_p(tun->group))) && + !ns_capable(net->user_ns, CAP_NET_ADMIN); } static void tun_set_real_num_queues(struct tun_struct *tun) @@ -2782,7 +2778,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) !!(tun->flags & IFF_MULTI_QUEUE)) return -EINVAL; - if (!tun_capable(tun)) + if (tun_not_capable(tun)) return -EPERM; err = security_tun_dev_open(tun->security); if (err < 0) -- GitLab From 811b8f534fd85e17077bd2ac0413bcd16cc8fb9b Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 4 Feb 2025 14:38:39 +0200 Subject: [PATCH 272/989] net: sched: Fix truncation of offloaded action statistics In case of tc offload, when user space queries the kernel for tc action statistics, tc will query the offloaded statistics from device drivers. Among other statistics, drivers are expected to pass the number of packets that hit the action since the last query as a 64-bit number. Unfortunately, tc treats the number of packets as a 32-bit number, leading to truncation and incorrect statistics when the number of packets since the last query exceeds 0xffffffff: $ tc -s filter show dev swp2 ingress filter protocol all pref 1 flower chain 0 filter protocol all pref 1 flower chain 0 handle 0x1 skip_sw in_hw in_hw_count 1 action order 1: mirred (Egress Redirect to device swp1) stolen index 1 ref 1 bind 1 installed 58 sec used 0 sec Action statistics: Sent 1133877034176 bytes 536959475 pkt (dropped 0, overlimits 0 requeues 0) [...] According to the above, 2111-byte packets were redirected which is impossible as only 64-byte packets were transmitted and the MTU was 1500. 
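[ To make the truncation concrete, a stand-alone snippet (not driver code)
  showing what assigning a 64-bit packet count to a 32-bit variable does:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t pkts = 21416005951ULL;		/* packets since last query */
		uint32_t truncated = (uint32_t)pkts;	/* high 32 bits are lost */

		printf("%u\n", (unsigned int)truncated);	/* prints 4236136767 */
		return 0;
	}

  Any count above 0xffffffff wraps, which is how the inflated per-packet sizes
  in the output above come about. ]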
Fix by treating packets as a 64-bit number: $ tc -s filter show dev swp2 ingress filter protocol all pref 1 flower chain 0 filter protocol all pref 1 flower chain 0 handle 0x1 skip_sw in_hw in_hw_count 1 action order 1: mirred (Egress Redirect to device swp1) stolen index 1 ref 1 bind 1 installed 61 sec used 0 sec Action statistics: Sent 1370624380864 bytes 21416005951 pkt (dropped 0, overlimits 0 requeues 0) [...] Which shows that only 64-byte packets were redirected (1370624380864 / 21416005951 = 64). Fixes: 380407023526 ("net/sched: Enable netdev drivers to update statistics of offloaded actions") Reported-by: Joe Botha Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250204123839.1151804-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index d635c5b47ebaf..d48c657191cd0 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -851,7 +851,7 @@ static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, } static inline void _bstats_update(struct gnet_stats_basic_sync *bstats, - __u64 bytes, __u32 packets) + __u64 bytes, __u64 packets) { u64_stats_update_begin(&bstats->syncp); u64_stats_add(&bstats->bytes, bytes); -- GitLab From 41b996ce83bf944de5569d6263c8dbd5513e7ed0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 4 Feb 2025 23:05:53 +0000 Subject: [PATCH 273/989] rxrpc: Fix call state set to not include the SERVER_SECURING state The RXRPC_CALL_SERVER_SECURING state doesn't really belong with the other states in the call's state set as the other states govern the call's Rx/Tx phase transition and govern when packets can and can't be received or transmitted. The "Securing" state doesn't actually govern the reception of packets and would need to be split depending on whether or not we've received the last packet yet (to mirror RECV_REQUEST/ACK_REQUEST). The "Securing" state is more about whether or not we can start forwarding packets to the application as recvmsg will need to decode them and the decoding can't take place until the challenge/response exchange has completed. Fix this by removing the RXRPC_CALL_SERVER_SECURING state from the state set and, instead, using a flag, RXRPC_CALL_CONN_CHALLENGING, to track whether or not we can queue the call for reception by recvmsg() or notify the kernel app that data is ready. In the event that we've already received all the packets, the connection event handler will poke the app layer in the appropriate manner. Also there's a race whereby the app layer sees the last packet before rxrpc has managed to end the rx phase and change the state to one amenable to allowing a reply. Fix this by queuing the packet after calling rxrpc_end_rx_phase(). 
Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both") Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250204230558.712536-2-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/ar-internal.h | 2 +- net/rxrpc/call_object.c | 6 ++---- net/rxrpc/conn_event.c | 4 +--- net/rxrpc/input.c | 2 +- net/rxrpc/sendmsg.c | 2 +- 5 files changed, 6 insertions(+), 10 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 718193df9d2e2..f251845fe532c 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -582,6 +582,7 @@ enum rxrpc_call_flag { RXRPC_CALL_EXCLUSIVE, /* The call uses a once-only connection */ RXRPC_CALL_RX_IS_IDLE, /* recvmsg() is idle - send an ACK */ RXRPC_CALL_RECVMSG_READ_ALL, /* recvmsg() read all of the received data */ + RXRPC_CALL_CONN_CHALLENGING, /* The connection is being challenged */ }; /* @@ -602,7 +603,6 @@ enum rxrpc_call_state { RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ RXRPC_CALL_SERVER_PREALLOC, /* - service preallocation */ - RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */ RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */ RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */ RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */ diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 5a543c3f6fb08..c4c8b46a68c67 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -22,7 +22,6 @@ const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", [RXRPC_CALL_SERVER_PREALLOC] = "SvPrealc", - [RXRPC_CALL_SERVER_SECURING] = "SvSecure", [RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq", [RXRPC_CALL_SERVER_ACK_REQUEST] = "SvAckReq", [RXRPC_CALL_SERVER_SEND_REPLY] = "SvSndRpl", @@ -453,17 +452,16 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, call->cong_tstamp = skb->tstamp; __set_bit(RXRPC_CALL_EXPOSED, &call->flags); - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SECURING); + rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); spin_lock(&conn->state_lock); switch (conn->state) { case RXRPC_CONN_SERVICE_UNSECURED: case RXRPC_CONN_SERVICE_CHALLENGING: - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SECURING); + __set_bit(RXRPC_CALL_CONN_CHALLENGING, &call->flags); break; case RXRPC_CONN_SERVICE: - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); break; case RXRPC_CONN_ABORTED: diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 74bb49b936cd4..4d9c5e21ba785 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -228,10 +228,8 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn) */ static void rxrpc_call_is_secure(struct rxrpc_call *call) { - if (call && __rxrpc_call_state(call) == RXRPC_CALL_SERVER_SECURING) { - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); + if (call && __test_and_clear_bit(RXRPC_CALL_CONN_CHALLENGING, &call->flags)) rxrpc_notify_socket(call); - } } /* diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 4974b5accafa3..4a152f3c831fd 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -657,7 +657,7 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb rxrpc_propose_delay_ACK(call, sp->hdr.serial, 
rxrpc_propose_ack_input_data); } - if (notify) { + if (notify && !test_bit(RXRPC_CALL_CONN_CHALLENGING, &call->flags)) { trace_rxrpc_notify_socket(call->debug_id, sp->hdr.serial); rxrpc_notify_socket(call); } diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 0e8da909d4f2f..584397aba4a07 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -707,7 +707,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) } else { switch (rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_AWAIT_CONN: - case RXRPC_CALL_SERVER_SECURING: + case RXRPC_CALL_SERVER_RECV_REQUEST: if (p.command == RXRPC_CMD_SEND_ABORT) break; fallthrough; -- GitLab From 2d7b30aef34dae942e9ab7812b288ce14658ae66 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 4 Feb 2025 23:05:54 +0000 Subject: [PATCH 274/989] rxrpc: Fix race in call state changing vs recvmsg() There's a race in between the rxrpc I/O thread recording the end of the receive phase of a call and recvmsg() examining the state of the call to determine whether it has completed. The problem is that call->_state records the I/O thread's view of the call, not the application's view (which may lag), so that alone is not sufficient. To this end, the application also checks whether there is anything left in call->recvmsg_queue for it to pick up. The call must be in state RXRPC_CALL_COMPLETE and the recvmsg_queue empty for the call to be considered fully complete. In rxrpc_input_queue_data(), the latest skbuff is added to the queue and then, if it was marked as LAST_PACKET, the state is advanced... But this is two separate operations with no locking around them. As a consequence, the lack of locking means that sendmsg() can jump into the gap on a service call and attempt to send the reply - but then get rejected because the I/O thread hasn't advanced the state yet. Simply flipping the order in which things are done isn't an option as that impacts the client side, causing the checks in rxrpc_kernel_check_life() as to whether the call is still alive to race instead. Fix this by moving the update of call->_state inside the skb queue spinlocked section where the packet is queued on the I/O thread side. rxrpc's recvmsg() will then automatically sync against this because it has to take the call->recvmsg_queue spinlock in order to dequeue the last packet. rxrpc's sendmsg() doesn't need amending as the app shouldn't be calling it to send a reply until recvmsg() indicates it has returned all of the request. Fixes: 93368b6bd58a ("rxrpc: Move call state changes from recvmsg to I/O thread") Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250204230558.712536-3-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/input.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 4a152f3c831fd..9047ba13bd31e 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -448,11 +448,19 @@ static void rxrpc_input_queue_data(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); bool last = sp->hdr.flags & RXRPC_LAST_PACKET; - skb_queue_tail(&call->recvmsg_queue, skb); + spin_lock_irq(&call->recvmsg_queue.lock); + + __skb_queue_tail(&call->recvmsg_queue, skb); rxrpc_input_update_ack_window(call, window, wtop); trace_rxrpc_receive(call, last ? 
why + 1 : why, sp->hdr.serial, sp->hdr.seq); if (last) + /* Change the state inside the lock so that recvmsg syncs + * correctly with it and using sendmsg() to send a reply + * doesn't race. + */ rxrpc_end_rx_phase(call, sp->hdr.serial); + + spin_unlock_irq(&call->recvmsg_queue.lock); } /* -- GitLab From 77c2e45dbf9d2ced21d2cf6cc3b2a048d57ab7ad Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Wed, 5 Feb 2025 13:03:33 -0300 Subject: [PATCH 275/989] smb: client: don't trust DFSREF_STORAGE_SERVER bit Some servers don't respect the DFSREF_STORAGE_SERVER bit, so unconditionally tree connect to DFS link target and then decide whether or not continue chasing DFS referrals for DFS interlinks. Otherwise the client would fail to mount such shares. Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/dfs.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c index dad521336b5ee..f65a8a90ba279 100644 --- a/fs/smb/client/dfs.c +++ b/fs/smb/client/dfs.c @@ -150,25 +150,27 @@ static int __dfs_referral_walk(struct dfs_ref_walk *rw) if (rc) continue; - if (tgt.flags & DFSREF_STORAGE_SERVER) { - rc = cifs_mount_get_tcon(mnt_ctx); - if (!rc) - rc = cifs_is_path_remote(mnt_ctx); + rc = cifs_mount_get_tcon(mnt_ctx); + if (rc) { + if (tgt.server_type == DFS_TYPE_LINK && + DFS_INTERLINK(tgt.flags)) + rc = -EREMOTE; + } else { + rc = cifs_is_path_remote(mnt_ctx); if (!rc) { ref_walk_set_tgt_hint(rw); break; } - if (rc != -EREMOTE) - continue; } - - rc = ref_walk_advance(rw); - if (!rc) { - rc = setup_dfs_ref(&tgt, rw); - if (rc) - break; - ref_walk_mark_end(rw); - goto again; + if (rc == -EREMOTE) { + rc = ref_walk_advance(rw); + if (!rc) { + rc = setup_dfs_ref(&tgt, rw); + if (rc) + break; + ref_walk_mark_end(rw); + goto again; + } } } } while (rc && ref_walk_descend(rw)); -- GitLab From 773dc23ff81838b6f74d7fabba5a441cc6a93982 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Wed, 5 Feb 2025 13:22:11 -0300 Subject: [PATCH 276/989] smb: client: fix noisy when tree connecting to DFS interlink targets When the client attempts to tree connect to a domain-based DFS namespace from a DFS interlink target, the server will return STATUS_BAD_NETWORK_NAME and the following will appear on dmesg: CIFS: VFS: BAD_NETWORK_NAME: \\dom\dfs Since a DFS share might contain several DFS interlinks and they expire after 10 minutes, the above message might end up being flooded on dmesg when mounting or accessing them. Print this only once per share. 
Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 40ad9e79437a4..78395195e0165 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -2169,7 +2169,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, tcon_error_exit: if (rsp && rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) - cifs_tcon_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); + cifs_dbg(VFS | ONCE, "BAD_NETWORK_NAME: %s\n", tree); goto tcon_exit; } -- GitLab From be1963dd4ce4e467f062b023d1e696f40c926a04 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Wed, 5 Feb 2025 13:41:32 -0300 Subject: [PATCH 277/989] smb: client: get rid of kstrdup() in get_ses_refpath() After commit 36008fe6e3dc ("smb: client: don't try following DFS links in cifs_tree_connect()"), TCP_Server_Info::leaf_fullpath will no longer be changed, so there is no need to kstrdup() it. Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/dfs.h | 7 +++++++ fs/smb/client/dfs_cache.c | 27 +++++---------------------- 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/fs/smb/client/dfs.h b/fs/smb/client/dfs.h index ed4cd7cf1ec64..e60f0a24a8a1d 100644 --- a/fs/smb/client/dfs.h +++ b/fs/smb/client/dfs.h @@ -188,4 +188,11 @@ static inline void dfs_put_root_smb_sessions(struct list_head *head) } } +static inline const char *dfs_ses_refpath(struct cifs_ses *ses) +{ + const char *path = ses->server->leaf_fullpath; + + return path ? path + 1 : ERR_PTR(-ENOENT); +} + #endif /* _CIFS_DFS_H */ diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c index 5022bb1f122a1..4dada26d56b5f 100644 --- a/fs/smb/client/dfs_cache.c +++ b/fs/smb/client/dfs_cache.c @@ -1136,33 +1136,19 @@ static bool is_ses_good(struct cifs_ses *ses) return ret; } -static char *get_ses_refpath(struct cifs_ses *ses) -{ - struct TCP_Server_Info *server = ses->server; - char *path = ERR_PTR(-ENOENT); - - if (server->leaf_fullpath) { - path = kstrdup(server->leaf_fullpath + 1, GFP_KERNEL); - if (!path) - path = ERR_PTR(-ENOMEM); - } - return path; -} - /* Refresh dfs referral of @ses */ static void refresh_ses_referral(struct cifs_ses *ses) { struct cache_entry *ce; unsigned int xid; - char *path; + const char *path; int rc = 0; xid = get_xid(); - path = get_ses_refpath(ses); + path = dfs_ses_refpath(ses); if (IS_ERR(path)) { rc = PTR_ERR(path); - path = NULL; goto out; } @@ -1181,7 +1167,6 @@ static void refresh_ses_referral(struct cifs_ses *ses) out: free_xid(xid); - kfree(path); } static int __refresh_tcon_referral(struct cifs_tcon *tcon, @@ -1231,19 +1216,18 @@ static void refresh_tcon_referral(struct cifs_tcon *tcon, bool force_refresh) struct dfs_info3_param *refs = NULL; struct cache_entry *ce; struct cifs_ses *ses; - unsigned int xid; bool needs_refresh; - char *path; + const char *path; + unsigned int xid; int numrefs = 0; int rc = 0; xid = get_xid(); ses = tcon->ses; - path = get_ses_refpath(ses); + path = dfs_ses_refpath(ses); if (IS_ERR(path)) { rc = PTR_ERR(path); - path = NULL; goto out; } @@ -1271,7 +1255,6 @@ static void refresh_tcon_referral(struct cifs_tcon *tcon, bool force_refresh) out: free_xid(xid); - kfree(path); free_dfs_info_array(refs, numrefs); } -- GitLab From c7691aec5e991cec9c5c5fdab08c24856a1fc56f Mon Sep 17 00:00:00 2001 From: Vaishnav Achath Date: Wed, 5 Feb 2025 11:52:29 +0530 Subject: [PATCH 278/989] arm64: defconfig: Enable 
TISCI Interrupt Router and Aggregator Enable TISCI Interrupt Router and Interrupt Aggregator drivers. These IPs are found in all TI K3 SoCs like J721E, AM62X and is required for core functionality like DMA, GPIO Interrupts which is necessary during boot, thus make them built-in. bloat-o-meter summary on vmlinux: add/remove: 460/1 grow/shrink: 4/0 up/down: 162483/-8 (162475) ... Total: Before=31615984, After=31778459, chg +0.51% These configs were previously selected for ARCH_K3 in respective Kconfigs till commit b8b26ae398c4 ("irqchip/ti-sci-inta : Add module build support") and commit 2d95ffaecbc2 ("irqchip/ti-sci-intr: Add module build support") dropped them and few driver configs (TI_K3_UDMA, TI_K3_RINGACC) dependent on these also got disabled due to this. While re-enabling the TI_SCI_INT_*_IRQCHIP configs, these configs with missing dependencies (which are already part of arm64 defconfig) also get re-enabled which explains the slightly larger size increase from the bloat-o-meter summary. Fixes: 2d95ffaecbc2 ("irqchip/ti-sci-intr: Add module build support") Fixes: b8b26ae398c4 ("irqchip/ti-sci-inta : Add module build support") Signed-off-by: Vaishnav Achath Tested-by: Dhruva Gole Reviewed-by: Dhruva Gole Link: https://lore.kernel.org/r/20250205062229.3869081-1-vaishnav.a@ti.com Signed-off-by: Nishanth Menon --- arch/arm64/configs/defconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index cb7da44155999..1f25423de3833 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -1551,6 +1551,8 @@ CONFIG_PWM_VISCONTI=m CONFIG_SL28CPLD_INTC=y CONFIG_QCOM_PDC=y CONFIG_QCOM_MPM=y +CONFIG_TI_SCI_INTR_IRQCHIP=y +CONFIG_TI_SCI_INTA_IRQCHIP=y CONFIG_RESET_GPIO=m CONFIG_RESET_IMX7=y CONFIG_RESET_QCOM_AOSS=y -- GitLab From cc668a11e6ac8adb0e016711080d3f314722cc91 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 3 Feb 2025 14:50:59 +0200 Subject: [PATCH 279/989] RDMA/mlx5: Fix a race for DMABUF MR which can lead to CQE with error This patch addresses a potential race condition for a DMABUF MR that can result in a CQE with an error on the UMR QP. During the __mlx5_ib_dereg_mr() flow, the following sequence of calls occurs: mlx5_revoke_mr() mlx5r_umr_revoke_mr() mlx5r_umr_post_send_wait() At this point, the lkey is freed from the hardware's perspective. However, concurrently, mlx5_ib_dmabuf_invalidate_cb() might be triggered by another task attempting to invalidate the MR having that freed lkey. Since the lkey has already been freed, this can lead to a CQE error, causing the UMR QP to enter an error state. To resolve this race condition, the dma_resv_lock() which was hold as part of the mlx5_ib_dmabuf_invalidate_cb() is now also acquired as part of the mlx5_revoke_mr() scope. Upon a successful revoke, we set umem_dmabuf->private which points to that MR to NULL, preventing any further invalidation attempts on its lkey. 
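[ Hedged sketch of the locking scheme described above; names are simplified
  and revoke_mr() stands in for the actual revoke path:

	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);

	ret = revoke_mr(mr);			/* lkey freed in HW on success */
	if (!ret)
		umem_dmabuf->private = NULL;	/* invalidate_cb skips this MR */

	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);

  Because mlx5_ib_dmabuf_invalidate_cb() runs with the same dma_resv lock held,
  it can no longer race with the revoke and post a UMR for a freed lkey. ]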
Fixes: e6fb246ccafb ("RDMA/mlx5: Consolidate MR destruction to mlx5_ib_dereg_mr()") Signed-off-by: Yishai Hadas Reviewed-by: Artemy Kovalyov Link: https://patch.msgid.link/70617067abbfaa0c816a2544c922e7f4346def58.1738587016.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mr.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index bb02b6adbf2c2..0a3cbb14e1839 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1550,7 +1550,7 @@ static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach) dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); - if (!umem_dmabuf->sgt) + if (!umem_dmabuf->sgt || !mr) return; mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP); @@ -2022,11 +2022,16 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); struct mlx5_cache_ent *ent = mr->mmkey.cache_ent; bool is_odp = is_odp_mr(mr); + bool is_odp_dma_buf = is_dmabuf_mr(mr) && + !to_ib_umem_dmabuf(mr->umem)->pinned; int ret = 0; if (is_odp) mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex); + if (is_odp_dma_buf) + dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv, NULL); + if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) { ent = mr->mmkey.cache_ent; /* upon storing to a clean temp entry - schedule its cleanup */ @@ -2054,6 +2059,12 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex); } + if (is_odp_dma_buf) { + if (!ret) + to_ib_umem_dmabuf(mr->umem)->private = NULL; + dma_resv_unlock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv); + } + return ret; } -- GitLab From abc7b3f1f056d69a8f11d6dceecc0c9549ace770 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 3 Feb 2025 14:51:43 +0200 Subject: [PATCH 280/989] RDMA/mlx5: Fix a WARN during dereg_mr for DM type Memory regions (MR) of type DM (device memory) do not have an associated umem. In the __mlx5_ib_dereg_mr() -> mlx5_free_priv_descs() flow, the code incorrectly takes the wrong branch, attempting to call dma_unmap_single() on a DMA address that is not mapped. This results in a WARN [1], as shown below. The issue is resolved by properly accounting for the DM type and ensuring the correct branch is selected in mlx5_free_priv_descs(). 
[1] WARNING: CPU: 12 PID: 1346 at drivers/iommu/dma-iommu.c:1230 iommu_dma_unmap_page+0x79/0x90 Modules linked in: ip6table_mangle ip6table_nat ip6table_filter ip6_tables iptable_mangle xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry ovelay rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_ib ib_uverbs ib_core fuse mlx5_core CPU: 12 UID: 0 PID: 1346 Comm: ibv_rc_pingpong Not tainted 6.12.0-rc7+ #1631 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:iommu_dma_unmap_page+0x79/0x90 Code: 2b 49 3b 29 72 26 49 3b 69 08 73 20 4d 89 f0 44 89 e9 4c 89 e2 48 89 ee 48 89 df 5b 5d 41 5c 41 5d 41 5e 41 5f e9 07 b8 88 ff <0f> 0b 5b 5d 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc 66 0f 1f 44 00 RSP: 0018:ffffc90001913a10 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff88810194b0a8 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000001 RBP: ffff88810194b0a8 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f537abdd740(0000) GS:ffff88885fb00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f537aeb8000 CR3: 000000010c248001 CR4: 0000000000372eb0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? __warn+0x84/0x190 ? iommu_dma_unmap_page+0x79/0x90 ? report_bug+0xf8/0x1c0 ? handle_bug+0x55/0x90 ? exc_invalid_op+0x13/0x60 ? asm_exc_invalid_op+0x16/0x20 ? iommu_dma_unmap_page+0x79/0x90 dma_unmap_page_attrs+0xe6/0x290 mlx5_free_priv_descs+0xb0/0xe0 [mlx5_ib] __mlx5_ib_dereg_mr+0x37e/0x520 [mlx5_ib] ? _raw_spin_unlock_irq+0x24/0x40 ? wait_for_completion+0xfe/0x130 ? rdma_restrack_put+0x63/0xe0 [ib_core] ib_dereg_mr_user+0x5f/0x120 [ib_core] ? lock_release+0xc6/0x280 destroy_hw_idr_uobject+0x1d/0x60 [ib_uverbs] uverbs_destroy_uobject+0x58/0x1d0 [ib_uverbs] uobj_destroy+0x3f/0x70 [ib_uverbs] ib_uverbs_cmd_verbs+0x3e4/0xbb0 [ib_uverbs] ? __pfx_uverbs_destroy_def_handler+0x10/0x10 [ib_uverbs] ? lock_acquire+0xc1/0x2f0 ? ib_uverbs_ioctl+0xcb/0x170 [ib_uverbs] ? ib_uverbs_ioctl+0x116/0x170 [ib_uverbs] ? lock_release+0xc6/0x280 ib_uverbs_ioctl+0xe7/0x170 [ib_uverbs] ? 
ib_uverbs_ioctl+0xcb/0x170 [ib_uverbs] __x64_sys_ioctl+0x1b0/0xa70 do_syscall_64+0x6b/0x140 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f537adaf17b Code: 0f 1e fa 48 8b 05 1d ad 0c 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ed ac 0c 00 f7 d8 64 89 01 48 RSP: 002b:00007ffff218f0b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007ffff218f1d8 RCX: 00007f537adaf17b RDX: 00007ffff218f1c0 RSI: 00000000c0181b01 RDI: 0000000000000003 RBP: 00007ffff218f1a0 R08: 00007f537aa8d010 R09: 0000561ee2e4f270 R10: 00007f537aace3a8 R11: 0000000000000246 R12: 00007ffff218f190 R13: 000000000000001c R14: 0000561ee2e4d7c0 R15: 00007ffff218f450 Fixes: f18ec4223117 ("RDMA/mlx5: Use a union inside mlx5_ib_mr") Signed-off-by: Yishai Hadas Link: https://patch.msgid.link/2039c22cfc3df02378747ba4d623a558b53fc263.1738587076.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 0a3cbb14e1839..753faa9ad06a8 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1935,7 +1935,8 @@ mlx5_alloc_priv_descs(struct ib_device *device, static void mlx5_free_priv_descs(struct mlx5_ib_mr *mr) { - if (!mr->umem && !mr->data_direct && mr->descs) { + if (!mr->umem && !mr->data_direct && + mr->ibmr.type != IB_MR_TYPE_DM && mr->descs) { struct ib_device *device = mr->ibmr.device; int size = mr->max_descs * mr->desc_size; struct mlx5_ib_dev *dev = to_mdev(device); -- GitLab From 29b7bb98234cc287cebef9bccf638c2e3f39be71 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Wed, 5 Feb 2025 02:30:05 -0800 Subject: [PATCH 281/989] RDMA/mana_ib: Allocate PAGE aligned doorbell index Allocate a PAGE aligned doorbell index to ensure each process gets a separate PAGE sized doorbell area space remapped to it in mana_ib_mmap Fixes: 0266a177631d ("RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter") Signed-off-by: Shiraz Saleem Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/1738751405-15041-1-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index 67c2d43135a8a..457cea6d99095 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -174,7 +174,7 @@ static int mana_gd_allocate_doorbell_page(struct gdma_context *gc, req.resource_type = GDMA_RESOURCE_DOORBELL_PAGE; req.num_resources = 1; - req.alignment = 1; + req.alignment = PAGE_SIZE / MANA_PAGE_SIZE; /* Have GDMA start searching from 0 */ req.allocated_resources = 0; -- GitLab From 0af4c120f5e7a1ea70aff7da2dfb65b6148a3e84 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 5 Feb 2025 12:10:58 +0200 Subject: [PATCH 282/989] pinctrl: pinconf-generic: Print unsigned value if a format is registered Commit 3ba11e684d16 ("pinctrl: pinconf-generic: print hex value") unconditionally switched to printing hex values in pinconf_generic_dump_one(). However, if a dump format is registered for the dumped pin, the hex value is printed as well. 
This hex value does not necessarily correspond 1:1 with the hardware register value (as noted by commit 3ba11e684d16 ("pinctrl: pinconf-generic: print hex value")). As a result, user-facing output may include information like: output drive strength (0x100 uA). To address this, check if a dump format is registered for the dumped property, and print the unsigned value instead when applicable. Fixes: 3ba11e684d16 ("pinctrl: pinconf-generic: print hex value") Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/20250205101058.2034860-1-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinconf-generic.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c index 0b13d7f17b325..42547f64453e8 100644 --- a/drivers/pinctrl/pinconf-generic.c +++ b/drivers/pinctrl/pinconf-generic.c @@ -89,12 +89,12 @@ static void pinconf_generic_dump_one(struct pinctrl_dev *pctldev, seq_puts(s, items[i].display); /* Print unit if available */ if (items[i].has_arg) { - seq_printf(s, " (0x%x", - pinconf_to_config_argument(config)); + u32 val = pinconf_to_config_argument(config); + if (items[i].format) - seq_printf(s, " %s)", items[i].format); + seq_printf(s, " (%u %s)", val, items[i].format); else - seq_puts(s, ")"); + seq_printf(s, " (0x%x)", val); } } } -- GitLab From 1e3835a8aea5118d58ff9daa656395e69c8806b2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 4 Feb 2025 13:57:29 -0800 Subject: [PATCH 283/989] MAINTAINERS: add entry for ethtool Michal did an amazing job converting ethtool to Netlink, but never added an entry to MAINTAINERS for himself. Create a formal entry so that we can delegate (portions) of this code to folks. Over the last 3 years majority of the reviews have been done by Andrew and I. I suppose Michal didn't want to be on the receiving end of the flood of patches. Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250204215729.168992-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- MAINTAINERS | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 74b09dad46626..20c8daf3ce620 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16455,6 +16455,16 @@ F: include/net/dsa.h F: net/dsa/ F: tools/testing/selftests/drivers/net/dsa/ +NETWORKING [ETHTOOL] +M: Andrew Lunn +M: Jakub Kicinski +F: Documentation/netlink/specs/ethtool.yaml +F: Documentation/networking/ethtool-netlink.rst +F: include/linux/ethtool* +F: include/uapi/linux/ethtool* +F: net/ethtool/ +F: tools/testing/selftests/drivers/net/*/ethtool* + NETWORKING [GENERAL] M: "David S. Miller" M: Eric Dumazet -- GitLab From 82b02a7c459922bbf80e45d5f7e2c4cfef617943 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 4 Feb 2025 13:57:50 -0800 Subject: [PATCH 284/989] MAINTAINERS: add a sample ethtool section entry I feel like we don't do a good enough keeping authors of driver APIs around. The ethtool code base was very nicely compartmentalized by Michal. Establish a precedent of creating MAINTAINERS entries for "sections" of the ethtool API. Use Andrew and cable test as a sample entry. The entry should ideally cover 3 elements: a core file, test(s), and keywords. The last one is important because we intend the entries to cover core code *and* reviews of drivers implementing given API! 
Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250204215750.169249-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 20c8daf3ce620..bd705e9123a3a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16465,6 +16465,12 @@ F: include/uapi/linux/ethtool* F: net/ethtool/ F: tools/testing/selftests/drivers/net/*/ethtool* +NETWORKING [ETHTOOL CABLE TEST] +M: Andrew Lunn +F: net/ethtool/cabletest.c +F: tools/testing/selftests/drivers/net/*/ethtool* +K: cable_test + NETWORKING [GENERAL] M: "David S. Miller" M: Eric Dumazet -- GitLab From ba958ac74800573f7f54dbe2a7a7b9a9a523ed52 Mon Sep 17 00:00:00 2001 From: Oleh Zadorozhnyi Date: Tue, 4 Feb 2025 07:17:30 +0200 Subject: [PATCH 285/989] kbuild: fix misspelling in scripts/Makefile.lib Signed-off-by: Oleh Zadorozhnyi Signed-off-by: Masahiro Yamada --- scripts/Makefile.lib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index ad55ef201aacb..cad20f0e66ee9 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -305,7 +305,7 @@ endef # These are shared by some Makefile.* files. ifdef CONFIG_LTO_CLANG -# Run $(LD) here to covert LLVM IR to ELF in the following cases: +# Run $(LD) here to convert LLVM IR to ELF in the following cases: # - when this object needs objtool processing, as objtool cannot process LLVM IR # - when this is a single-object module, as modpost cannot process LLVM IR cmd_ld_single = $(if $(objtool-enabled)$(is-single-obj-m), ; $(LD) $(ld_flags) -r -o $(tmp-target) $@; mv $(tmp-target) $@) -- GitLab From 5da7e15fb5a12e78de974d8908f348e279922ce9 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 31 Jan 2025 19:01:42 -0800 Subject: [PATCH 286/989] net: Add rx_skb of kfree_skb to raw_tp_null_args[]. Yan Zhai reported a BPF prog could trigger a null-ptr-deref [0] in trace_kfree_skb if the prog does not check if rx_sk is NULL. Commit c53795d48ee8 ("net: add rx_sk to trace_kfree_skb") added rx_sk to trace_kfree_skb, but rx_sk is optional and could be NULL. Let's add kfree_skb to raw_tp_null_args[] to let the BPF verifier validate such a prog and prevent the issue. Now we fail to load such a prog: libbpf: prog 'drop': -- BEGIN PROG LOAD LOG -- 0: R1=ctx() R10=fp0 ; int BPF_PROG(drop, struct sk_buff *skb, void *location, @ kfree_skb_sk_null.bpf.c:21 0: (79) r3 = *(u64 *)(r1 +24) func 'kfree_skb' arg3 has btf_id 5253 type STRUCT 'sock' 1: R1=ctx() R3_w=trusted_ptr_or_null_sock(id=1) ; bpf_printk("sk: %d, %d\n", sk, sk->__sk_common.skc_family); @ kfree_skb_sk_null.bpf.c:24 1: (69) r4 = *(u16 *)(r3 +16) R3 invalid mem access 'trusted_ptr_or_null_' processed 2 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0 -- END PROG LOAD LOG -- Note this fix requires commit 838a10bd2ebf ("bpf: Augment raw_tp arguments with PTR_MAYBE_NULL"). [0]: BUG: kernel NULL pointer dereference, address: 0000000000000010 PF: supervisor read access in kernel mode PF: error_code(0x0000) - not-present page PGD 0 P4D 0 PREEMPT SMP RIP: 0010:bpf_prog_5e21a6db8fcff1aa_drop+0x10/0x2d Call Trace: ? __die+0x1f/0x60 ? page_fault_oops+0x148/0x420 ? search_bpf_extables+0x5b/0x70 ? fixup_exception+0x27/0x2c0 ? exc_page_fault+0x75/0x170 ? asm_exc_page_fault+0x22/0x30 ? bpf_prog_5e21a6db8fcff1aa_drop+0x10/0x2d bpf_trace_run4+0x68/0xd0 ? 
unix_stream_connect+0x1f4/0x6f0 sk_skb_reason_drop+0x90/0x120 unix_stream_connect+0x1f4/0x6f0 __sys_connect+0x7f/0xb0 __x64_sys_connect+0x14/0x20 do_syscall_64+0x47/0xc30 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Fixes: c53795d48ee8 ("net: add rx_sk to trace_kfree_skb") Reported-by: Yan Zhai Closes: https://lore.kernel.org/netdev/Z50zebTRzI962e6X@debian.debian/ Signed-off-by: Kuniyuki Iwashima Tested-by: Yan Zhai Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20250201030142.62703-1-kuniyu@amazon.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9de6acddd479b..c3223e0db2f51 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6507,6 +6507,8 @@ static const struct bpf_raw_tp_null_args raw_tp_null_args[] = { /* rxrpc */ { "rxrpc_recvdata", 0x1 }, { "rxrpc_resend", 0x10 }, + /* skb */ + {"kfree_skb", 0x1000}, /* sunrpc */ { "xs_stream_read_data", 0x1 }, /* ... from xprt_cong_event event class */ -- GitLab From 2a64c96356c87aa8af826605943e5524bf45e24d Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 5 Feb 2025 12:57:47 +0000 Subject: [PATCH 287/989] Revert "net: stmmac: Specify hardware capability value when FIFO size isn't specified" This reverts commit 8865d22656b4, which caused breakage for platforms which are not using xgmac2 or gmac4. Only these two cores have the capability of providing the FIFO sizes from hardware capability fields (which are provided in priv->dma_cap.[tr]x_fifo_size.) All other cores can not, which results in these two fields containing zero. We also have platforms that do not provide a value in priv->plat->[tr]x_fifo_size, resulting in these also being zero. This causes the new tests introduced by the reverted commit to fail, and produce e.g.: stmmaceth f0804000.eth: Can't specify Rx FIFO size An example of such a platform which fails is QEMU's npcm750-evb. This uses dwmac1000 which, as noted above, does not have the capability to provide the FIFO sizes from hardware. Therefore, revert the commit to maintain compatibility with the way the driver used to work. 
Reported-by: Guenter Roeck Link: https://lore.kernel.org/r/4e98f967-f636-46fb-9eca-d383b9495b86@roeck-us.net Signed-off-by: Russell King (Oracle) Tested-by: Steven Price Fixes: 8865d22656b4 ("net: stmmac: Specify hardware capability value when FIFO size isn't specified") Link: https://patch.msgid.link/E1tfeyR-003YGJ-Gb@rmk-PC.armlinux.org.uk Signed-off-by: Paolo Abeni --- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 35 +++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index d04543e5697b0..b34ebb916b898 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2424,6 +2424,11 @@ static void stmmac_dma_operation_mode(struct stmmac_priv *priv) u32 chan = 0; u8 qmode = 0; + if (rxfifosz == 0) + rxfifosz = priv->dma_cap.rx_fifo_size; + if (txfifosz == 0) + txfifosz = priv->dma_cap.tx_fifo_size; + /* Split up the shared Tx/Rx FIFO memory on DW QoS Eth and DW XGMAC */ if (priv->plat->has_gmac4 || priv->plat->has_xgmac) { rxfifosz /= rx_channels_count; @@ -2892,6 +2897,11 @@ static void stmmac_set_dma_operation_mode(struct stmmac_priv *priv, u32 txmode, int rxfifosz = priv->plat->rx_fifo_size; int txfifosz = priv->plat->tx_fifo_size; + if (rxfifosz == 0) + rxfifosz = priv->dma_cap.rx_fifo_size; + if (txfifosz == 0) + txfifosz = priv->dma_cap.tx_fifo_size; + /* Adjust for real per queue fifo size */ rxfifosz /= rx_channels_count; txfifosz /= tx_channels_count; @@ -5868,6 +5878,9 @@ static int stmmac_change_mtu(struct net_device *dev, int new_mtu) const int mtu = new_mtu; int ret; + if (txfifosz == 0) + txfifosz = priv->dma_cap.tx_fifo_size; + txfifosz /= priv->plat->tx_queues_to_use; if (stmmac_xdp_is_enabled(priv) && new_mtu > ETH_DATA_LEN) { @@ -7219,29 +7232,15 @@ static int stmmac_hw_init(struct stmmac_priv *priv) priv->plat->tx_queues_to_use = priv->dma_cap.number_tx_queues; } - if (!priv->plat->rx_fifo_size) { - if (priv->dma_cap.rx_fifo_size) { - priv->plat->rx_fifo_size = priv->dma_cap.rx_fifo_size; - } else { - dev_err(priv->device, "Can't specify Rx FIFO size\n"); - return -ENODEV; - } - } else if (priv->dma_cap.rx_fifo_size && - priv->plat->rx_fifo_size > priv->dma_cap.rx_fifo_size) { + if (priv->dma_cap.rx_fifo_size && + priv->plat->rx_fifo_size > priv->dma_cap.rx_fifo_size) { dev_warn(priv->device, "Rx FIFO size (%u) exceeds dma capability\n", priv->plat->rx_fifo_size); priv->plat->rx_fifo_size = priv->dma_cap.rx_fifo_size; } - if (!priv->plat->tx_fifo_size) { - if (priv->dma_cap.tx_fifo_size) { - priv->plat->tx_fifo_size = priv->dma_cap.tx_fifo_size; - } else { - dev_err(priv->device, "Can't specify Tx FIFO size\n"); - return -ENODEV; - } - } else if (priv->dma_cap.tx_fifo_size && - priv->plat->tx_fifo_size > priv->dma_cap.tx_fifo_size) { + if (priv->dma_cap.tx_fifo_size && + priv->plat->tx_fifo_size > priv->dma_cap.tx_fifo_size) { dev_warn(priv->device, "Tx FIFO size (%u) exceeds dma capability\n", priv->plat->tx_fifo_size); -- GitLab From 3588b1c0fde2f58d166e3f94a5a58d64b893526c Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Thu, 6 Feb 2025 17:57:47 +0900 Subject: [PATCH 288/989] spi: sn-f-ospi: Fix division by zero When there is no dummy cycle in the spi-nor commands, both dummy bus cycle bytes and width are zero. Because of the cpu's warning when divided by zero, the warning should be avoided. Return just zero to avoid such calculations. 
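The guard is the usual "no work means no math" early return; a tiny standalone sketch of the calculation (illustrative only, not the driver code itself):

#include <stdio.h>

/* Dummy bus cycles = dummy bits / bus width.  With no dummy phase both
 * inputs are zero, so the division has to be skipped entirely. */
static unsigned int dummy_cycles(unsigned int nbytes, unsigned int buswidth)
{
        if (!nbytes)
                return 0;                       /* avoids 0 / 0 */
        return (nbytes * 8) / buswidth;
}

int main(void)
{
        printf("%u\n", dummy_cycles(0, 0));     /* 0: no dummy phase */
        printf("%u\n", dummy_cycles(2, 4));     /* 4: 16 bits on a 4-bit bus */
        return 0;
}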
Fixes: 1b74dd64c861 ("spi: Add Socionext F_OSPI SPI flash controller driver") Co-developed-by: Kohei Ito Signed-off-by: Kohei Ito Signed-off-by: Kunihiko Hayashi Link: https://patch.msgid.link/20250206085747.3834148-1-hayashi.kunihiko@socionext.com Signed-off-by: Mark Brown --- drivers/spi/spi-sn-f-ospi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/spi/spi-sn-f-ospi.c b/drivers/spi/spi-sn-f-ospi.c index 6ad4b729897e3..c4969f66a0ba9 100644 --- a/drivers/spi/spi-sn-f-ospi.c +++ b/drivers/spi/spi-sn-f-ospi.c @@ -116,6 +116,9 @@ struct f_ospi { static u32 f_ospi_get_dummy_cycle(const struct spi_mem_op *op) { + if (!op->dummy.nbytes) + return 0; + return (op->dummy.nbytes * 8) / op->dummy.buswidth; } -- GitLab From 517e8a7835e8cfb398a0aeb0133de50e31cae32b Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Wed, 5 Feb 2025 17:00:59 +0000 Subject: [PATCH 289/989] bpf: Fix softlockup in arena_map_free on 64k page kernel On an aarch64 kernel with CONFIG_PAGE_SIZE_64KB=y, arena_htab tests cause a segmentation fault and soft lockup. The same failure is not observed with 4k pages on aarch64. It turns out arena_map_free() is calling apply_to_existing_page_range() with the address returned by bpf_arena_get_kern_vm_start(). If this address is not page-aligned the code ends up calling apply_to_pte_range() with that unaligned address causing soft lockup. Fix it by round up GUARD_SZ to PAGE_SIZE << 1 so that the division by 2 in bpf_arena_get_kern_vm_start() returns a page-aligned value. Fixes: 317460317a02 ("bpf: Introduce bpf_arena.") Reported-by: Colm Harrington Suggested-by: Alexei Starovoitov Signed-off-by: Alan Maguire Link: https://lore.kernel.org/r/20250205170059.427458-1-alan.maguire@oracle.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arena.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 870aeb51d70ad..095a9554e1def 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -39,7 +39,7 @@ */ /* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */ -#define GUARD_SZ (1ull << sizeof_field(struct bpf_insn, off) * 8) +#define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1) #define KERN_VM_SZ (SZ_4G + GUARD_SZ) struct bpf_arena { -- GitLab From fb97bc2e47f694f79d6358d981ae0428db8e8088 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 29 Jan 2025 15:21:53 +0100 Subject: [PATCH 290/989] drm/tests: hdmi: Fix WW_MUTEX_SLOWPATH failures The light_up_connector helper function in the HDMI infrastructure unit tests uses drm_atomic_set_crtc_for_connector(), but fails when it returns an error. This function can return EDEADLK though if the sequence needs to be restarted, and WW_MUTEX_SLOWPATH is meant to test that we handle it properly. Let's handle EDEADLK and restart the sequence in our tests as well. 
Fixes: eb66d34d793e ("drm/tests: Add output bpc tests") Reported-by: Dave Airlie Closes: https://lore.kernel.org/r/CAPM=9tzJ4-ERDxvuwrCyUPY0=+P44orhp1kLWVGL7MCfpQjMEQ@mail.gmail.com/ Link: https://lore.kernel.org/r/20241031091558.2435850-1-mripard@kernel.org Reviewed-by: Simona Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20250129-test-kunit-v2-1-fe59c43805d5@kernel.org Signed-off-by: Maxime Ripard --- drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c b/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c index b976a5e9aef58..8e6eb94075a5e 100644 --- a/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c +++ b/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c @@ -70,10 +70,17 @@ static int light_up_connector(struct kunit *test, state = drm_kunit_helper_atomic_state_alloc(test, drm, ctx); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, state); +retry: conn_state = drm_atomic_get_connector_state(state, connector); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, conn_state); ret = drm_atomic_set_crtc_for_connector(conn_state, crtc); + if (ret == -EDEADLK) { + drm_atomic_state_clear(state); + ret = drm_modeset_backoff(ctx); + if (!ret) + goto retry; + } KUNIT_EXPECT_EQ(test, ret, 0); crtc_state = drm_atomic_get_crtc_state(state, crtc); -- GitLab From bb4f929a8875b4801db95b8cf3b2c527c1e475e0 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 29 Jan 2025 15:21:54 +0100 Subject: [PATCH 291/989] drm/tests: hdmi: Remove redundant assignments Some tests have the drm pointer assigned multiple times to the same value. Drop the redundant assignments. Reviewed-by: Simona Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20250129-test-kunit-v2-2-fe59c43805d5@kernel.org Signed-off-by: Maxime Ripard --- drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c b/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c index 8e6eb94075a5e..a36422aa9e274 100644 --- a/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c +++ b/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c @@ -481,7 +481,6 @@ static void drm_test_check_broadcast_rgb_auto_cea_mode_vic_1(struct kunit *test) mode = drm_kunit_display_mode_from_cea_vic(test, drm, 1); KUNIT_ASSERT_NOT_NULL(test, mode); - drm = &priv->drm; crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, mode, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -595,7 +594,6 @@ static void drm_test_check_broadcast_rgb_full_cea_mode_vic_1(struct kunit *test) mode = drm_kunit_display_mode_from_cea_vic(test, drm, 1); KUNIT_ASSERT_NOT_NULL(test, mode); - drm = &priv->drm; crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, mode, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -711,7 +709,6 @@ static void drm_test_check_broadcast_rgb_limited_cea_mode_vic_1(struct kunit *te mode = drm_kunit_display_mode_from_cea_vic(test, drm, 1); KUNIT_ASSERT_NOT_NULL(test, mode); - drm = &priv->drm; crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, mode, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -1313,7 +1310,6 @@ static void drm_test_check_output_bpc_format_vic_1(struct kunit *test) rate = mode->clock * 1500; KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); - drm = &priv->drm; crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, mode, ctx); KUNIT_EXPECT_EQ(test, ret, 0); -- GitLab From 6b6bfd63e1626ceedc738b2a06505aa5b46c1481 Mon Sep 17 00:00:00 2001 From: 
Maxime Ripard Date: Wed, 29 Jan 2025 15:21:55 +0100 Subject: [PATCH 292/989] drm/tests: hdmi: Reorder DRM entities variables assignment The tests all deviate slightly in how they assign their local pointers to DRM entities. This makes refactoring pretty difficult, so let's just move the assignment as soon as the entities are allocated. Reviewed-by: Simona Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20250129-test-kunit-v2-3-fe59c43805d5@kernel.org Signed-off-by: Maxime Ripard --- .../drm/tests/drm_hdmi_state_helper_test.c | 81 ++++++++++--------- 1 file changed, 42 insertions(+), 39 deletions(-) diff --git a/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c b/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c index a36422aa9e274..925724b578789 100644 --- a/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c +++ b/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c @@ -289,15 +289,16 @@ static void drm_test_check_broadcast_rgb_crtc_mode_changed(struct kunit *test) 8); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; + conn = &priv->connector; + ctx = drm_kunit_helper_acquire_ctx_alloc(test); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - conn = &priv->connector; preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -352,15 +353,16 @@ static void drm_test_check_broadcast_rgb_crtc_mode_not_changed(struct kunit *tes 8); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; + conn = &priv->connector; + ctx = drm_kunit_helper_acquire_ctx_alloc(test); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - conn = &priv->connector; preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -415,6 +417,8 @@ static void drm_test_check_broadcast_rgb_auto_cea_mode(struct kunit *test) 8); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; KUNIT_ASSERT_TRUE(test, conn->display_info.is_hdmi); @@ -425,8 +429,6 @@ static void drm_test_check_broadcast_rgb_auto_cea_mode(struct kunit *test) KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_NE(test, drm_match_cea_mode(preferred), 1); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -526,6 +528,8 @@ static void drm_test_check_broadcast_rgb_full_cea_mode(struct kunit *test) 8); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; KUNIT_ASSERT_TRUE(test, conn->display_info.is_hdmi); @@ -536,8 +540,6 @@ static void drm_test_check_broadcast_rgb_full_cea_mode(struct kunit *test) KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_NE(test, drm_match_cea_mode(preferred), 1); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -641,6 +643,8 @@ static void drm_test_check_broadcast_rgb_limited_cea_mode(struct kunit *test) 8); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; KUNIT_ASSERT_TRUE(test, conn->display_info.is_hdmi); @@ -651,8 +655,6 @@ static void drm_test_check_broadcast_rgb_limited_cea_mode(struct kunit *test) KUNIT_ASSERT_NOT_NULL(test, preferred); 
KUNIT_ASSERT_NE(test, drm_match_cea_mode(preferred), 1); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -758,6 +760,8 @@ static void drm_test_check_output_bpc_crtc_mode_changed(struct kunit *test) 10); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; ret = set_connector_edid(test, conn, test_edid_hdmi_1080p_rgb_yuv_dc_max_200mhz, @@ -770,8 +774,6 @@ static void drm_test_check_output_bpc_crtc_mode_changed(struct kunit *test) preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -832,6 +834,8 @@ static void drm_test_check_output_bpc_crtc_mode_not_changed(struct kunit *test) 10); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; ret = set_connector_edid(test, conn, test_edid_hdmi_1080p_rgb_yuv_dc_max_200mhz, @@ -844,8 +848,6 @@ static void drm_test_check_output_bpc_crtc_mode_not_changed(struct kunit *test) preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -903,6 +905,8 @@ static void drm_test_check_output_bpc_dvi(struct kunit *test) 12); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; ret = set_connector_edid(test, conn, test_edid_dvi_1080p, @@ -918,8 +922,6 @@ static void drm_test_check_output_bpc_dvi(struct kunit *test) preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -950,6 +952,8 @@ static void drm_test_check_tmds_char_rate_rgb_8bpc(struct kunit *test) 8); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; ret = set_connector_edid(test, conn, test_edid_hdmi_1080p_rgb_max_200mhz, @@ -963,8 +967,6 @@ static void drm_test_check_tmds_char_rate_rgb_8bpc(struct kunit *test) KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_FALSE(test, preferred->flags & DRM_MODE_FLAG_DBLCLK); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -997,6 +999,8 @@ static void drm_test_check_tmds_char_rate_rgb_10bpc(struct kunit *test) 10); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; ret = set_connector_edid(test, conn, test_edid_hdmi_1080p_rgb_yuv_dc_max_340mhz, @@ -1010,8 +1014,6 @@ static void drm_test_check_tmds_char_rate_rgb_10bpc(struct kunit *test) KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_FALSE(test, preferred->flags & DRM_MODE_FLAG_DBLCLK); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -1044,6 +1046,8 @@ static void drm_test_check_tmds_char_rate_rgb_12bpc(struct kunit *test) 12); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; ret = set_connector_edid(test, conn, test_edid_hdmi_1080p_rgb_yuv_dc_max_340mhz, @@ -1057,8 +1061,6 @@ static void drm_test_check_tmds_char_rate_rgb_12bpc(struct kunit *test) 
KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_FALSE(test, preferred->flags & DRM_MODE_FLAG_DBLCLK); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -1095,15 +1097,16 @@ static void drm_test_check_hdmi_funcs_reject_rate(struct kunit *test) 8); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; + conn = &priv->connector; + ctx = drm_kunit_helper_acquire_ctx_alloc(test); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - conn = &priv->connector; preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -1151,6 +1154,8 @@ static void drm_test_check_max_tmds_rate_bpc_fallback(struct kunit *test) 12); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; ret = set_connector_edid(test, conn, test_edid_hdmi_1080p_rgb_yuv_dc_max_200mhz, @@ -1174,8 +1179,6 @@ static void drm_test_check_max_tmds_rate_bpc_fallback(struct kunit *test) rate = drm_hdmi_compute_mode_clock(preferred, 10, HDMI_COLORSPACE_RGB); KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_EXPECT_EQ(test, ret, 0); @@ -1220,6 +1223,8 @@ static void drm_test_check_max_tmds_rate_format_fallback(struct kunit *test) 12); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; ret = set_connector_edid(test, conn, test_edid_hdmi_1080p_rgb_yuv_dc_max_200mhz, @@ -1246,8 +1251,6 @@ static void drm_test_check_max_tmds_rate_format_fallback(struct kunit *test) rate = drm_hdmi_compute_mode_clock(preferred, 12, HDMI_COLORSPACE_YUV422); KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_EXPECT_EQ(test, ret, 0); @@ -1343,6 +1346,8 @@ static void drm_test_check_output_bpc_format_driver_rgb_only(struct kunit *test) 12); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; ret = set_connector_edid(test, conn, test_edid_hdmi_1080p_rgb_yuv_dc_max_200mhz, @@ -1374,8 +1379,6 @@ static void drm_test_check_output_bpc_format_driver_rgb_only(struct kunit *test) rate = drm_hdmi_compute_mode_clock(preferred, 12, HDMI_COLORSPACE_YUV422); KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_EXPECT_EQ(test, ret, 0); @@ -1410,6 +1413,8 @@ static void drm_test_check_output_bpc_format_display_rgb_only(struct kunit *test 12); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; ret = set_connector_edid(test, conn, test_edid_hdmi_1080p_rgb_max_200mhz, @@ -1441,8 +1446,6 @@ static void drm_test_check_output_bpc_format_display_rgb_only(struct kunit *test rate = drm_hdmi_compute_mode_clock(preferred, 12, HDMI_COLORSPACE_YUV422); KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_EXPECT_EQ(test, ret, 0); @@ -1476,6 +1479,8 @@ static void drm_test_check_output_bpc_format_driver_8bpc_only(struct kunit *test 8); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = 
&priv->drm; + crtc = priv->crtc; conn = &priv->connector; ret = set_connector_edid(test, conn, test_edid_hdmi_1080p_rgb_yuv_dc_max_340mhz, @@ -1499,8 +1504,6 @@ static void drm_test_check_output_bpc_format_driver_8bpc_only(struct kunit *test rate = drm_hdmi_compute_mode_clock(preferred, 12, HDMI_COLORSPACE_RGB); KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_EXPECT_EQ(test, ret, 0); @@ -1536,6 +1539,8 @@ static void drm_test_check_output_bpc_format_display_8bpc_only(struct kunit *tes 12); KUNIT_ASSERT_NOT_NULL(test, priv); + drm = &priv->drm; + crtc = priv->crtc; conn = &priv->connector; ret = set_connector_edid(test, conn, test_edid_hdmi_1080p_rgb_max_340mhz, @@ -1559,8 +1564,6 @@ static void drm_test_check_output_bpc_format_display_8bpc_only(struct kunit *tes rate = drm_hdmi_compute_mode_clock(preferred, 12, HDMI_COLORSPACE_RGB); KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); - drm = &priv->drm; - crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_EXPECT_EQ(test, ret, 0); -- GitLab From 5d14c08a47460e8eedf0185a28b116420ea7f29d Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 29 Jan 2025 15:21:56 +0100 Subject: [PATCH 293/989] drm/tests: hdmi: Fix recursive locking The find_preferred_mode() functions takes the mode_config mutex, but due to the order most tests have, is called with the crtc_ww_class_mutex taken. This raises a warning for a circular dependency when running the tests with lockdep. Reorder the tests to call find_preferred_mode before the acquire context has been created to avoid the issue. Reviewed-by: Simona Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20250129-test-kunit-v2-4-fe59c43805d5@kernel.org Signed-off-by: Maxime Ripard --- .../drm/tests/drm_hdmi_state_helper_test.c | 114 +++++++++--------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c b/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c index 925724b578789..23ecc00accb21 100644 --- a/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c +++ b/drivers/gpu/drm/tests/drm_hdmi_state_helper_test.c @@ -293,12 +293,12 @@ static void drm_test_check_broadcast_rgb_crtc_mode_changed(struct kunit *test) crtc = priv->crtc; conn = &priv->connector; - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -357,12 +357,12 @@ static void drm_test_check_broadcast_rgb_crtc_mode_not_changed(struct kunit *tes crtc = priv->crtc; conn = &priv->connector; - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -422,13 +422,13 @@ static void drm_test_check_broadcast_rgb_auto_cea_mode(struct kunit *test) conn = &priv->connector; KUNIT_ASSERT_TRUE(test, conn->display_info.is_hdmi); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - 
preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_NE(test, drm_match_cea_mode(preferred), 1); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -533,13 +533,13 @@ static void drm_test_check_broadcast_rgb_full_cea_mode(struct kunit *test) conn = &priv->connector; KUNIT_ASSERT_TRUE(test, conn->display_info.is_hdmi); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_NE(test, drm_match_cea_mode(preferred), 1); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -648,13 +648,13 @@ static void drm_test_check_broadcast_rgb_limited_cea_mode(struct kunit *test) conn = &priv->connector; KUNIT_ASSERT_TRUE(test, conn->display_info.is_hdmi); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_NE(test, drm_match_cea_mode(preferred), 1); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -768,12 +768,12 @@ static void drm_test_check_output_bpc_crtc_mode_changed(struct kunit *test) ARRAY_SIZE(test_edid_hdmi_1080p_rgb_yuv_dc_max_200mhz)); KUNIT_ASSERT_GT(test, ret, 0); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -842,12 +842,12 @@ static void drm_test_check_output_bpc_crtc_mode_not_changed(struct kunit *test) ARRAY_SIZE(test_edid_hdmi_1080p_rgb_yuv_dc_max_200mhz)); KUNIT_ASSERT_GT(test, ret, 0); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -916,12 +916,12 @@ static void drm_test_check_output_bpc_dvi(struct kunit *test) info = &conn->display_info; KUNIT_ASSERT_FALSE(test, info->is_hdmi); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -960,13 +960,13 @@ static void drm_test_check_tmds_char_rate_rgb_8bpc(struct kunit *test) ARRAY_SIZE(test_edid_hdmi_1080p_rgb_max_200mhz)); KUNIT_ASSERT_GT(test, ret, 0); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_FALSE(test, preferred->flags & 
DRM_MODE_FLAG_DBLCLK); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -1007,13 +1007,13 @@ static void drm_test_check_tmds_char_rate_rgb_10bpc(struct kunit *test) ARRAY_SIZE(test_edid_hdmi_1080p_rgb_yuv_dc_max_340mhz)); KUNIT_ASSERT_GT(test, ret, 0); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_FALSE(test, preferred->flags & DRM_MODE_FLAG_DBLCLK); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -1054,13 +1054,13 @@ static void drm_test_check_tmds_char_rate_rgb_12bpc(struct kunit *test) ARRAY_SIZE(test_edid_hdmi_1080p_rgb_yuv_dc_max_340mhz)); KUNIT_ASSERT_GT(test, ret, 0); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_FALSE(test, preferred->flags & DRM_MODE_FLAG_DBLCLK); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -1101,12 +1101,12 @@ static void drm_test_check_hdmi_funcs_reject_rate(struct kunit *test) crtc = priv->crtc; conn = &priv->connector; - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_ASSERT_EQ(test, ret, 0); @@ -1166,9 +1166,6 @@ static void drm_test_check_max_tmds_rate_bpc_fallback(struct kunit *test) KUNIT_ASSERT_TRUE(test, info->is_hdmi); KUNIT_ASSERT_GT(test, info->max_tmds_clock, 0); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_FALSE(test, preferred->flags & DRM_MODE_FLAG_DBLCLK); @@ -1179,6 +1176,9 @@ static void drm_test_check_max_tmds_rate_bpc_fallback(struct kunit *test) rate = drm_hdmi_compute_mode_clock(preferred, 10, HDMI_COLORSPACE_RGB); KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_EXPECT_EQ(test, ret, 0); @@ -1235,9 +1235,6 @@ static void drm_test_check_max_tmds_rate_format_fallback(struct kunit *test) KUNIT_ASSERT_TRUE(test, info->is_hdmi); KUNIT_ASSERT_GT(test, info->max_tmds_clock, 0); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); KUNIT_ASSERT_FALSE(test, preferred->flags & DRM_MODE_FLAG_DBLCLK); @@ -1251,6 +1248,9 @@ static void drm_test_check_max_tmds_rate_format_fallback(struct kunit *test) rate = drm_hdmi_compute_mode_clock(preferred, 12, HDMI_COLORSPACE_YUV422); KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = 
light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_EXPECT_EQ(test, ret, 0); @@ -1297,9 +1297,6 @@ static void drm_test_check_output_bpc_format_vic_1(struct kunit *test) KUNIT_ASSERT_TRUE(test, info->is_hdmi); KUNIT_ASSERT_GT(test, info->max_tmds_clock, 0); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - mode = drm_kunit_display_mode_from_cea_vic(test, drm, 1); KUNIT_ASSERT_NOT_NULL(test, mode); @@ -1313,6 +1310,9 @@ static void drm_test_check_output_bpc_format_vic_1(struct kunit *test) rate = mode->clock * 1500; KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + crtc = priv->crtc; ret = light_up_connector(test, drm, crtc, conn, mode, ctx); KUNIT_EXPECT_EQ(test, ret, 0); @@ -1358,9 +1358,6 @@ static void drm_test_check_output_bpc_format_driver_rgb_only(struct kunit *test) KUNIT_ASSERT_TRUE(test, info->is_hdmi); KUNIT_ASSERT_GT(test, info->max_tmds_clock, 0); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); @@ -1379,6 +1376,9 @@ static void drm_test_check_output_bpc_format_driver_rgb_only(struct kunit *test) rate = drm_hdmi_compute_mode_clock(preferred, 12, HDMI_COLORSPACE_YUV422); KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_EXPECT_EQ(test, ret, 0); @@ -1425,9 +1425,6 @@ static void drm_test_check_output_bpc_format_display_rgb_only(struct kunit *test KUNIT_ASSERT_TRUE(test, info->is_hdmi); KUNIT_ASSERT_GT(test, info->max_tmds_clock, 0); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); @@ -1446,6 +1443,9 @@ static void drm_test_check_output_bpc_format_display_rgb_only(struct kunit *test rate = drm_hdmi_compute_mode_clock(preferred, 12, HDMI_COLORSPACE_YUV422); KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_EXPECT_EQ(test, ret, 0); @@ -1491,9 +1491,6 @@ static void drm_test_check_output_bpc_format_driver_8bpc_only(struct kunit *test KUNIT_ASSERT_TRUE(test, info->is_hdmi); KUNIT_ASSERT_GT(test, info->max_tmds_clock, 0); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); KUNIT_ASSERT_NOT_NULL(test, preferred); @@ -1504,6 +1501,9 @@ static void drm_test_check_output_bpc_format_driver_8bpc_only(struct kunit *test rate = drm_hdmi_compute_mode_clock(preferred, 12, HDMI_COLORSPACE_RGB); KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_EXPECT_EQ(test, ret, 0); @@ -1551,9 +1551,6 @@ static void drm_test_check_output_bpc_format_display_8bpc_only(struct kunit *tes KUNIT_ASSERT_TRUE(test, info->is_hdmi); KUNIT_ASSERT_GT(test, info->max_tmds_clock, 0); - ctx = drm_kunit_helper_acquire_ctx_alloc(test); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); - preferred = find_preferred_mode(conn); 
KUNIT_ASSERT_NOT_NULL(test, preferred); @@ -1564,6 +1561,9 @@ static void drm_test_check_output_bpc_format_display_8bpc_only(struct kunit *tes rate = drm_hdmi_compute_mode_clock(preferred, 12, HDMI_COLORSPACE_RGB); KUNIT_ASSERT_LT(test, rate, info->max_tmds_clock * 1000); + ctx = drm_kunit_helper_acquire_ctx_alloc(test); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + ret = light_up_connector(test, drm, crtc, conn, preferred, ctx); KUNIT_EXPECT_EQ(test, ret, 0); -- GitLab From b029628be267cba3c7684ec684749fe3e4372398 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 12 Jan 2025 23:39:01 -0600 Subject: [PATCH 294/989] alpha/elf: Fix misc/setarch test of util-linux by removing 32bit support Richard Henderson writes[1]: > There was a Spec benchmark (I forget which) which was memory bound and ran > twice as fast with 32-bit pointers. > > I copied the idea from DEC to the ELF abi, but never did all the other work > to allow the toolchain to take advantage. > > Amusingly, a later Spec changed the benchmark data sets to not fit into a > 32-bit address space, specifically because of this. > > I expect one could delete the ELF bit and personality and no one would > notice. Not even the 10 remaining Alpha users. In [2] it was pointed out that parts of setarch weren't working properly on alpha because it has it's own SET_PERSONALITY implementation. In the discussion that followed Richard Henderson pointed out that the 32bit pointer support for alpha was never completed. Fix this by removing alpha's 32bit pointer support. As a bit of paranoia refuse to execute any alpha binaries that have the EF_ALPHA_32BIT flag set. Just in case someone somewhere has binaries that try to use alpha's 32bit pointer support. Link: https://lkml.kernel.org/r/CAFXwXrkgu=4Qn-v1PjnOR4SG0oUb9LSa0g6QXpBq4ttm52pJOQ@mail.gmail.com [1] Link: https://lkml.kernel.org/r/20250103140148.370368-1-glaubitz@physik.fu-berlin.de [2] Signed-off-by: Eric W. Biederman Reviewed-by: Richard Henderson Reviewed-by: Arnd Bergmann Reviewed-by: John Paul Adrian Glaubitz Tested-by: John Paul Adrian Glaubitz Link: https://lore.kernel.org/r/87y0zfs26i.fsf_-_@email.froward.int.ebiederm.org Signed-off-by: Kees Cook --- arch/alpha/include/asm/elf.h | 6 +----- arch/alpha/include/asm/pgtable.h | 2 +- arch/alpha/include/asm/processor.h | 8 ++------ arch/alpha/kernel/osf_sys.c | 11 ++--------- 4 files changed, 6 insertions(+), 21 deletions(-) diff --git a/arch/alpha/include/asm/elf.h b/arch/alpha/include/asm/elf.h index 4d7c46f50382e..50c82187e60ec 100644 --- a/arch/alpha/include/asm/elf.h +++ b/arch/alpha/include/asm/elf.h @@ -74,7 +74,7 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG]; /* * This is used to ensure we don't load something for the wrong architecture. */ -#define elf_check_arch(x) ((x)->e_machine == EM_ALPHA) +#define elf_check_arch(x) (((x)->e_machine == EM_ALPHA) && !((x)->e_flags & EF_ALPHA_32BIT)) /* * These are used to set parameters in the core dumps. @@ -137,10 +137,6 @@ extern int dump_elf_task(elf_greg_t *dest, struct task_struct *task); : amask (AMASK_CIX) ? "ev6" : "ev67"); \ }) -#define SET_PERSONALITY(EX) \ - set_personality(((EX).e_flags & EF_ALPHA_32BIT) \ - ? 
PER_LINUX_32BIT : PER_LINUX) - extern int alpha_l1i_cacheshape; extern int alpha_l1d_cacheshape; extern int alpha_l2_cacheshape; diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index 635f0a5f5bbde..02e8817a89212 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -360,7 +360,7 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) extern void paging_init(void); -/* We have our own get_unmapped_area to cope with ADDR_LIMIT_32BIT. */ +/* We have our own get_unmapped_area */ #define HAVE_ARCH_UNMAPPED_AREA #endif /* _ALPHA_PGTABLE_H */ diff --git a/arch/alpha/include/asm/processor.h b/arch/alpha/include/asm/processor.h index 55bb1c09fd39d..5dce5518a2111 100644 --- a/arch/alpha/include/asm/processor.h +++ b/arch/alpha/include/asm/processor.h @@ -8,23 +8,19 @@ #ifndef __ASM_ALPHA_PROCESSOR_H #define __ASM_ALPHA_PROCESSOR_H -#include /* for ADDR_LIMIT_32BIT */ - /* * We have a 42-bit user address space: 4TB user VM... */ #define TASK_SIZE (0x40000000000UL) -#define STACK_TOP \ - (current->personality & ADDR_LIMIT_32BIT ? 0x80000000 : 0x00120000000UL) +#define STACK_TOP (0x00120000000UL) #define STACK_TOP_MAX 0x00120000000UL /* This decides where the kernel will search for a free chunk of vm * space during mmap's. */ -#define TASK_UNMAPPED_BASE \ - ((current->personality & ADDR_LIMIT_32BIT) ? 0x40000000 : TASK_SIZE / 2) +#define TASK_UNMAPPED_BASE (TASK_SIZE / 2) /* This is dead. Everything has been moved to thread_info. */ struct thread_struct { }; diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index c0424de9e7cda..077a1407be6d7 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -1211,8 +1211,7 @@ SYSCALL_DEFINE1(old_adjtimex, struct timex32 __user *, txc_p) return ret; } -/* Get an address range which is currently unmapped. Similar to the - generic version except that we know how to honor ADDR_LIMIT_32BIT. */ +/* Get an address range which is currently unmapped. */ static unsigned long arch_get_unmapped_area_1(unsigned long addr, unsigned long len, @@ -1231,13 +1230,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { - unsigned long limit; - - /* "32 bit" actually means 31 bit, since pointers sign extend. */ - if (current->personality & ADDR_LIMIT_32BIT) - limit = 0x80000000; - else - limit = TASK_SIZE; + unsigned long limit = TASK_SIZE; if (len > limit) return -ENOMEM; -- GitLab From 7507eb3e7bfac7c3baef8dd377fdf5871eefd42b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 31 Jan 2025 17:29:13 +0200 Subject: [PATCH 295/989] PCI/ASPM: Fix L1SS saving MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 1db806ec06b7 ("PCI/ASPM: Save parent L1SS config in pci_save_aspm_l1ss_state()") aimed to perform L1SS config save for both the Upstream Port and its upstream bridge when handling an Upstream Port, which matches what the L1SS restore side does. However, parent->state_saved can be set true at an earlier time when the upstream bridge saved other parts of its state. Then later when attempting to save the L1SS config while handling the Upstream Port, parent->state_saved is true in pci_save_aspm_l1ss_state() resulting in early return and skipping saving bridge's L1SS config because it is assumed to be already saved. Later on restore, junk is written into L1SS config which causes issues with some devices. 
Remove parent->state_saved check and unconditionally save L1SS config also for the upstream bridge from an Upstream Port, which ought to be harmless from a correctness point of view. With the Upstream Port check now present, saving the L1SS config more than once for the bridge is no longer a problem (unlike when the parent->state_saved check got introduced into the fix during its development). Link: https://lore.kernel.org/r/20250131152913.2507-1-ilpo.jarvinen@linux.intel.com Fixes: 1db806ec06b7 ("PCI/ASPM: Save parent L1SS config in pci_save_aspm_l1ss_state()") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219731 Reported-by: Niklāvs Koļesņikovs Reported-by: Rafael J. Wysocki Closes: https://lore.kernel.org/r/CAJZ5v0iKmynOQ5vKSQbg1J_FmavwZE-nRONovOZ0mpMVauheWg@mail.gmail.com Reported-by: Paul Menzel Closes: https://lore.kernel.org/r/d7246feb-4f3f-4d0c-bb64-89566b170671@molgen.mpg.de Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Tested-by: Niklāvs Koļesņikovs Tested-by: Paul Menzel # Dell XPS 13 9360 --- drivers/pci/pcie/aspm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index e0bc90597dcad..da3e7edcf49d9 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -108,9 +108,6 @@ void pci_save_aspm_l1ss_state(struct pci_dev *pdev) pci_read_config_dword(pdev, pdev->l1ss + PCI_L1SS_CTL2, cap++); pci_read_config_dword(pdev, pdev->l1ss + PCI_L1SS_CTL1, cap++); - if (parent->state_saved) - return; - /* * Save parent's L1 substate configuration so we have it for * pci_restore_aspm_l1ss_state(pdev) to restore. -- GitLab From 57e4a9bd61c308f607bc3e55e8fa02257b06b552 Mon Sep 17 00:00:00 2001 From: Meetakshi Setiya Date: Thu, 6 Feb 2025 01:50:41 -0500 Subject: [PATCH 296/989] smb: client: change lease epoch type from unsigned int to __u16 MS-SMB2 section 2.2.13.2.10 specifies that 'epoch' should be a 16-bit unsigned integer used to track lease state changes. Change the data type of all instances of 'epoch' from unsigned int to __u16. This simplifies the epoch change comparisons and makes the code more compliant with the protocol spec.
Cc: stable@vger.kernel.org Signed-off-by: Meetakshi Setiya Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 14 +++++++------- fs/smb/client/smb1ops.c | 2 +- fs/smb/client/smb2ops.c | 18 +++++++++--------- fs/smb/client/smb2pdu.c | 2 +- fs/smb/client/smb2proto.h | 2 +- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index a68434ad744ae..ac1f890a0d543 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -357,7 +357,7 @@ struct smb_version_operations { int (*handle_cancelled_mid)(struct mid_q_entry *, struct TCP_Server_Info *); void (*downgrade_oplock)(struct TCP_Server_Info *server, struct cifsInodeInfo *cinode, __u32 oplock, - unsigned int epoch, bool *purge_cache); + __u16 epoch, bool *purge_cache); /* process transaction2 response */ bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *, char *, int); @@ -552,12 +552,12 @@ struct smb_version_operations { /* if we can do cache read operations */ bool (*is_read_op)(__u32); /* set oplock level for the inode */ - void (*set_oplock_level)(struct cifsInodeInfo *, __u32, unsigned int, - bool *); + void (*set_oplock_level)(struct cifsInodeInfo *cinode, __u32 oplock, __u16 epoch, + bool *purge_cache); /* create lease context buffer for CREATE request */ char * (*create_lease_buf)(u8 *lease_key, u8 oplock); /* parse lease context buffer and return oplock/epoch info */ - __u8 (*parse_lease_buf)(void *buf, unsigned int *epoch, char *lkey); + __u8 (*parse_lease_buf)(void *buf, __u16 *epoch, char *lkey); ssize_t (*copychunk_range)(const unsigned int, struct cifsFileInfo *src_file, struct cifsFileInfo *target_file, @@ -1447,7 +1447,7 @@ struct cifs_fid { __u8 create_guid[16]; __u32 access; struct cifs_pending_open *pending_open; - unsigned int epoch; + __u16 epoch; #ifdef CONFIG_CIFS_DEBUG2 __u64 mid; #endif /* CIFS_DEBUG2 */ @@ -1480,7 +1480,7 @@ struct cifsFileInfo { bool oplock_break_cancelled:1; bool status_file_deleted:1; /* file has been deleted */ bool offload:1; /* offload final part of _put to a wq */ - unsigned int oplock_epoch; /* epoch from the lease break */ + __u16 oplock_epoch; /* epoch from the lease break */ __u32 oplock_level; /* oplock/lease level from the lease break */ int count; spinlock_t file_info_lock; /* protects four flag/count fields above */ @@ -1577,7 +1577,7 @@ struct cifsInodeInfo { spinlock_t open_file_lock; /* protects openFileList */ __u32 cifsAttrs; /* e.g. 
DOS archive bit, sparse, compressed, system */ unsigned int oplock; /* oplock/lease level we have */ - unsigned int epoch; /* used to track lease state changes */ + __u16 epoch; /* used to track lease state changes */ #define CIFS_INODE_PENDING_OPLOCK_BREAK (0) /* oplock break in progress */ #define CIFS_INODE_PENDING_WRITERS (1) /* Writes in progress */ #define CIFS_INODE_FLAG_UNUSED (2) /* Unused flag */ diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index 9756b876a75e1..d6e2fb669c401 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -377,7 +377,7 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) static void cifs_downgrade_oplock(struct TCP_Server_Info *server, struct cifsInodeInfo *cinode, __u32 oplock, - unsigned int epoch, bool *purge_cache) + __u16 epoch, bool *purge_cache) { cifs_set_oplock_level(cinode, oplock); } diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 77309217dab45..ec36bed54b0b9 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -3904,22 +3904,22 @@ static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode, static void smb2_downgrade_oplock(struct TCP_Server_Info *server, struct cifsInodeInfo *cinode, __u32 oplock, - unsigned int epoch, bool *purge_cache) + __u16 epoch, bool *purge_cache) { server->ops->set_oplock_level(cinode, oplock, 0, NULL); } static void smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, - unsigned int epoch, bool *purge_cache); + __u16 epoch, bool *purge_cache); static void smb3_downgrade_oplock(struct TCP_Server_Info *server, struct cifsInodeInfo *cinode, __u32 oplock, - unsigned int epoch, bool *purge_cache) + __u16 epoch, bool *purge_cache) { unsigned int old_state = cinode->oplock; - unsigned int old_epoch = cinode->epoch; + __u16 old_epoch = cinode->epoch; unsigned int new_state; if (epoch > old_epoch) { @@ -3939,7 +3939,7 @@ smb3_downgrade_oplock(struct TCP_Server_Info *server, static void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, - unsigned int epoch, bool *purge_cache) + __u16 epoch, bool *purge_cache) { oplock &= 0xFF; cinode->lease_granted = false; @@ -3963,7 +3963,7 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, static void smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, - unsigned int epoch, bool *purge_cache) + __u16 epoch, bool *purge_cache) { char message[5] = {0}; unsigned int new_oplock = 0; @@ -4000,7 +4000,7 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, static void smb3_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, - unsigned int epoch, bool *purge_cache) + __u16 epoch, bool *purge_cache) { unsigned int old_oplock = cinode->oplock; @@ -4114,7 +4114,7 @@ smb3_create_lease_buf(u8 *lease_key, u8 oplock) } static __u8 -smb2_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key) +smb2_parse_lease_buf(void *buf, __u16 *epoch, char *lease_key) { struct create_lease *lc = (struct create_lease *)buf; @@ -4125,7 +4125,7 @@ smb2_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key) } static __u8 -smb3_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key) +smb3_parse_lease_buf(void *buf, __u16 *epoch, char *lease_key) { struct create_lease_v2 *lc = (struct create_lease_v2 *)buf; diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 78395195e0165..ed7812247ebc0 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -2329,7 +2329,7 @@ parse_posix_ctxt(struct 
create_context *cc, struct smb2_file_all_info *info, int smb2_parse_contexts(struct TCP_Server_Info *server, struct kvec *rsp_iov, - unsigned int *epoch, + __u16 *epoch, char *lease_key, __u8 *oplock, struct smb2_file_all_info *buf, struct create_posix_rsp *posix) diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index 2336dfb23f363..4662c7e2d259c 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -283,7 +283,7 @@ extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *, enum securityEnum); int smb2_parse_contexts(struct TCP_Server_Info *server, struct kvec *rsp_iov, - unsigned int *epoch, + __u16 *epoch, char *lease_key, __u8 *oplock, struct smb2_file_all_info *buf, struct create_posix_rsp *posix); -- GitLab From a9c621a217128eb3fb7522cf763992d9437fd5ba Mon Sep 17 00:00:00 2001 From: "Justin M. Forbes" Date: Wed, 29 Jan 2025 14:50:02 -0700 Subject: [PATCH 297/989] rust: kbuild: add -fzero-init-padding-bits to bindgen_skip_cflags This seems to break the build when building with gcc15: Unable to generate bindings: ClangDiagnostic("error: unknown argument: '-fzero-init-padding-bits=all'\n") Thus skip that flag. Signed-off-by: Justin M. Forbes Fixes: dce4aab8441d ("kbuild: Use -fzero-init-padding-bits=all") Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20250129215003.1736127-1-jforbes@fedoraproject.org [ Slightly reworded commit. - Miguel ] Signed-off-by: Miguel Ojeda --- rust/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/rust/Makefile b/rust/Makefile index ff4343ca3f7c4..ff8a5e810d65e 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -240,6 +240,7 @@ bindgen_skip_c_flags := -mno-fp-ret-in-387 -mpreferred-stack-boundary=% \ -fzero-call-used-regs=% -fno-stack-clash-protection \ -fno-inline-functions-called-once -fsanitize=bounds-strict \ -fstrict-flex-arrays=% -fmin-function-alignment=% \ + -fzero-init-padding-bits=% \ --param=% --param asan-% # Derived from `scripts/Makefile.clang`. -- GitLab From 6f64b83d9fe9729000a0616830cb1606945465d8 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 5 Feb 2025 12:52:13 +0000 Subject: [PATCH 298/989] PCI/TPH: Restore TPH Requester Enable correctly When we reenable TPH after changing a Steering Tag value, we need the actual TPH Requester Enable value, not the ST Mode (which only happens to work out by chance for non-extended TPH in interrupt vector mode). Link: https://lore.kernel.org/r/13118098116d7bce07aa20b8c52e28c7d1847246.1738759933.git.robin.murphy@arm.com Fixes: d2e8a34876ce ("PCI/TPH: Add Steering Tag support") Signed-off-by: Robin Murphy Signed-off-by: Bjorn Helgaas Reviewed-by: Wei Huang --- drivers/pci/tph.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/tph.c b/drivers/pci/tph.c index 1e604fbbda657..07de59ca2ebfa 100644 --- a/drivers/pci/tph.c +++ b/drivers/pci/tph.c @@ -360,7 +360,7 @@ int pcie_tph_set_st_entry(struct pci_dev *pdev, unsigned int index, u16 tag) return err; } - set_ctrl_reg_req_en(pdev, pdev->tph_mode); + set_ctrl_reg_req_en(pdev, pdev->tph_req_type); pci_dbg(pdev, "set steering tag: %s table, index=%d, tag=%#04x\n", (loc == PCI_TPH_LOC_MSIX) ? "MSI-X" : "ST", index, tag); -- GitLab From 0e446e3145011b8fe39759b59bd69d39fb47cfeb Mon Sep 17 00:00:00 2001 From: Matthew Maurer Date: Wed, 22 Jan 2025 00:14:43 +0000 Subject: [PATCH 299/989] rust: kbuild: do not export generated KASAN ODR symbols ASAN generates special synthetic symbols to help check for ODR violations. 
These synthetic symbols lack debug information, so gendwarfksyms emits warnings when processing them. No code should ever have a dependency on these symbols, so we should not be exporting them, just like the __cfi symbols. Signed-off-by: Matthew Maurer Reviewed-by: Alice Ryhl Link: https://lore.kernel.org/r/20250122-gendwarfksyms-kasan-rust-v1-1-5ee5658f4fb6@google.com [ Fixed typo in commit message. Slightly reworded title. - Miguel ] Signed-off-by: Miguel Ojeda --- rust/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/Makefile b/rust/Makefile index ff8a5e810d65e..ea3849eb78f65 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -332,7 +332,7 @@ $(obj)/bindings/bindings_helpers_generated.rs: private bindgen_target_extra = ; $(obj)/bindings/bindings_helpers_generated.rs: $(src)/helpers/helpers.c FORCE $(call if_changed_dep,bindgen) -rust_exports = $(NM) -p --defined-only $(1) | awk '$$2~/(T|R|D|B)/ && $$3!~/__cfi/ { printf $(2),$$3 }' +rust_exports = $(NM) -p --defined-only $(1) | awk '$$2~/(T|R|D|B)/ && $$3!~/__cfi/ && $$3!~/__odr_asan/ { printf $(2),$$3 }' quiet_cmd_exports = EXPORTS $@ cmd_exports = \ -- GitLab From 3ace20038e19f23fe73259513f1f08d4bf1a3c83 Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Wed, 5 Feb 2025 11:25:20 +0000 Subject: [PATCH 300/989] cpufreq/amd-pstate: Fix cpufreq_policy ref counting amd_pstate_update_limits() takes a cpufreq_policy reference but doesn't decrement the refcount in one of the exit paths, fix that. Fixes: 45722e777fd9 ("cpufreq: amd-pstate: Optimize amd_pstate_update_limits()") Signed-off-by: Dhananjay Ugwekar Reviewed-by: Mario Limonciello Link: https://lore.kernel.org/r/20250205112523.201101-10-dhananjay.ugwekar@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 9dc3933bc3261..313550fa62d41 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -821,20 +821,21 @@ static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata) static void amd_pstate_update_limits(unsigned int cpu) { - struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct cpufreq_policy *policy = NULL; struct amd_cpudata *cpudata; u32 prev_high = 0, cur_high = 0; int ret; bool highest_perf_changed = false; + if (!amd_pstate_prefcore) + return; + + policy = cpufreq_cpu_get(cpu); if (!policy) return; cpudata = policy->driver_data; - if (!amd_pstate_prefcore) - return; - guard(mutex)(&amd_pstate_driver_lock); ret = amd_get_highest_perf(cpu, &cur_high); -- GitLab From bb5408801a5f2ecd76b61dcd539a5c466ebaac4c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 4 Feb 2025 09:45:13 -0800 Subject: [PATCH 301/989] stackinit: Keep selftest union size small on m68k The stack frame on m68k is very sensitive to the size of what needs to be stored. Like done for long string testing, reduce the size of the large trailing struct in the union initialization testing. 
Reported-by: Geert Uytterhoeven Closes: https://lore.kernel.org/all/CAMuHMdXW8VbtOAixO7w+aDOG70aZtZ50j1Ybcr8B3eYnRUcrcA@mail.gmail.com Fixes: e71a29db79da ("stackinit: Add union initialization to selftests") Link: https://lore.kernel.org/r/20250204174509.work.711-kees@kernel.org Signed-off-by: Kees Cook Tested-by: Geert Uytterhoeven --- lib/stackinit_kunit.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/stackinit_kunit.c b/lib/stackinit_kunit.c index fbe910c9c8253..967b345a98fd8 100644 --- a/lib/stackinit_kunit.c +++ b/lib/stackinit_kunit.c @@ -75,8 +75,10 @@ static bool stackinit_range_contains(char *haystack_start, size_t haystack_size, */ #ifdef CONFIG_M68K #define FILL_SIZE_STRING 8 +#define FILL_SIZE_ARRAY 2 #else #define FILL_SIZE_STRING 16 +#define FILL_SIZE_ARRAY 8 #endif #define INIT_CLONE_SCALAR /**/ @@ -345,7 +347,7 @@ union test_small_start { short three; unsigned long four; struct big_struct { - unsigned long array[8]; + unsigned long array[FILL_SIZE_ARRAY]; } big; }; -- GitLab From 78bba6097b9318f4aa645afeade14024af86af4e Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 3 Feb 2025 15:34:07 +0100 Subject: [PATCH 302/989] stackinit: Fix comment for test_small_end In union test_small_end, the small members are three and four. Fixes: e71a29db79da1946 ("stackinit: Add union initialization to selftests") Closes: https://lore.kernel.org/CAMuHMdWvcKOc6v5o3-9-SqP_4oh5-GZQjZZb=-krhY=mVRED_Q@mail.gmail.com Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/3f8faa2d7d0d6b36571093ab0fb1fd5157abd7bb.1738593178.git.geert+renesas@glider.be Signed-off-by: Kees Cook --- lib/stackinit_kunit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/stackinit_kunit.c b/lib/stackinit_kunit.c index 967b345a98fd8..135322592faf8 100644 --- a/lib/stackinit_kunit.c +++ b/lib/stackinit_kunit.c @@ -351,7 +351,7 @@ union test_small_start { } big; }; -/* Mismatched sizes, with one and two being small */ +/* Mismatched sizes, with three and four being small */ union test_small_end { short one; unsigned long two; -- GitLab From 0db6b7d49b50c037b5eac19a8d8d1da986db80c6 Mon Sep 17 00:00:00 2001 From: Kalle Valo Date: Mon, 3 Feb 2025 20:04:44 +0200 Subject: [PATCH 303/989] MAINTAINERS: wifi: ath: remove Kalle I'm stepping down as ath10k, ath11k and ath12k maintainer, so remove me from the MAINTAINERS file and the Device Tree bindings. Jeff continues as the maintainer. As my quicinc.com email will not work anymore, add an entry to the .mailmap file to direct the mail to my kernel.org address.
Signed-off-by: Kalle Valo Acked-by: Jeff Johnson Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20250203180445.1429640-1-kvalo@kernel.org --- .mailmap | 1 + .../devicetree/bindings/net/wireless/qcom,ath10k.yaml | 1 - .../devicetree/bindings/net/wireless/qcom,ath11k-pci.yaml | 1 - .../devicetree/bindings/net/wireless/qcom,ath11k.yaml | 1 - .../devicetree/bindings/net/wireless/qcom,ath12k-wsi.yaml | 1 - .../devicetree/bindings/net/wireless/qcom,ath12k.yaml | 1 - MAINTAINERS | 4 ---- 7 files changed, 1 insertion(+), 9 deletions(-) diff --git a/.mailmap b/.mailmap index 42e42cabb36d7..68d36bf46dcb4 100644 --- a/.mailmap +++ b/.mailmap @@ -370,6 +370,7 @@ Juha Yrjola Julien Thierry Iskren Chernev Kalle Valo +Kalle Valo Kalyan Thota Karthikeyan Periyasamy Kathiravan T diff --git a/Documentation/devicetree/bindings/net/wireless/qcom,ath10k.yaml b/Documentation/devicetree/bindings/net/wireless/qcom,ath10k.yaml index 070c4c9b86437..aace072e2d52a 100644 --- a/Documentation/devicetree/bindings/net/wireless/qcom,ath10k.yaml +++ b/Documentation/devicetree/bindings/net/wireless/qcom,ath10k.yaml @@ -7,7 +7,6 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Technologies ath10k wireless devices maintainers: - - Kalle Valo - Jeff Johnson description: diff --git a/Documentation/devicetree/bindings/net/wireless/qcom,ath11k-pci.yaml b/Documentation/devicetree/bindings/net/wireless/qcom,ath11k-pci.yaml index a71fdf05bc1ea..a4425cf196aba 100644 --- a/Documentation/devicetree/bindings/net/wireless/qcom,ath11k-pci.yaml +++ b/Documentation/devicetree/bindings/net/wireless/qcom,ath11k-pci.yaml @@ -8,7 +8,6 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Technologies ath11k wireless devices (PCIe) maintainers: - - Kalle Valo - Jeff Johnson description: | diff --git a/Documentation/devicetree/bindings/net/wireless/qcom,ath11k.yaml b/Documentation/devicetree/bindings/net/wireless/qcom,ath11k.yaml index ff5763dc66a88..a69ffb7b3cb88 100644 --- a/Documentation/devicetree/bindings/net/wireless/qcom,ath11k.yaml +++ b/Documentation/devicetree/bindings/net/wireless/qcom,ath11k.yaml @@ -8,7 +8,6 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm Technologies ath11k wireless devices maintainers: - - Kalle Valo - Jeff Johnson description: | diff --git a/Documentation/devicetree/bindings/net/wireless/qcom,ath12k-wsi.yaml b/Documentation/devicetree/bindings/net/wireless/qcom,ath12k-wsi.yaml index cbfb559f6b69b..318f305405e3b 100644 --- a/Documentation/devicetree/bindings/net/wireless/qcom,ath12k-wsi.yaml +++ b/Documentation/devicetree/bindings/net/wireless/qcom,ath12k-wsi.yaml @@ -9,7 +9,6 @@ title: Qualcomm Technologies ath12k wireless devices (PCIe) with WSI interface maintainers: - Jeff Johnson - - Kalle Valo description: | Qualcomm Technologies IEEE 802.11be PCIe devices with WSI interface. diff --git a/Documentation/devicetree/bindings/net/wireless/qcom,ath12k.yaml b/Documentation/devicetree/bindings/net/wireless/qcom,ath12k.yaml index 1b5884015b15b..9e557cb838c7a 100644 --- a/Documentation/devicetree/bindings/net/wireless/qcom,ath12k.yaml +++ b/Documentation/devicetree/bindings/net/wireless/qcom,ath12k.yaml @@ -9,7 +9,6 @@ title: Qualcomm Technologies ath12k wireless devices (PCIe) maintainers: - Jeff Johnson - - Kalle Valo description: Qualcomm Technologies IEEE 802.11be PCIe devices. 
diff --git a/MAINTAINERS b/MAINTAINERS index 5bcc78c0be70b..2048c75c3c428 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3631,7 +3631,6 @@ F: Documentation/devicetree/bindings/phy/phy-ath79-usb.txt F: drivers/phy/qualcomm/phy-ath79-usb.c ATHEROS ATH GENERIC UTILITIES -M: Kalle Valo M: Jeff Johnson L: linux-wireless@vger.kernel.org S: Supported @@ -19192,7 +19191,6 @@ Q: http://patchwork.linuxtv.org/project/linux-media/list/ F: drivers/media/tuners/qt1010* QUALCOMM ATH12K WIRELESS DRIVER -M: Kalle Valo M: Jeff Johnson L: ath12k@lists.infradead.org S: Supported @@ -19202,7 +19200,6 @@ F: drivers/net/wireless/ath/ath12k/ N: ath12k QUALCOMM ATHEROS ATH10K WIRELESS DRIVER -M: Kalle Valo M: Jeff Johnson L: ath10k@lists.infradead.org S: Supported @@ -19212,7 +19209,6 @@ F: drivers/net/wireless/ath/ath10k/ N: ath10k QUALCOMM ATHEROS ATH11K WIRELESS DRIVER -M: Kalle Valo M: Jeff Johnson L: ath11k@lists.infradead.org S: Supported -- GitLab From b76adb9758f8eaaf22b824d0bcdd694551ce0557 Mon Sep 17 00:00:00 2001 From: Kalle Valo Date: Mon, 3 Feb 2025 20:04:45 +0200 Subject: [PATCH 304/989] MAINTAINERS: wifi: remove Kalle I'm stepping down as wireless driver maintainer. Johannes kindly volunteered to be the "custodian"[1] for the drivers until a better solution is found. Link: https://lore.kernel.org/all/21896d2788b8bc6c7fcb534cd43e75671a57f494.camel@sipsolutions.net/ [1] Signed-off-by: Kalle Valo Link: https://patch.msgid.link/20250203180445.1429640-2-kvalo@kernel.org --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 2048c75c3c428..6401e59ec2cc0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16341,7 +16341,7 @@ X: drivers/net/can/ X: drivers/net/wireless/ NETWORKING DRIVERS (WIRELESS) -M: Kalle Valo +M: Johannes Berg L: linux-wireless@vger.kernel.org S: Maintained W: https://wireless.wiki.kernel.org/ -- GitLab From cf6cb56ef24410fb5308f9655087f1eddf4452e6 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Sun, 2 Feb 2025 08:29:20 -0800 Subject: [PATCH 305/989] seccomp: passthrough uretprobe systemcall without filtering When attaching uretprobes to processes running inside docker, the attached process is segfaulted when encountering the retprobe. The reason is that now that uretprobe is a system call, the default seccomp filters in docker block it as they only allow a specific set of known syscalls. This is true for other userspace applications which use seccomp to control their syscall surface. Since uretprobe is a "kernel implementation detail" system call which is not used by userspace application code directly, it is impractical and there's very little point in forcing all userspace applications to explicitly allow it in order to avoid crashing tracked processes. Pass this system call through seccomp without depending on configuration. Note: uretprobe is currently only x86_64 and isn't expected to ever be supported in i386.
Fixes: ff474a78cef5 ("uprobe: Add uretprobe syscall to speed up return probe") Reported-by: Rafael Buchbinder Closes: https://lore.kernel.org/lkml/CAHsH6Gs3Eh8DFU0wq58c_LF8A4_+o6z456J7BidmcVY2AqOnHQ@mail.gmail.com/ Link: https://lore.kernel.org/lkml/20250121182939.33d05470@gandalf.local.home/T/#me2676c378eff2d6a33f3054fed4a5f3afa64e65b Link: https://lore.kernel.org/lkml/20250128145806.1849977-1-eyal.birger@gmail.com/ Cc: stable@vger.kernel.org Signed-off-by: Eyal Birger Link: https://lore.kernel.org/r/20250202162921.335813-2-eyal.birger@gmail.com [kees: minimized changes for easier backporting, tweaked commit log] Signed-off-by: Kees Cook --- kernel/seccomp.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index f59381c4a2ffb..7bbb408431ebc 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -749,6 +749,15 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, if (WARN_ON_ONCE(!fprog)) return false; + /* Our single exception to filtering. */ +#ifdef __NR_uretprobe +#ifdef SECCOMP_ARCH_COMPAT + if (sd->arch == SECCOMP_ARCH_NATIVE) +#endif + if (sd->nr == __NR_uretprobe) + return true; +#endif + for (pc = 0; pc < fprog->len; pc++) { struct sock_filter *insn = &fprog->filter[pc]; u16 code = insn->code; @@ -1023,6 +1032,9 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, */ static const int mode1_syscalls[] = { __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, +#ifdef __NR_uretprobe + __NR_uretprobe, +#endif -1, /* negative terminated */ }; -- GitLab From c2debdb8544f415eaf9292a866d4073912eeb561 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Sun, 2 Feb 2025 08:29:21 -0800 Subject: [PATCH 306/989] selftests/seccomp: validate uretprobe syscall passes through seccomp The uretprobe syscall is implemented as a performance enhancement on x86_64 by having the kernel inject a call to it on function exit; user programs cannot call this system call explicitly. As such, this syscall is considered a kernel implementation detail and should not be filtered by seccomp. Enhance the seccomp bpf test suite to check that uretprobes can be attached to processes without killing the process, regardless of seccomp policy. Signed-off-by: Eyal Birger Link: https://lore.kernel.org/r/20250202162921.335813-3-eyal.birger@gmail.com [kees: Skip archs without __NR_uretprobe] Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/seccomp_bpf.c | 199 ++++++++++++++++++ 1 file changed, 199 insertions(+) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 8c3a73461475b..14ba51b52095a 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -68,6 +69,10 @@ # define PR_SET_PTRACER 0x59616d61 #endif +#ifndef noinline +#define noinline __attribute__((noinline)) +#endif + #ifndef PR_SET_NO_NEW_PRIVS #define PR_SET_NO_NEW_PRIVS 38 #define PR_GET_NO_NEW_PRIVS 39 @@ -4888,6 +4893,200 @@ TEST(tsync_vs_dead_thread_leader) EXPECT_EQ(0, status); } +noinline int probed(void) +{ + return 1; +} + +static int parse_uint_from_file(const char *file, const char *fmt) +{ + int err = -1, ret; + FILE *f; + + f = fopen(file, "re"); + if (f) { + err = fscanf(f, fmt, &ret); + fclose(f); + } + return err == 1 ?
ret : err; +} + +static int determine_uprobe_perf_type(void) +{ + const char *file = "/sys/bus/event_source/devices/uprobe/type"; + + return parse_uint_from_file(file, "%d\n"); +} + +static int determine_uprobe_retprobe_bit(void) +{ + const char *file = "/sys/bus/event_source/devices/uprobe/format/retprobe"; + + return parse_uint_from_file(file, "config:%d\n"); +} + +static ssize_t get_uprobe_offset(const void *addr) +{ + size_t start, base, end; + bool found = false; + char buf[256]; + FILE *f; + + f = fopen("/proc/self/maps", "r"); + if (!f) + return -1; + + while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &base) == 4) { + if (buf[2] == 'x' && (uintptr_t)addr >= start && (uintptr_t)addr < end) { + found = true; + break; + } + } + fclose(f); + return found ? (uintptr_t)addr - start + base : -1; +} + +FIXTURE(URETPROBE) { + int fd; +}; + +FIXTURE_VARIANT(URETPROBE) { + /* + * All of the URETPROBE behaviors can be tested with either + * uretprobe attached or not + */ + bool attach; +}; + +FIXTURE_VARIANT_ADD(URETPROBE, attached) { + .attach = true, +}; + +FIXTURE_VARIANT_ADD(URETPROBE, not_attached) { + .attach = false, +}; + +FIXTURE_SETUP(URETPROBE) +{ + const size_t attr_sz = sizeof(struct perf_event_attr); + struct perf_event_attr attr; + ssize_t offset; + int type, bit; + +#ifndef __NR_uretprobe + SKIP(return, "__NR_uretprobe syscall not defined"); +#endif + + if (!variant->attach) + return; + + memset(&attr, 0, attr_sz); + + type = determine_uprobe_perf_type(); + ASSERT_GE(type, 0); + bit = determine_uprobe_retprobe_bit(); + ASSERT_GE(bit, 0); + offset = get_uprobe_offset(probed); + ASSERT_GE(offset, 0); + + attr.config |= 1 << bit; + attr.size = attr_sz; + attr.type = type; + attr.config1 = ptr_to_u64("/proc/self/exe"); + attr.config2 = offset; + + self->fd = syscall(__NR_perf_event_open, &attr, + getpid() /* pid */, -1 /* cpu */, -1 /* group_fd */, + PERF_FLAG_FD_CLOEXEC); +} + +FIXTURE_TEARDOWN(URETPROBE) +{ + /* we could call close(self->fd), but we'd need extra filter for + * that and since we are calling _exit right away.. 
+ */ +} + +static int run_probed_with_filter(struct sock_fprog *prog) +{ + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) || + seccomp(SECCOMP_SET_MODE_FILTER, 0, prog)) { + return -1; + } + + probed(); + return 0; +} + +TEST_F(URETPROBE, uretprobe_default_allow) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + ASSERT_EQ(0, run_probed_with_filter(&prog)); +} + +TEST_F(URETPROBE, uretprobe_default_block) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit_group, 1, 0), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + ASSERT_EQ(0, run_probed_with_filter(&prog)); +} + +TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), +#ifdef __NR_uretprobe + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 0, 1), +#endif + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + ASSERT_EQ(0, run_probed_with_filter(&prog)); +} + +TEST_F(URETPROBE, uretprobe_default_block_with_uretprobe_syscall) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), +#ifdef __NR_uretprobe + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 2, 0), +#endif + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit_group, 1, 0), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + ASSERT_EQ(0, run_probed_with_filter(&prog)); +} + /* * TODO: * - expand NNP testing -- GitLab From 6273a058383e05465083b535ed9469f2c8a48321 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Mon, 3 Feb 2025 08:40:57 +0000 Subject: [PATCH 307/989] x86: rust: set rustc-abi=x86-softfloat on rustc>=1.86.0 When using Rust on the x86 architecture, we are currently using the unstable target.json feature to specify the compilation target. Rustc is going to change how softfloat is specified in the target.json file on x86, thus update generate_rust_target.rs to specify softfloat using the new option. Note that if you enable this parameter with a compiler that does not recognize it, then that triggers a warning but it does not break the build. [ For future reference, this solves the following error: RUSTC L rust/core.o error: Error loading target specification: target feature `soft-float` is incompatible with the ABI but gets enabled in target spec. Run `rustc --print target-list` for a list of built-in targets - Miguel ] Cc: # Needed in 6.12.y and 6.13.y only (Rust is pinned in older LTSs). Link: https://github.com/rust-lang/rust/pull/136146 Signed-off-by: Alice Ryhl Acked-by: Dave Hansen # for x86 Link: https://lore.kernel.org/r/20250203-rustc-1-86-x86-softfloat-v1-1-220a72a5003e@google.com [ Added 6.13.y too to Cc: stable tag and added reasoning to avoid over-backporting. 
- Miguel ] Signed-off-by: Miguel Ojeda --- scripts/generate_rust_target.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/scripts/generate_rust_target.rs b/scripts/generate_rust_target.rs index 0d00ac3723b5e..4fd6b6ab3e329 100644 --- a/scripts/generate_rust_target.rs +++ b/scripts/generate_rust_target.rs @@ -165,6 +165,18 @@ fn has(&self, option: &str) -> bool { let option = "CONFIG_".to_owned() + option; self.0.contains_key(&option) } + + /// Is the rustc version at least `major.minor.patch`? + fn rustc_version_atleast(&self, major: u32, minor: u32, patch: u32) -> bool { + let check_version = 100000 * major + 100 * minor + patch; + let actual_version = self + .0 + .get("CONFIG_RUSTC_VERSION") + .unwrap() + .parse::() + .unwrap(); + check_version <= actual_version + } } fn main() { @@ -182,6 +194,9 @@ fn main() { } } else if cfg.has("X86_64") { ts.push("arch", "x86_64"); + if cfg.rustc_version_atleast(1, 86, 0) { + ts.push("rustc-abi", "x86-softfloat"); + } ts.push( "data-layout", "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", @@ -215,6 +230,9 @@ fn main() { panic!("32-bit x86 only works under UML"); } ts.push("arch", "x86"); + if cfg.rustc_version_atleast(1, 86, 0) { + ts.push("rustc-abi", "x86-softfloat"); + } ts.push( "data-layout", "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-f64:32:64-f80:32-n8:16:32-S128", -- GitLab From 482ad2a4ace2740ca0ff1cbc8f3c7f862f3ab507 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 Feb 2025 15:51:09 +0000 Subject: [PATCH 308/989] net: add dev_net_rcu() helper dev->nd_net can change, readers should either use rcu_read_lock() or RTNL. We currently use a generic helper, dev_net() with no debugging support. We probably have many hidden bugs. Add dev_net_rcu() helper for callers using rcu_read_lock() protection. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250205155120.1676781-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 6 ++++++ include/net/net_namespace.h | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 03bb584c62cf8..c0a86afb85daa 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2663,6 +2663,12 @@ struct net *dev_net(const struct net_device *dev) return read_pnet(&dev->nd_net); } +static inline +struct net *dev_net_rcu(const struct net_device *dev) +{ + return read_pnet_rcu(&dev->nd_net); +} + static inline void dev_net_set(struct net_device *dev, struct net *net) { diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 0f5eb9db0c626..7ba1402ca7796 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -398,7 +398,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet) #endif } -static inline struct net *read_pnet_rcu(possible_net_t *pnet) +static inline struct net *read_pnet_rcu(const possible_net_t *pnet) { #ifdef CONFIG_NET_NS return rcu_dereference(pnet->net); -- GitLab From 469308552ca4560176cfc100e7ca84add1bebd7c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 Feb 2025 15:51:10 +0000 Subject: [PATCH 309/989] ipv4: add RCU protection to ip4_dst_hoplimit() ip4_dst_hoplimit() must use RCU protection to make sure the net structure it reads does not disappear. 
Fixes: fa50d974d104 ("ipv4: Namespaceify ip_default_ttl sysctl knob") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250205155120.1676781-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/route.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index f86775be3e293..c605fd5ec0c08 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -382,10 +382,15 @@ static inline int inet_iif(const struct sk_buff *skb) static inline int ip4_dst_hoplimit(const struct dst_entry *dst) { int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); - struct net *net = dev_net(dst->dev); - if (hoplimit == 0) + if (hoplimit == 0) { + const struct net *net; + + rcu_read_lock(); + net = dev_net_rcu(dst->dev); hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); + rcu_read_unlock(); + } return hoplimit; } -- GitLab From 071d8012869b6af352acca346ade13e7be90a49f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 Feb 2025 15:51:11 +0000 Subject: [PATCH 310/989] ipv4: use RCU protection in ip_dst_mtu_maybe_forward() ip_dst_mtu_maybe_forward() must use RCU protection to make sure the net structure it reads does not disappear. Fixes: f87c10a8aa1e8 ("ipv4: introduce ip_dst_mtu_maybe_forward and protect forwarding path against pmtu spoofing") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250205155120.1676781-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index 9f5e33e371fcd..ba7b43447775e 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -471,9 +471,12 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, bool forwarding) { const struct rtable *rt = dst_rtable(dst); - struct net *net = dev_net(dst->dev); - unsigned int mtu; + unsigned int mtu, res; + struct net *net; + + rcu_read_lock(); + net = dev_net_rcu(dst->dev); if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) || ip_mtu_locked(dst) || !forwarding) { @@ -497,7 +500,11 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, out: mtu = min_t(unsigned int, mtu, IP_MAX_MTU); - return mtu - lwtunnel_headroom(dst->lwtstate, mtu); + res = mtu - lwtunnel_headroom(dst->lwtstate, mtu); + + rcu_read_unlock(); + + return res; } static inline unsigned int ip_skb_dst_mtu(struct sock *sk, -- GitLab From 71b8471c93fa0bcab911fcb65da1eb6c4f5f735f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 Feb 2025 15:51:12 +0000 Subject: [PATCH 311/989] ipv4: use RCU protection in ipv4_default_advmss() ipv4_default_advmss() must use RCU protection to make sure the net structure it reads does not disappear. 
Fixes: 2e9589ff809e ("ipv4: Namespaceify min_adv_mss sysctl knob") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250205155120.1676781-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/route.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 577b88a43293a..74c074f45758b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1307,10 +1307,15 @@ static void set_class_tag(struct rtable *rt, u32 tag) static unsigned int ipv4_default_advmss(const struct dst_entry *dst) { - struct net *net = dev_net(dst->dev); unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr); - unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, - net->ipv4.ip_rt_min_advmss); + unsigned int advmss; + struct net *net; + + rcu_read_lock(); + net = dev_net_rcu(dst->dev); + advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, + net->ipv4.ip_rt_min_advmss); + rcu_read_unlock(); return min(advmss, IPV4_MAX_PMTU - header_size); } -- GitLab From dd205fcc33d92d54eee4d7f21bb073af9bd5ce2b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 Feb 2025 15:51:13 +0000 Subject: [PATCH 312/989] ipv4: use RCU protection in rt_is_expired() rt_is_expired() must use RCU protection to make sure the net structure it reads does not disappear. Fixes: e84f84f27647 ("netns: place rt_genid into struct net") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250205155120.1676781-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/route.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 74c074f45758b..e959327c0ba89 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -390,7 +390,13 @@ static inline int ip_rt_proc_init(void) static inline bool rt_is_expired(const struct rtable *rth) { - return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev)); + bool res; + + rcu_read_lock(); + res = rth->rt_genid != rt_genid_ipv4(dev_net_rcu(rth->dst.dev)); + rcu_read_unlock(); + + return res; } void rt_cache_flush(struct net *net) -- GitLab From 719817cd293e4fa389e1f69c396f3f816ed5aa41 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 Feb 2025 15:51:14 +0000 Subject: [PATCH 313/989] ipv4: use RCU protection in inet_select_addr() inet_select_addr() must use RCU protection to make sure the net structure it reads does not disappear. 
Fixes: c4544c724322 ("[NETNS]: Process inet_select_addr inside a namespace.") Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250205155120.1676781-7-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/devinet.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index c8b3cf5fba4c0..55b8151759bc9 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1371,10 +1371,11 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) __be32 addr = 0; unsigned char localnet_scope = RT_SCOPE_HOST; struct in_device *in_dev; - struct net *net = dev_net(dev); + struct net *net; int master_idx; rcu_read_lock(); + net = dev_net_rcu(dev); in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto no_in_dev; -- GitLab From 139512191bd06f1b496117c76372b2ce372c9a41 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 Feb 2025 15:51:15 +0000 Subject: [PATCH 314/989] ipv4: use RCU protection in __ip_rt_update_pmtu() __ip_rt_update_pmtu() must use RCU protection to make sure the net structure it reads does not disappear. Fixes: 2fbc6e89b2f1 ("ipv4: Update exception handling for multipath routes via same device") Fixes: 1de6b15a434c ("Namespaceify min_pmtu sysctl") Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250205155120.1676781-8-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/route.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e959327c0ba89..753704f75b2c6 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1008,9 +1008,9 @@ out: kfree_skb_reason(skb, reason); static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { struct dst_entry *dst = &rt->dst; - struct net *net = dev_net(dst->dev); struct fib_result res; bool lock = false; + struct net *net; u32 old_mtu; if (ip_mtu_locked(dst)) @@ -1020,6 +1020,8 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) if (old_mtu < mtu) return; + rcu_read_lock(); + net = dev_net_rcu(dst->dev); if (mtu < net->ipv4.ip_rt_min_pmtu) { lock = true; mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu); @@ -1027,9 +1029,8 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) if (rt->rt_pmtu == mtu && !lock && time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2)) - return; + goto out; - rcu_read_lock(); if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh_common *nhc; @@ -1043,14 +1044,14 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + net->ipv4.ip_rt_mtu_expires); } - rcu_read_unlock(); - return; + goto out; } #endif /* CONFIG_IP_ROUTE_MULTIPATH */ nhc = FIB_RES_NHC(res); update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + net->ipv4.ip_rt_mtu_expires); } +out: rcu_read_unlock(); } -- GitLab From 4b8474a0951e605d2a27a2c483da4eb4b8c63760 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 Feb 2025 15:51:16 +0000 Subject: [PATCH 315/989] ipv4: icmp: convert to dev_net_rcu() __icmp_send() must ensure rcu_read_lock() is held, as spotted by Jakub. Other ICMP uses of dev_net() seem safe, change them to dev_net_rcu() to get LOCKDEP support. 
Fixes: dde1bc0e6f86 ("[NETNS]: Add namespace for ICMP replying code.") Closes: https://lore.kernel.org/netdev/20250203153633.46ce0337@kernel.org/ Reported-by: Jakub Kicinski Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250205155120.1676781-9-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/icmp.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 094084b61bff8..5482edb5aade2 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -399,10 +399,10 @@ static void icmp_push_reply(struct sock *sk, static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) { - struct ipcm_cookie ipc; struct rtable *rt = skb_rtable(skb); - struct net *net = dev_net(rt->dst.dev); + struct net *net = dev_net_rcu(rt->dst.dev); bool apply_ratelimit = false; + struct ipcm_cookie ipc; struct flowi4 fl4; struct sock *sk; struct inet_sock *inet; @@ -608,12 +608,14 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, struct sock *sk; if (!rt) - goto out; + return; + + rcu_read_lock(); if (rt->dst.dev) - net = dev_net(rt->dst.dev); + net = dev_net_rcu(rt->dst.dev); else if (skb_in->dev) - net = dev_net(skb_in->dev); + net = dev_net_rcu(skb_in->dev); else goto out; @@ -785,7 +787,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, icmp_xmit_unlock(sk); out_bh_enable: local_bh_enable(); -out:; +out: + rcu_read_unlock(); } EXPORT_SYMBOL(__icmp_send); @@ -834,7 +837,7 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info) * avoid additional coding at protocol handlers. */ if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) { - __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS); + __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); return; } @@ -868,7 +871,7 @@ static enum skb_drop_reason icmp_unreach(struct sk_buff *skb) struct net *net; u32 info = 0; - net = dev_net(skb_dst(skb)->dev); + net = dev_net_rcu(skb_dst(skb)->dev); /* * Incomplete header ? @@ -979,7 +982,7 @@ static enum skb_drop_reason icmp_unreach(struct sk_buff *skb) static enum skb_drop_reason icmp_redirect(struct sk_buff *skb) { if (skb->len < sizeof(struct iphdr)) { - __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS); + __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); return SKB_DROP_REASON_PKT_TOO_SMALL; } @@ -1011,7 +1014,7 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb) struct icmp_bxm icmp_param; struct net *net; - net = dev_net(skb_dst(skb)->dev); + net = dev_net_rcu(skb_dst(skb)->dev); /* should there be an ICMP stat for ignored echos? 
*/ if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all)) return SKB_NOT_DROPPED_YET; @@ -1040,9 +1043,9 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb) bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr) { + struct net *net = dev_net_rcu(skb->dev); struct icmp_ext_hdr *ext_hdr, _ext_hdr; struct icmp_ext_echo_iio *iio, _iio; - struct net *net = dev_net(skb->dev); struct inet6_dev *in6_dev; struct in_device *in_dev; struct net_device *dev; @@ -1181,7 +1184,7 @@ static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb) return SKB_NOT_DROPPED_YET; out_err: - __ICMP_INC_STATS(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS); + __ICMP_INC_STATS(dev_net_rcu(skb_dst(skb)->dev), ICMP_MIB_INERRORS); return SKB_DROP_REASON_PKT_TOO_SMALL; } @@ -1198,7 +1201,7 @@ int icmp_rcv(struct sk_buff *skb) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct rtable *rt = skb_rtable(skb); - struct net *net = dev_net(rt->dst.dev); + struct net *net = dev_net_rcu(rt->dst.dev); struct icmphdr *icmph; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { @@ -1371,9 +1374,9 @@ int icmp_err(struct sk_buff *skb, u32 info) struct iphdr *iph = (struct iphdr *)skb->data; int offset = iph->ihl<<2; struct icmphdr *icmph = (struct icmphdr *)(skb->data + offset); + struct net *net = dev_net_rcu(skb->dev); int type = icmp_hdr(skb)->type; int code = icmp_hdr(skb)->code; - struct net *net = dev_net(skb->dev); /* * Use ping_err to handle all icmp errors except those -- GitLab From afec62cd0a4191cde6dd3a75382be4d51a38ce9b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 Feb 2025 15:51:17 +0000 Subject: [PATCH 316/989] flow_dissector: use RCU protection to fetch dev_net() __skb_flow_dissect() can be called from arbitrary contexts. It must extend its RCU protection section to include the call to dev_net(), which can become dev_net_rcu(). This makes sure the net structure can not disappear under us. 
Fixes: 9b52e3f267a6 ("flow_dissector: handle no-skb use case") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250205155120.1676781-10-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/flow_dissector.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 0e638a37aa096..5db41bf2ed93e 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1108,10 +1108,12 @@ bool __skb_flow_dissect(const struct net *net, FLOW_DISSECTOR_KEY_BASIC, target_container); + rcu_read_lock(); + if (skb) { if (!net) { if (skb->dev) - net = dev_net(skb->dev); + net = dev_net_rcu(skb->dev); else if (skb->sk) net = sock_net(skb->sk); } @@ -1122,7 +1124,6 @@ bool __skb_flow_dissect(const struct net *net, enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; struct bpf_prog_array *run_array; - rcu_read_lock(); run_array = rcu_dereference(init_net.bpf.run_array[type]); if (!run_array) run_array = rcu_dereference(net->bpf.run_array[type]); @@ -1150,17 +1151,17 @@ bool __skb_flow_dissect(const struct net *net, prog = READ_ONCE(run_array->items[0].prog); result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, hlen, flags); - if (result == BPF_FLOW_DISSECTOR_CONTINUE) - goto dissect_continue; - __skb_flow_bpf_to_target(&flow_keys, flow_dissector, - target_container); - rcu_read_unlock(); - return result == BPF_OK; + if (result != BPF_FLOW_DISSECTOR_CONTINUE) { + __skb_flow_bpf_to_target(&flow_keys, flow_dissector, + target_container); + rcu_read_unlock(); + return result == BPF_OK; + } } -dissect_continue: - rcu_read_unlock(); } + rcu_read_unlock(); + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); -- GitLab From 3c8ffcd248da34fc41e52a46e51505900115fc2a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 Feb 2025 15:51:18 +0000 Subject: [PATCH 317/989] ipv6: use RCU protection in ip6_default_advmss() ip6_default_advmss() needs rcu protection to make sure the net structure it reads does not disappear. Fixes: 5578689a4e3c ("[NETNS][IPV6] route6 - make route6 per namespace") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250205155120.1676781-11-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 78362822b9070..ef2d23a1e3d53 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3196,13 +3196,18 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) { struct net_device *dev = dst->dev; unsigned int mtu = dst_mtu(dst); - struct net *net = dev_net(dev); + struct net *net; mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); + rcu_read_lock(); + + net = dev_net_rcu(dev); if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) mtu = net->ipv6.sysctl.ip6_rt_min_advmss; + rcu_read_unlock(); + /* * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. -- GitLab From 34aef2b0ce3aa4eb4ef2e1f5cad3738d527032f5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 Feb 2025 15:51:19 +0000 Subject: [PATCH 318/989] ipv6: icmp: convert to dev_net_rcu() icmp6_send() must acquire rcu_read_lock() sooner to ensure the dev_net() call done from a safe context. Other ICMPv6 uses of dev_net() seem safe, change them to dev_net_rcu() to get LOCKDEP support to catch bugs. 
Fixes: 9a43b709a230 ("[NETNS][IPV6] icmp6 - make icmpv6_socket per namespace") Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250205155120.1676781-12-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/icmp.c | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index a6984a29fdb9d..4d14ab7f7e99f 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -76,7 +76,7 @@ static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, { /* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */ struct icmp6hdr *icmp6 = (struct icmp6hdr *) (skb->data + offset); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); if (type == ICMPV6_PKT_TOOBIG) ip6_update_pmtu(skb, net, info, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); @@ -473,7 +473,10 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if (!skb->dev) return; - net = dev_net(skb->dev); + + rcu_read_lock(); + + net = dev_net_rcu(skb->dev); mark = IP6_REPLY_MARK(net, skb->mark); /* * Make sure we respect the rules @@ -496,7 +499,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, !(type == ICMPV6_PARAMPROB && code == ICMPV6_UNK_OPTION && (opt_unrec(skb, info)))) - return; + goto out; saddr = NULL; } @@ -526,7 +529,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { net_dbg_ratelimited("icmp6_send: addr_any/mcast source [%pI6c > %pI6c]\n", &hdr->saddr, &hdr->daddr); - return; + goto out; } /* @@ -535,7 +538,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if (is_ineligible(skb)) { net_dbg_ratelimited("icmp6_send: no reply to icmp error [%pI6c > %pI6c]\n", &hdr->saddr, &hdr->daddr); - return; + goto out; } /* Needed by both icmpv6_global_allow and icmpv6_xmit_lock */ @@ -582,7 +585,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, np = inet6_sk(sk); if (!icmpv6_xrlim_allow(sk, type, &fl6, apply_ratelimit)) - goto out; + goto out_unlock; tmp_hdr.icmp6_type = type; tmp_hdr.icmp6_code = code; @@ -600,7 +603,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, dst = icmpv6_route_lookup(net, skb, sk, &fl6); if (IS_ERR(dst)) - goto out; + goto out_unlock; ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); @@ -616,7 +619,6 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, goto out_dst_release; } - rcu_read_lock(); idev = __in6_dev_get(skb->dev); if (ip6_append_data(sk, icmpv6_getfrag, &msg, @@ -630,13 +632,15 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, len + sizeof(struct icmp6hdr)); } - rcu_read_unlock(); + out_dst_release: dst_release(dst); -out: +out_unlock: icmpv6_xmit_unlock(sk); out_bh_enable: local_bh_enable(); +out: + rcu_read_unlock(); } EXPORT_SYMBOL(icmp6_send); @@ -679,8 +683,8 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, skb_pull(skb2, nhs); skb_reset_network_header(skb2); - rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, - skb, 0); + rt = rt6_lookup(dev_net_rcu(skb->dev), &ipv6_hdr(skb2)->saddr, + NULL, 0, skb, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; @@ -717,7 +721,7 @@ EXPORT_SYMBOL(ip6_err_gen_icmpv6_unreach); static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); + struct net *net = 
dev_net_rcu(skb->dev); struct sock *sk; struct inet6_dev *idev; struct ipv6_pinfo *np; @@ -832,7 +836,7 @@ enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) { struct inet6_skb_parm *opt = IP6CB(skb); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); const struct inet6_protocol *ipprot; enum skb_drop_reason reason; int inner_offset; @@ -889,7 +893,7 @@ enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type, static int icmpv6_rcv(struct sk_buff *skb) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); struct net_device *dev = icmp6_dev(skb); struct inet6_dev *idev = __in6_dev_get(dev); const struct in6_addr *saddr, *daddr; @@ -921,7 +925,7 @@ static int icmpv6_rcv(struct sk_buff *skb) skb_set_network_header(skb, nh); } - __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INMSGS); + __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INMSGS); saddr = &ipv6_hdr(skb)->saddr; daddr = &ipv6_hdr(skb)->daddr; @@ -939,7 +943,7 @@ static int icmpv6_rcv(struct sk_buff *skb) type = hdr->icmp6_type; - ICMP6MSGIN_INC_STATS(dev_net(dev), idev, type); + ICMP6MSGIN_INC_STATS(dev_net_rcu(dev), idev, type); switch (type) { case ICMPV6_ECHO_REQUEST: @@ -1034,9 +1038,9 @@ static int icmpv6_rcv(struct sk_buff *skb) csum_error: reason = SKB_DROP_REASON_ICMP_CSUM; - __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_CSUMERRORS); + __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_CSUMERRORS); discard_it: - __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INERRORS); + __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INERRORS); drop_no_count: kfree_skb_reason(skb, reason); return 0; -- GitLab From b768294d449da6d7dc0667c1ec92dc4af6ef766b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 Feb 2025 15:51:20 +0000 Subject: [PATCH 319/989] ipv6: Use RCU in ip6_input() Instead of grabbing rcu_read_lock() from ip6_input_finish(), do it earlier in its caller, so that the ip6_input() access to dev_net() can be validated by LOCKDEP.
Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250205155120.1676781-13-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_input.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 70c0e16c0ae68..39da6a7ce5f12 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -477,9 +477,7 @@ void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr, static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_clear_delivery_time(skb); - rcu_read_lock(); ip6_protocol_deliver_rcu(net, skb, 0, false); - rcu_read_unlock(); return 0; } @@ -487,9 +485,15 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk int ip6_input(struct sk_buff *skb) { - return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, - dev_net(skb->dev), NULL, skb, skb->dev, NULL, - ip6_input_finish); + int res; + + rcu_read_lock(); + res = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, + dev_net_rcu(skb->dev), NULL, skb, skb->dev, NULL, + ip6_input_finish); + rcu_read_unlock(); + + return res; } EXPORT_SYMBOL_GPL(ip6_input); -- GitLab From 6a774228e890ee04a0ee13f4e6e731ec8554b9c2 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Wed, 5 Feb 2025 12:03:01 +0100 Subject: [PATCH 320/989] net: ethtool: tsconfig: Fix netlink type of hwtstamp flags Fix the netlink type for hardware timestamp flags, which are represented as a bitset of flags. Although only one flag is supported currently, the correct netlink bitset type should be used instead of u32 to keep consistency with other fields. Address this by adding a new named string set description for the hwtstamp flag structure. The code has been introduced in the current release so the uAPI change is still okay. 
Signed-off-by: Kory Maincent Fixes: 6e9e2eed4f39 ("net: ethtool: Add support for tsconfig command to get/set hwtstamp config") Link: https://patch.msgid.link/20250205110304.375086-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 3 ++- include/uapi/linux/ethtool.h | 2 ++ net/ethtool/common.c | 5 ++++ net/ethtool/common.h | 2 ++ net/ethtool/strset.c | 5 ++++ net/ethtool/tsconfig.c | 33 +++++++++++++++++------- 6 files changed, 39 insertions(+), 11 deletions(-) diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 259cb211a3382..655d8d10fe248 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -1524,7 +1524,8 @@ attribute-sets: nested-attributes: bitset - name: hwtstamp-flags - type: u32 + type: nest + nested-attributes: bitset operations: enum-model: directional diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index d1089b88efc7d..9b18c4cfe56f8 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -682,6 +682,7 @@ enum ethtool_link_ext_substate_module { * @ETH_SS_STATS_ETH_CTRL: names of IEEE 802.3 MAC Control statistics * @ETH_SS_STATS_RMON: names of RMON statistics * @ETH_SS_STATS_PHY: names of PHY(dev) statistics + * @ETH_SS_TS_FLAGS: hardware timestamping flags * * @ETH_SS_COUNT: number of defined string sets */ @@ -708,6 +709,7 @@ enum ethtool_stringset { ETH_SS_STATS_ETH_CTRL, ETH_SS_STATS_RMON, ETH_SS_STATS_PHY, + ETH_SS_TS_FLAGS, /* add new constants above here */ ETH_SS_COUNT diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 2bd77c94f9f1a..d88e9080643b8 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -462,6 +462,11 @@ const char ts_rx_filter_names[][ETH_GSTRING_LEN] = { }; static_assert(ARRAY_SIZE(ts_rx_filter_names) == __HWTSTAMP_FILTER_CNT); +const char ts_flags_names[][ETH_GSTRING_LEN] = { + [const_ilog2(HWTSTAMP_FLAG_BONDED_PHC_INDEX)] = "bonded-phc-index", +}; +static_assert(ARRAY_SIZE(ts_flags_names) == __HWTSTAMP_FLAG_CNT); + const char udp_tunnel_type_names[][ETH_GSTRING_LEN] = { [ETHTOOL_UDP_TUNNEL_TYPE_VXLAN] = "vxlan", [ETHTOOL_UDP_TUNNEL_TYPE_GENEVE] = "geneve", diff --git a/net/ethtool/common.h b/net/ethtool/common.h index 850eadde4bfcc..58e9e7db06f90 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -13,6 +13,7 @@ ETHTOOL_LINK_MODE_ ## speed ## base ## type ## _ ## duplex ## _BIT #define __SOF_TIMESTAMPING_CNT (const_ilog2(SOF_TIMESTAMPING_LAST) + 1) +#define __HWTSTAMP_FLAG_CNT (const_ilog2(HWTSTAMP_FLAG_LAST) + 1) struct link_mode_info { int speed; @@ -38,6 +39,7 @@ extern const char wol_mode_names[][ETH_GSTRING_LEN]; extern const char sof_timestamping_names[][ETH_GSTRING_LEN]; extern const char ts_tx_type_names[][ETH_GSTRING_LEN]; extern const char ts_rx_filter_names[][ETH_GSTRING_LEN]; +extern const char ts_flags_names[][ETH_GSTRING_LEN]; extern const char udp_tunnel_type_names[][ETH_GSTRING_LEN]; int __ethtool_get_link(struct net_device *dev); diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index 818cf01f09110..6b76c05caba4d 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -75,6 +75,11 @@ static const struct strset_info info_template[] = { .count = __HWTSTAMP_FILTER_CNT, .strings = ts_rx_filter_names, }, + [ETH_SS_TS_FLAGS] = { + .per_dev = false, + .count = __HWTSTAMP_FLAG_CNT, + .strings = ts_flags_names, + }, [ETH_SS_UDP_TUNNEL_TYPES] = { .per_dev = false, .count = 
__ETHTOOL_UDP_TUNNEL_TYPE_CNT, diff --git a/net/ethtool/tsconfig.c b/net/ethtool/tsconfig.c index 9188e088fb2f9..2be356bdfe873 100644 --- a/net/ethtool/tsconfig.c +++ b/net/ethtool/tsconfig.c @@ -54,7 +54,7 @@ static int tsconfig_prepare_data(const struct ethnl_req_info *req_base, data->hwtst_config.tx_type = BIT(cfg.tx_type); data->hwtst_config.rx_filter = BIT(cfg.rx_filter); - data->hwtst_config.flags = BIT(cfg.flags); + data->hwtst_config.flags = cfg.flags; data->hwprov_desc.index = -1; hwprov = rtnl_dereference(dev->hwprov); @@ -91,10 +91,16 @@ static int tsconfig_reply_size(const struct ethnl_req_info *req_base, BUILD_BUG_ON(__HWTSTAMP_TX_CNT > 32); BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT > 32); + BUILD_BUG_ON(__HWTSTAMP_FLAG_CNT > 32); - if (data->hwtst_config.flags) - /* _TSCONFIG_HWTSTAMP_FLAGS */ - len += nla_total_size(sizeof(u32)); + if (data->hwtst_config.flags) { + ret = ethnl_bitset32_size(&data->hwtst_config.flags, + NULL, __HWTSTAMP_FLAG_CNT, + ts_flags_names, compact); + if (ret < 0) + return ret; + len += ret; /* _TSCONFIG_HWTSTAMP_FLAGS */ + } if (data->hwtst_config.tx_type) { ret = ethnl_bitset32_size(&data->hwtst_config.tx_type, @@ -130,8 +136,10 @@ static int tsconfig_fill_reply(struct sk_buff *skb, int ret; if (data->hwtst_config.flags) { - ret = nla_put_u32(skb, ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS, - data->hwtst_config.flags); + ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS, + &data->hwtst_config.flags, NULL, + __HWTSTAMP_FLAG_CNT, + ts_flags_names, compact); if (ret < 0) return ret; } @@ -180,7 +188,7 @@ const struct nla_policy ethnl_tsconfig_set_policy[ETHTOOL_A_TSCONFIG_MAX + 1] = [ETHTOOL_A_TSCONFIG_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER] = NLA_POLICY_NESTED(ethnl_ts_hwtst_prov_policy), - [ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS] = { .type = NLA_U32 }, + [ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS] = { .type = NLA_NESTED }, [ETHTOOL_A_TSCONFIG_RX_FILTERS] = { .type = NLA_NESTED }, [ETHTOOL_A_TSCONFIG_TX_TYPES] = { .type = NLA_NESTED }, }; @@ -296,6 +304,7 @@ static int ethnl_set_tsconfig(struct ethnl_req_info *req_base, BUILD_BUG_ON(__HWTSTAMP_TX_CNT >= 32); BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT >= 32); + BUILD_BUG_ON(__HWTSTAMP_FLAG_CNT > 32); if (!netif_device_present(dev)) return -ENODEV; @@ -377,9 +386,13 @@ static int ethnl_set_tsconfig(struct ethnl_req_info *req_base, } if (tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS]) { - ethnl_update_u32(&hwtst_config.flags, - tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS], - &config_mod); + ret = ethnl_update_bitset32(&hwtst_config.flags, + __HWTSTAMP_FLAG_CNT, + tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS], + ts_flags_names, info->extack, + &config_mod); + if (ret < 0) + goto err_free_hwprov; } ret = net_hwtstamp_validate(&hwtst_config); -- GitLab From bca0902e61731a75fc4860c8720168d9f1bae3b6 Mon Sep 17 00:00:00 2001 From: Murad Masimov Date: Mon, 3 Feb 2025 12:12:03 +0300 Subject: [PATCH 321/989] ax25: Fix refcount leak caused by setting SO_BINDTODEVICE sockopt If an AX25 device is bound to a socket by setting the SO_BINDTODEVICE socket option, a refcount leak will occur in ax25_release(). Commit 9fd75b66b8f6 ("ax25: Fix refcount leaks caused by ax25_cb_del()") added decrement of device refcounts in ax25_release(). In order for that to work correctly the refcounts must already be incremented when the device is bound to the socket. An AX25 device can be bound to a socket by either calling ax25_bind() or setting SO_BINDTODEVICE socket option. 
In both cases the refcounts should be incremented, but in fact it is done only in ax25_bind(). This bug leads to the following issue reported by Syzkaller: ================================================================ refcount_t: decrement hit 0; leaking memory. WARNING: CPU: 1 PID: 5932 at lib/refcount.c:31 refcount_warn_saturate+0x1ed/0x210 lib/refcount.c:31 Modules linked in: CPU: 1 UID: 0 PID: 5932 Comm: syz-executor424 Not tainted 6.13.0-rc4-syzkaller-00110-g4099a71718b0 #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 RIP: 0010:refcount_warn_saturate+0x1ed/0x210 lib/refcount.c:31 Call Trace: __refcount_dec include/linux/refcount.h:336 [inline] refcount_dec include/linux/refcount.h:351 [inline] ref_tracker_free+0x710/0x820 lib/ref_tracker.c:236 netdev_tracker_free include/linux/netdevice.h:4156 [inline] netdev_put include/linux/netdevice.h:4173 [inline] netdev_put include/linux/netdevice.h:4169 [inline] ax25_release+0x33f/0xa10 net/ax25/af_ax25.c:1069 __sock_release+0xb0/0x270 net/socket.c:640 sock_close+0x1c/0x30 net/socket.c:1408 ... do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f ... ================================================================ Fix the implementation of ax25_setsockopt() by adding increment of refcounts for the new device bound, and decrement of refcounts for the old unbound device. Fixes: 9fd75b66b8f6 ("ax25: Fix refcount leaks caused by ax25_cb_del()") Reported-by: syzbot+33841dc6aa3e1d86b78a@syzkaller.appspotmail.com Signed-off-by: Murad Masimov Link: https://patch.msgid.link/20250203091203.1744-1-m.masimov@mt-integration.ru Signed-off-by: Jakub Kicinski --- net/ax25/af_ax25.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index aa6c714892ec9..9f3b8b682adb2 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -685,6 +685,15 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, break; } + if (ax25->ax25_dev) { + if (dev == ax25->ax25_dev->dev) { + rcu_read_unlock(); + break; + } + netdev_put(ax25->ax25_dev->dev, &ax25->dev_tracker); + ax25_dev_put(ax25->ax25_dev); + } + ax25->ax25_dev = ax25_dev_ax25dev(dev); if (!ax25->ax25_dev) { rcu_read_unlock(); @@ -692,6 +701,8 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, break; } ax25_fillin_cb(ax25, ax25->ax25_dev); + netdev_hold(dev, &ax25->dev_tracker, GFP_ATOMIC); + ax25_dev_hold(ax25->ax25_dev); rcu_read_unlock(); break; -- GitLab From 1438f5d07b9a7afb15e1d0e26df04a6fd4e56a3c Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 5 Feb 2025 23:10:37 +0100 Subject: [PATCH 322/989] rtnetlink: fix netns leak with rtnl_setlink() A call to rtnl_nets_destroy() is needed to release references taken on netns put in rtnl_nets. 
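The intended pairing, sketched with only the helpers that appear in the one-line fix below (the surrounding rtnl_setlink() code is abridged and assumed, not quoted):

	/* netns references are accumulated in rtnl_nets while parsing ... */
	rtnl_nets_unlock(&rtnl_nets);
	rtnl_nets_destroy(&rtnl_nets);	/* drop the netns references (the fix) */
errout:
	return err;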
CC: stable@vger.kernel.org Fixes: 636af13f213b ("rtnetlink: Register rtnl_dellink() and rtnl_setlink() with RTNL_FLAG_DOIT_PERNET_WIP.") Signed-off-by: Nicolas Dichtel Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250205221037.2474426-1-nicolas.dichtel@6wind.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1f4d4b5570ab8..d1e559fce918d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3432,6 +3432,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, err = -ENODEV; rtnl_nets_unlock(&rtnl_nets); + rtnl_nets_destroy(&rtnl_nets); errout: return err; } -- GitLab From cb7380de9e4cbc9a24216b722ec50e092ae83036 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 5 Feb 2025 12:32:49 -0800 Subject: [PATCH 323/989] compiler.h: Move C string helpers into C-only kernel section The C kernel helpers for evaluating C Strings were positioned where they were visible to assembly inclusion, which was not intended. Move them into the kernel and C-only area of the header so future changes won't confuse the assembler. Fixes: d7a516c6eeae ("compiler.h: Fix undefined BUILD_BUG_ON_ZERO()") Fixes: 559048d156ff ("string: Check for "nonstring" attribute on strscpy() arguments") Reviewed-by: Miguel Ojeda Signed-off-by: Kees Cook --- include/linux/compiler.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 240c632c5b957..7af999a131cb2 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -214,6 +214,19 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, __v; \ }) +#ifdef __CHECKER__ +#define __BUILD_BUG_ON_ZERO_MSG(e, msg) (0) +#else /* __CHECKER__ */ +#define __BUILD_BUG_ON_ZERO_MSG(e, msg) ((int)sizeof(struct {_Static_assert(!(e), msg);})) +#endif /* __CHECKER__ */ + +/* &a[0] degrades to a pointer: a different type from an array */ +#define __must_be_array(a) __BUILD_BUG_ON_ZERO_MSG(__same_type((a), &(a)[0]), "must be array") + +/* Require C Strings (i.e. NUL-terminated) lack the "nonstring" attribute. */ +#define __must_be_cstr(p) \ + __BUILD_BUG_ON_ZERO_MSG(__annotated(p, nonstring), "must be cstr (NUL-terminated)") + #endif /* __KERNEL__ */ /** @@ -254,19 +267,6 @@ static inline void *offset_to_ptr(const int *off) #define __ADDRESSABLE_ASM_STR(sym) __stringify(__ADDRESSABLE_ASM(sym)) -#ifdef __CHECKER__ -#define __BUILD_BUG_ON_ZERO_MSG(e, msg) (0) -#else /* __CHECKER__ */ -#define __BUILD_BUG_ON_ZERO_MSG(e, msg) ((int)sizeof(struct {_Static_assert(!(e), msg);})) -#endif /* __CHECKER__ */ - -/* &a[0] degrades to a pointer: a different type from an array */ -#define __must_be_array(a) __BUILD_BUG_ON_ZERO_MSG(__same_type((a), &(a)[0]), "must be array") - -/* Require C Strings (i.e. NUL-terminated) lack the "nonstring" attribute. */ -#define __must_be_cstr(p) \ - __BUILD_BUG_ON_ZERO_MSG(__annotated(p, nonstring), "must be cstr (NUL-terminated)") - /* * This returns a constant expression while determining if an argument is * a constant expression, most importantly without evaluating the argument. 
-- GitLab From 20e5cc26e56db09cc612721f90b4994cce5e5b7b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 5 Feb 2025 12:48:07 -0800 Subject: [PATCH 324/989] compiler.h: Introduce __must_be_byte_array() In preparation for adding stricter type checking to the str/mem*() helpers, provide a way to check that a variable is a byte array via __must_be_byte_array(). Suggested-by: Kent Overstreet Signed-off-by: Kees Cook --- include/linux/compiler.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 7af999a131cb2..1c0688319435d 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -221,7 +221,13 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #endif /* __CHECKER__ */ /* &a[0] degrades to a pointer: a different type from an array */ -#define __must_be_array(a) __BUILD_BUG_ON_ZERO_MSG(__same_type((a), &(a)[0]), "must be array") +#define __is_array(a) (!__same_type((a), &(a)[0])) +#define __must_be_array(a) __BUILD_BUG_ON_ZERO_MSG(!__is_array(a), \ + "must be array") + +#define __is_byte_array(a) (__is_array(a) && sizeof((a)[0]) == 1) +#define __must_be_byte_array(a) __BUILD_BUG_ON_ZERO_MSG(!__is_byte_array(a), \ + "must be byte array") /* Require C Strings (i.e. NUL-terminated) lack the "nonstring" attribute. */ #define __must_be_cstr(p) \ -- GitLab From 6270f4deba3fbd77d1717fb8634f1fc612ff69e2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 5 Feb 2025 13:45:26 -0800 Subject: [PATCH 325/989] string.h: Use ARRAY_SIZE() for memtostr*()/strtomem*() The destination argument of memtostr*() and strtomem*() must be a fixed-size char array at compile time, so there is no need to use __builtin_object_size() (which is useful for when an argument is either a pointer or unknown). Instead use ARRAY_SIZE(), which has the benefit of working around a bug in Clang (fixed[1] in 15+) that got __builtin_object_size() wrong sometimes. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501310832.kiAeOt2z-lkp@intel.com/ Suggested-by: Kent Overstreet Link: https://github.com/llvm/llvm-project/commit/d8e0a6d5e9dd2311641f9a8a5d2bf90829951ddc [1] Tested-by: Suren Baghdasaryan Signed-off-by: Kees Cook --- include/linux/string.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/include/linux/string.h b/include/linux/string.h index 493ac4862c777..fc5ae145bd78f 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -411,7 +411,8 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, * must be discoverable by the compiler. */ #define strtomem_pad(dest, src, pad) do { \ - const size_t _dest_len = __builtin_object_size(dest, 1); \ + const size_t _dest_len = __must_be_byte_array(dest) + \ + ARRAY_SIZE(dest); \ const size_t _src_len = __builtin_object_size(src, 1); \ \ BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \ @@ -434,7 +435,8 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, * must be discoverable by the compiler. */ #define strtomem(dest, src) do { \ - const size_t _dest_len = __builtin_object_size(dest, 1); \ + const size_t _dest_len = __must_be_byte_array(dest) + \ + ARRAY_SIZE(dest); \ const size_t _src_len = __builtin_object_size(src, 1); \ \ BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \ @@ -453,7 +455,8 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, * Note that sizes of @dest and @src must be known at compile-time. 
*/ #define memtostr(dest, src) do { \ - const size_t _dest_len = __builtin_object_size(dest, 1); \ + const size_t _dest_len = __must_be_byte_array(dest) + \ + ARRAY_SIZE(dest); \ const size_t _src_len = __builtin_object_size(src, 1); \ const size_t _src_chars = strnlen(src, _src_len); \ const size_t _copy_len = min(_dest_len - 1, _src_chars); \ @@ -478,7 +481,8 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, * Note that sizes of @dest and @src must be known at compile-time. */ #define memtostr_pad(dest, src) do { \ - const size_t _dest_len = __builtin_object_size(dest, 1); \ + const size_t _dest_len = __must_be_byte_array(dest) + \ + ARRAY_SIZE(dest); \ const size_t _src_len = __builtin_object_size(src, 1); \ const size_t _src_chars = strnlen(src, _src_len); \ const size_t _copy_len = min(_dest_len - 1, _src_chars); \ -- GitLab From fdfd0ad82890f678398ee670c4e59747738540e7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 1 Feb 2025 12:56:51 -0500 Subject: [PATCH 326/989] bcachefs docs: SubmittingPatches.rst Add an (initial?) patch submission checklist, focusing mainly on testing. Yes, all patches must be tested, and that starts (but does not end) with the patch author. Signed-off-by: Kent Overstreet --- .../bcachefs/SubmittingPatches.rst | 98 +++++++++++++++++++ Documentation/filesystems/bcachefs/index.rst | 1 + MAINTAINERS | 1 + 3 files changed, 100 insertions(+) create mode 100644 Documentation/filesystems/bcachefs/SubmittingPatches.rst diff --git a/Documentation/filesystems/bcachefs/SubmittingPatches.rst b/Documentation/filesystems/bcachefs/SubmittingPatches.rst new file mode 100644 index 0000000000000..026b12ae0d6a2 --- /dev/null +++ b/Documentation/filesystems/bcachefs/SubmittingPatches.rst @@ -0,0 +1,98 @@ +Submitting patches to bcachefs: +=============================== + +Patches must be tested before being submitted, either with the xfstests suite +[0], or the full bcachefs test suite in ktest [1], depending on what's being +touched. Note that ktest wraps xfstests and will be an easier method of running +it for most users; it includes single-command wrappers for all the mainstream +in-kernel local filesystems. + +Patches will undergo more testing after being merged (including +lockdep/kasan/preempt/etc. variants); these are not generally required to be +run by the submitter - but do put some thought into what you're changing and +which tests might be relevant, e.g. are you dealing with tricky memory layout +work? kasan, are you doing locking work? then lockdep; and ktest includes +single-command variants for the debug build types you'll most likely need. + +The exception to this rule is incomplete WIP/RFC patches: if you're working on +something nontrivial, it's encouraged to send out a WIP patch to let people +know what you're doing and make sure you're on the right track. Just make sure +it includes a brief note as to what's done and what's incomplete, to avoid +confusion. + +Rigorous checkpatch.pl adherence is not required (many of its warnings are +considered out of date), but try not to deviate too much without reason. + +Focus on writing code that reads well and is organized well; code should be +aesthetically pleasing.
+ + CI: + === + + Instead of running your tests locally, when running the full test suite it's +preferable to let a server farm do it in parallel, and then have the results +in a nice test dashboard (which can tell you which failures are new, and +presents results in a git log view, avoiding the need for most bisecting). + +That exists [2], and community members may request an account. If you work for +a big tech company, you'll need to help out with server costs to get access - +but the CI is not restricted to running bcachefs tests: it runs any ktest test +(which generally makes it easy to wrap other tests that can run in qemu). + +Other things to think about: +============================ + +- How will we debug this code? Is there sufficient introspection to diagnose + when something starts acting wonky on a user machine? + + We don't necessarily need every single field of every data structure visible + with introspection, but having the important fields of all the core data + types wired up makes debugging drastically easier - a bit of thoughtful + foresight greatly reduces the need to have people build custom kernels with + debug patches. + + More broadly, think about all the debug tooling that might be needed. + +- Does it make the codebase more or less of a mess? Can we also try to do some + organizing, too? + +- Do new tests need to be written? New assertions? How do we know and verify + that the code is correct, and what happens if something goes wrong? + + We don't yet have automated code coverage analysis or easy fault injection - + but for now, pretend we did and ask what they might tell us. + + Assertions are hugely important, given that we don't yet have a systems + language that can do ergonomic embedded correctness proofs. Hitting an assert + in testing is much better than wandering off into undefined behaviour la-la + land - use them. Use them judiciously, and not as a replacement for proper + error handling, but use them. + +- Does it need to be performance tested? Should we add new performance counters? + + bcachefs has a set of persistent runtime counters which can be viewed with + the 'bcachefs fs top' command; this should give users a basic idea of what + their filesystem is currently doing. If you're doing a new feature or looking + at old code, think if anything should be added. + +- If it's a new on disk format feature - have upgrades and downgrades been + tested? (Automated tests exist but aren't in the CI, due to the hassle of + disk image management; coordinate to have them run.) + +Mailing list, IRC: +================== + +Patches should hit the list [3], but much discussion and code review happens on +IRC as well [4]; many people appreciate the more conversational approach and +quicker feedback. + +Additionally, we have a lively user community doing excellent QA work, which +exists primarily on IRC. Please make use of that resource; user feedback is +important for any nontrivial feature, and documenting it in commit messages +would be a good idea.
+ +[0]: git://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git +[1]: https://evilpiepirate.org/git/ktest.git/ +[2]: https://evilpiepirate.org/~testdashboard/ci/ +[3]: linux-bcachefs@vger.kernel.org +[4]: irc.oftc.net#bcache, #bcachefs-dev diff --git a/Documentation/filesystems/bcachefs/index.rst b/Documentation/filesystems/bcachefs/index.rst index 95fc4b90739ed..7db4d7ceab582 100644 --- a/Documentation/filesystems/bcachefs/index.rst +++ b/Documentation/filesystems/bcachefs/index.rst @@ -9,4 +9,5 @@ bcachefs Documentation :numbered: CodingStyle + SubmittingPatches errorcodes diff --git a/MAINTAINERS b/MAINTAINERS index 896a307fa0654..c40d3d0c68c70 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3955,6 +3955,7 @@ M: Kent Overstreet L: linux-bcachefs@vger.kernel.org S: Supported C: irc://irc.oftc.net/bcache +P: Documentation/filesystems/bcachefs/SubmittingPatches.rst T: git https://evilpiepirate.org/git/bcachefs.git F: fs/bcachefs/ F: Documentation/filesystems/bcachefs/ -- GitLab From 6b37037d6d1b42083642340efcf80f7a30203039 Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Sat, 1 Feb 2025 01:20:31 +0900 Subject: [PATCH 327/989] bcachefs: fix incorrect pointer check in __bch2_subvolume_delete() For some unknown reason, checks on struct bkey_s_c_snapshot and struct bkey_s_c_snapshot_tree pointers are missing. Therefore, I think it would be appropriate to fix the incorrect pointer checking through this patch. Fixes: 4bd06f07bcb5 ("bcachefs: Fixes for snapshot_tree.master_subvol") Signed-off-by: Jeongjun Park Signed-off-by: Kent Overstreet --- fs/bcachefs/subvolume.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index e3d0475232e53..b7b96283c3161 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -428,7 +428,7 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots, POS(0, snapid), 0, snapshot); - ret = bkey_err(subvol); + ret = bkey_err(snapshot); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, "missing snapshot %u", snapid); if (ret) @@ -440,6 +440,11 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) bch2_bkey_get_iter_typed(trans, &snapshot_tree_iter, BTREE_ID_snapshot_trees, POS(0, treeid), 0, snapshot_tree); + ret = bkey_err(snapshot_tree); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, + "missing snapshot tree %u", treeid); + if (ret) + goto err; if (le32_to_cpu(snapshot_tree.v->master_subvol) == subvolid) { struct bkey_i_snapshot_tree *snapshot_tree_mut = -- GitLab From 2ef995df0ce592f665d312008dbe1ad1c4bcf87f Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Sun, 2 Feb 2025 15:13:51 +0900 Subject: [PATCH 328/989] bcachefs: fix deadlock in journal_entry_open() In the previous commit b3d82c2f2761, code was added to prevent journal sequence overflow. As part of that change, the code added to journal_entry_open() uses the bch2_fs_fatal_err_on() function to handle errors. However, __journal_res_get() calls journal_entry_open() while holding journal->lock, but bch2_fs_fatal_err_on() internally tries to acquire journal->lock, which results in a deadlock. So we need to add a locked helper to handle fatal errors even when journal->lock is held.
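The shape of the fix is the common *_locked-helper idiom; a generic sketch (not the exact bcachefs code) looks like:

	void foo_halt_locked(struct foo *f)
	{
		lockdep_assert_held(&f->lock);	/* caller already holds the lock */
		/* ... do the work without taking f->lock again ... */
	}

	void foo_halt(struct foo *f)
	{
		spin_lock(&f->lock);
		foo_halt_locked(f);
		spin_unlock(&f->lock);
	}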
Fixes: b3d82c2f2761 ("bcachefs: Guard against journal seq overflow") Signed-off-by: Jeongjun Park Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 17 +++++++++++++++-- fs/bcachefs/journal.h | 1 + fs/bcachefs/super.c | 11 +++++++++++ fs/bcachefs/super.h | 1 + 4 files changed, 28 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index cb2c3722f6741..0a943a27ef449 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -319,6 +319,16 @@ void bch2_journal_halt(struct journal *j) spin_unlock(&j->lock); } +void bch2_journal_halt_locked(struct journal *j) +{ + lockdep_assert_held(&j->lock); + + __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true); + if (!j->err_seq) + j->err_seq = journal_cur_seq(j); + journal_wake(j); +} + static bool journal_entry_want_write(struct journal *j) { bool ret = !journal_entry_is_open(j) || @@ -381,9 +391,12 @@ static int journal_entry_open(struct journal *j) if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) return JOURNAL_ERR_max_in_flight; - if (bch2_fs_fatal_err_on(journal_cur_seq(j) >= JOURNAL_SEQ_MAX, - c, "cannot start: journal seq overflow")) + if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) { + bch_err(c, "cannot start: journal seq overflow"); + if (bch2_fs_emergency_read_only_locked(c)) + bch_err(c, "fatal error - emergency read only"); return JOURNAL_ERR_insufficient_devices; /* -EROFS */ + } BUG_ON(!j->cur_entry_sectors); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index dccddd5420adf..107f7f901cd96 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -409,6 +409,7 @@ bool bch2_journal_noflush_seq(struct journal *, u64, u64); int bch2_journal_meta(struct journal *); void bch2_journal_halt(struct journal *); +void bch2_journal_halt_locked(struct journal *); static inline int bch2_journal_error(struct journal *j) { diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index d97ea7bd1171b..6d97d412fed98 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -411,6 +411,17 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c) return ret; } +bool bch2_fs_emergency_read_only_locked(struct bch_fs *c) +{ + bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); + + bch2_journal_halt_locked(&c->journal); + bch2_fs_read_only_async(c); + + wake_up(&bch2_read_only_wait); + return ret; +} + static int bch2_fs_read_write_late(struct bch_fs *c) { int ret; diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index fa6d522165108..04f8287eff5c3 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -29,6 +29,7 @@ int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); bool bch2_fs_emergency_read_only(struct bch_fs *); +bool bch2_fs_emergency_read_only_locked(struct bch_fs *); void bch2_fs_read_only(struct bch_fs *); int bch2_fs_read_write(struct bch_fs *); -- GitLab From 9e9033522ad1e4bb697c9493aa449630fa2c98d2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 Jan 2025 01:21:44 -0500 Subject: [PATCH 329/989] bcachefs: Fix discard path journal flushing The discard path is supposed to issue journal flushes when there are too many empty buckets that need a journal commit before they can be written to again, but at some point this code seems to have been lost.
Bring it back with a new optimization to make sure we don't issue too many journal flushes: the journal now tracks the sequence number of the most recent flush in progress, which the discard path uses when deciding which buckets need a journal flush. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 47 ++++++++++++----------- fs/bcachefs/alloc_foreground.c | 10 +++-- fs/bcachefs/alloc_types.h | 1 + fs/bcachefs/buckets_waiting_for_journal.c | 12 +++--- fs/bcachefs/buckets_waiting_for_journal.h | 4 +- fs/bcachefs/journal.c | 1 + fs/bcachefs/journal_types.h | 1 + fs/bcachefs/trace.h | 14 ++++++- 8 files changed, 55 insertions(+), 35 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index fc2ef33b67b38..3ea809990ef1a 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1803,7 +1803,6 @@ struct discard_buckets_state { u64 open; u64 need_journal_commit; u64 discarded; - u64 need_journal_commit_this_dev; }; static int bch2_discard_one_bucket(struct btree_trans *trans, @@ -1827,11 +1826,11 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; } - if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, - pos.inode, pos.offset)) { - s->need_journal_commit++; - s->need_journal_commit_this_dev++; + u64 seq_ready = bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal, + pos.inode, pos.offset); + if (seq_ready > c->journal.flushed_seq_ondisk) { + if (seq_ready > c->journal.flushing_seq) + s->need_journal_commit++; goto out; } @@ -1865,23 +1864,24 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, discard_locked = true; } - if (!bkey_eq(*discard_pos_done, iter.pos) && - ca->mi.discard && !c->opts.nochanges) { - /* - * This works without any other locks because this is the only - * thread that removes items from the need_discard tree - */ - bch2_trans_unlock_long(trans); - blkdev_issue_discard(ca->disk_sb.bdev, - k.k->p.offset * ca->mi.bucket_size, - ca->mi.bucket_size, - GFP_KERNEL); - *discard_pos_done = iter.pos; + if (!bkey_eq(*discard_pos_done, iter.pos)) { s->discarded++; + *discard_pos_done = iter.pos; - ret = bch2_trans_relock_notrace(trans); - if (ret) - goto out; + if (ca->mi.discard && !c->opts.nochanges) { + /* + * This works without any other locks because this is the only + * thread that removes items from the need_discard tree + */ + bch2_trans_unlock_long(trans); + blkdev_issue_discard(ca->disk_sb.bdev, + k.k->p.offset * ca->mi.bucket_size, + ca->mi.bucket_size, + GFP_KERNEL); + ret = bch2_trans_relock_notrace(trans); + if (ret) + goto out; + } } SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); @@ -1929,6 +1929,9 @@ static void bch2_do_discards_work(struct work_struct *work) POS(ca->dev_idx, U64_MAX), 0, k, bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false))); + if (s.need_journal_commit > dev_buckets_available(ca, BCH_WATERMARK_normal)) + bch2_journal_flush_async(&c->journal, NULL); + trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); @@ -2024,7 +2027,7 @@ static void bch2_do_discards_fast_work(struct work_struct *work) break; } - trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); + trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); bch2_trans_put(trans); percpu_ref_put(&ca->io_ref); diff --git a/fs/bcachefs/alloc_foreground.c 
b/fs/bcachefs/alloc_foreground.c index 6df41c331a52e..5a781fb4c794b 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -205,8 +205,12 @@ static inline bool may_alloc_bucket(struct bch_fs *c, return false; } - if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, bucket.inode, bucket.offset)) { + u64 journal_seq_ready = + bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal, + bucket.inode, bucket.offset); + if (journal_seq_ready > c->journal.flushed_seq_ondisk) { + if (journal_seq_ready > c->journal.flushing_seq) + s->need_journal_commit++; s->skipped_need_journal_commit++; return false; } @@ -570,7 +574,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl) : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl); - if (s.skipped_need_journal_commit * 2 > avail) + if (s.need_journal_commit * 2 > avail) bch2_journal_flush_async(&c->journal, NULL); if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) { diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 9bbb28e90b934..4aa8ee026cb84 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -18,6 +18,7 @@ struct bucket_alloc_state { u64 buckets_seen; u64 skipped_open; u64 skipped_need_journal_commit; + u64 need_journal_commit; u64 skipped_nocow; u64 skipped_nouse; u64 skipped_mi_btree_bitmap; diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c index f9fb150eda706..c8a488e6b7b86 100644 --- a/fs/bcachefs/buckets_waiting_for_journal.c +++ b/fs/bcachefs/buckets_waiting_for_journal.c @@ -22,23 +22,21 @@ static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_ memset(t->d, 0, sizeof(t->d[0]) << t->bits); } -bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, - u64 flushed_seq, - unsigned dev, u64 bucket) +u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *b, + unsigned dev, u64 bucket) { struct buckets_waiting_for_journal_table *t; u64 dev_bucket = (u64) dev << 56 | bucket; - bool ret = false; - unsigned i; + u64 ret = 0; mutex_lock(&b->lock); t = b->t; - for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { struct bucket_hashed *h = bucket_hash(t, i, dev_bucket); if (h->dev_bucket == dev_bucket) { - ret = h->journal_seq > flushed_seq; + ret = h->journal_seq; break; } } diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h index d2ae19cbe18c4..365619ca44c87 100644 --- a/fs/bcachefs/buckets_waiting_for_journal.h +++ b/fs/bcachefs/buckets_waiting_for_journal.h @@ -4,8 +4,8 @@ #include "buckets_waiting_for_journal_types.h" -bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, - u64, unsigned, u64); +u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *, + unsigned, u64); int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, u64, unsigned, u64, u64); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 0a943a27ef449..24c294d4634e0 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -796,6 +796,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, } buf->must_flush = true; + j->flushing_seq = max(j->flushing_seq, seq); if (parent && !closure_wait(&buf->wait, parent)) BUG(); diff --git a/fs/bcachefs/journal_types.h 
b/fs/bcachefs/journal_types.h index 3ba433a48eb8a..a198a81d74784 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -237,6 +237,7 @@ struct journal { /* seq, last_seq from the most recent journal entry successfully written */ u64 seq_ondisk; u64 flushed_seq_ondisk; + u64 flushing_seq; u64 last_seq_ondisk; u64 err_seq; u64 last_empty_seq; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 56a5a7fbc0fd1..c1b51009edf6b 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -727,7 +727,7 @@ DEFINE_EVENT(fs_str, bucket_alloc_fail, TP_ARGS(c, str) ); -TRACE_EVENT(discard_buckets, +DECLARE_EVENT_CLASS(discard_buckets_class, TP_PROTO(struct bch_fs *c, u64 seen, u64 open, u64 need_journal_commit, u64 discarded, const char *err), TP_ARGS(c, seen, open, need_journal_commit, discarded, err), @@ -759,6 +759,18 @@ TRACE_EVENT(discard_buckets, __entry->err) ); +DEFINE_EVENT(discard_buckets_class, discard_buckets, + TP_PROTO(struct bch_fs *c, u64 seen, u64 open, + u64 need_journal_commit, u64 discarded, const char *err), + TP_ARGS(c, seen, open, need_journal_commit, discarded, err) +); + +DEFINE_EVENT(discard_buckets_class, discard_buckets_fast, + TP_PROTO(struct bch_fs *c, u64 seen, u64 open, + u64 need_journal_commit, u64 discarded, const char *err), + TP_ARGS(c, seen, open, need_journal_commit, discarded, err) +); + TRACE_EVENT(bucket_invalidate, TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors), TP_ARGS(c, dev, bucket, sectors), -- GitLab From 3539880ef1a5f8c970d0f69a6fdcfeffc000e63d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 3 Feb 2025 11:35:11 -0500 Subject: [PATCH 330/989] bcachefs: Fix rcu imbalance in bch2_fs_btree_key_cache_exit() Spotted by sparse. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index c378b97ebeca7..1821f40c161a1 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -748,7 +748,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) rcu_read_unlock(); mutex_lock(&bc->table.mutex); mutex_unlock(&bc->table.mutex); - rcu_read_lock(); continue; } for (i = 0; i < tbl->size; i++) -- GitLab From 4be214c26936813b636eed2fac906f585ddbf0f9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Jan 2025 21:29:45 -0500 Subject: [PATCH 331/989] bcachefs: bch2_bkey_sectors_need_rebalance() now only depends on bch_extent_rebalance Previously, bch2_bkey_sectors_need_rebalance() called bch2_target_accepts_data(), checking whether the target is writable. However, this means that adding or removing devices from a target would change the value of bch2_bkey_sectors_need_rebalance() for an existing extent; this needs to be invariant so that the extent trigger can correctly maintain rebalance_work accounting. Instead, check target_accepts_data() in io_opts_to_rebalance_opts(), before creating the bch_extent_rebalance entry. This fixes (one?) cause of rebalance_work accounting being off. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.h | 4 +++- fs/bcachefs/opts.h | 14 -------------- fs/bcachefs/rebalance.c | 8 +++----- fs/bcachefs/rebalance.h | 20 ++++++++++++++++++++ 4 files changed, 26 insertions(+), 20 deletions(-) diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index d2e134528f0e6..428b9be6af34b 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -285,12 +285,14 @@ void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, struct bch_inode_unpacked *); int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *); +#include "rebalance.h" + static inline struct bch_extent_rebalance bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode) { struct bch_io_opts io_opts; bch2_inode_opts_get(&io_opts, c, inode); - return io_opts_to_rebalance_opts(&io_opts); + return io_opts_to_rebalance_opts(c, &io_opts); } int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index a182b5d454ba6..9d397fc2a1f05 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -659,18 +659,4 @@ static inline void bch2_io_opts_fixups(struct bch_io_opts *opts) struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); bool bch2_opt_is_inode_opt(enum bch_opt_id); -/* rebalance opts: */ - -static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_io_opts *opts) -{ - return (struct bch_extent_rebalance) { - .type = BIT(BCH_EXTENT_ENTRY_rebalance), -#define x(_name) \ - ._name = opts->_name, \ - ._name##_from_inode = opts->_name##_from_inode, - BCH_REBALANCE_OPTS() -#undef x - }; -}; - #endif /* _BCACHEFS_OPTS_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 4adc74cd3f70b..d0a1f5cd5c2b3 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -121,12 +121,10 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) } } incompressible: - if (opts->background_target && - bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) { + if (opts->background_target) bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) sectors += p.crc.compressed_size; - } return sectors; } @@ -140,7 +138,7 @@ static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opt const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) { - struct bch_extent_rebalance new = io_opts_to_rebalance_opts(opts); + struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, opts); return old == NULL || memcmp(old, &new, sizeof(new)); } else { return old != NULL; @@ -163,7 +161,7 @@ int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, k.k->u64s += sizeof(*old) / sizeof(u64); } - *old = io_opts_to_rebalance_opts(opts); + *old = io_opts_to_rebalance_opts(c, opts); } else { if (old) extent_entry_drop(k, (union bch_extent_entry *) old); diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 0a0821ab895d8..62a3859d3823f 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -4,8 +4,28 @@ #include "compress.h" #include "disk_groups.h" +#include "opts.h" #include "rebalance_types.h" +static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c, + struct bch_io_opts *opts) +{ + struct bch_extent_rebalance r = { + .type = BIT(BCH_EXTENT_ENTRY_rebalance), +#define x(_name) \ + 
._name = opts->_name, \ + ._name##_from_inode = opts->_name##_from_inode, + BCH_REBALANCE_OPTS() +#undef x + }; + + if (r.background_target && + !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target)) + r.background_target = 0; + + return r; +}; + u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *); int bch2_get_update_rebalance_opts(struct btree_trans *, -- GitLab From 29a61a1f40637ae010b828745fb41f60301c3a3d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 5 Feb 2025 15:22:56 +0100 Subject: [PATCH 332/989] genirq: Remove leading space from irq_chip::irq_print_chip() callbacks The space separator was factored out from the multiple chip name prints, but several irq_chip::irq_print_chip() callbacks still print a leading space. Remove the superfluous double spaces. Fixes: 9d9f204bdf7243bf ("genirq/proc: Add missing space separator back") Signed-off-by: Geert Uytterhoeven Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/893f7e9646d8933cd6786d5a1ef3eb076d263768.1738764803.git.geert+renesas@glider.be --- arch/powerpc/sysdev/fsl_msi.c | 2 +- drivers/bus/moxtet.c | 2 +- drivers/irqchip/irq-partition-percpu.c | 2 +- drivers/soc/qcom/smp2p.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c index 1aa0cb097c9c9..7b9a5ea9cad9d 100644 --- a/arch/powerpc/sysdev/fsl_msi.c +++ b/arch/powerpc/sysdev/fsl_msi.c @@ -75,7 +75,7 @@ static void fsl_msi_print_chip(struct irq_data *irqd, struct seq_file *p) srs = (hwirq >> msi_data->srs_shift) & MSI_SRS_MASK; cascade_virq = msi_data->cascade_array[srs]->virq; - seq_printf(p, " fsl-msi-%d", cascade_virq); + seq_printf(p, "fsl-msi-%d", cascade_virq); } diff --git a/drivers/bus/moxtet.c b/drivers/bus/moxtet.c index 6276551d79680..1e57ebfb76229 100644 --- a/drivers/bus/moxtet.c +++ b/drivers/bus/moxtet.c @@ -657,7 +657,7 @@ static void moxtet_irq_print_chip(struct irq_data *d, struct seq_file *p) id = moxtet->modules[pos->idx]; - seq_printf(p, " moxtet-%s.%i#%i", mox_module_name(id), pos->idx, + seq_printf(p, "moxtet-%s.%i#%i", mox_module_name(id), pos->idx, pos->bit); } diff --git a/drivers/irqchip/irq-partition-percpu.c b/drivers/irqchip/irq-partition-percpu.c index 8e76d2913e6be..4441ffe149ea0 100644 --- a/drivers/irqchip/irq-partition-percpu.c +++ b/drivers/irqchip/irq-partition-percpu.c @@ -98,7 +98,7 @@ static void partition_irq_print_chip(struct irq_data *d, struct seq_file *p) struct irq_chip *chip = irq_desc_get_chip(part->chained_desc); struct irq_data *data = irq_desc_get_irq_data(part->chained_desc); - seq_printf(p, " %5s-%lu", chip->name, data->hwirq); + seq_printf(p, "%5s-%lu", chip->name, data->hwirq); } static struct irq_chip partition_irq_chip = { diff --git a/drivers/soc/qcom/smp2p.c b/drivers/soc/qcom/smp2p.c index 4783ab1adb8d9..a3e88ced328a9 100644 --- a/drivers/soc/qcom/smp2p.c +++ b/drivers/soc/qcom/smp2p.c @@ -365,7 +365,7 @@ static void smp2p_irq_print_chip(struct irq_data *irqd, struct seq_file *p) { struct smp2p_entry *entry = irq_data_get_irq_chip_data(irqd); - seq_printf(p, " %8s", dev_name(entry->smp2p->dev)); + seq_printf(p, "%8s", dev_name(entry->smp2p->dev)); } static struct irq_chip smp2p_irq_chip = { -- GitLab From 868c9037df626b3c245ee26a290a03ae1f9f58d3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 5 Feb 2025 17:02:20 +0100 Subject: [PATCH 333/989] timers/migration: Fix off-by-one root 
mis-connection Before attaching a new root to the old root, the children counter of the new root is checked to verify that only the upcoming CPU's top group have been connected to it. However since the recently added commit b729cc1ec21a ("timers/migration: Fix another race between hotplug and idle entry/exit") this check is not valid anymore because the old root is pre-accounted as a child to the new root. Therefore after connecting the upcoming CPU's top group to the new root, the children count to be expected must be 2 and not 1 anymore. This omission results in the old root to not be connected to the new root. Then eventually the system may run with more than one top level, which defeats the purpose of a single idle migrator. Also the old root is pre-accounted but not connected upon the new root creation. But it can be connected to the new root later on. Therefore the old root may be accounted twice to the new root. The propagation of such overcommit can end up creating a double final top-level root with a groupmask incorrectly initialized. Although harmless given that the final top level roots will never have a parent to walk up to, this oddity opportunistically reported the core issue: WARNING: CPU: 8 PID: 0 at kernel/time/timer_migration.c:543 tmigr_requires_handle_remote CPU: 8 UID: 0 PID: 0 Comm: swapper/8 RIP: 0010:tmigr_requires_handle_remote Call Trace: ? tmigr_requires_handle_remote ? hrtimer_run_queues update_process_times tick_periodic tick_handle_periodic __sysvec_apic_timer_interrupt sysvec_apic_timer_interrupt Fix the problem by taking the old root into account in the children count of the new root so the connection is not omitted. Also warn when more than one top level group exists to better detect similar issues in the future. Fixes: b729cc1ec21a ("timers/migration: Fix another race between hotplug and idle entry/exit") Reported-by: Matt Fleming Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20250205160220.39467-1-frederic@kernel.org --- kernel/time/timer_migration.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 9cb9b6584ea18..2f6330831f084 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -1675,6 +1675,9 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) } while (i < tmigr_hierarchy_levels); + /* Assert single root */ + WARN_ON_ONCE(!err && !group->parent && !list_is_singular(&tmigr_level_list[top])); + while (i > 0) { group = stack[--i]; @@ -1716,7 +1719,12 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) WARN_ON_ONCE(top == 0); lvllist = &tmigr_level_list[top]; - if (group->num_children == 1 && list_is_singular(lvllist)) { + + /* + * Newly created root level should have accounted the upcoming + * CPU's child group and pre-accounted the old root. + */ + if (group->num_children == 2 && list_is_singular(lvllist)) { /* * The target CPU must never do the prepare work, except * on early boot when the boot CPU is the target. Otherwise -- GitLab From 0fac3ed473dd2955053be6671cdd747807f5e488 Mon Sep 17 00:00:00 2001 From: Su Hui Date: Sun, 19 Jan 2025 10:59:47 +0800 Subject: [PATCH 334/989] fs/stat.c: avoid harmless garbage value problem in vfs_statx_path() Clang static checker(scan-build) warning: fs/stat.c:287:21: warning: The left expression of the compound assignment is an uninitialized value. 
The computed value will also be garbage. 287 | stat->result_mask |= STATX_MNT_ID_UNIQUE; | ~~~~~~~~~~~~~~~~~ ^ fs/stat.c:290:21: warning: The left expression of the compound assignment is an uninitialized value. The computed value will also be garbage. 290 | stat->result_mask |= STATX_MNT_ID; When vfs_getattr() failed because of security_inode_getattr(), 'stat' is uninitialized. In this case, there is a harmless garbage problem in vfs_statx_path(). It's better to return error directly when vfs_getattr() failed, avoiding garbage value and more clearly. Signed-off-by: Su Hui Link: https://lore.kernel.org/r/20250119025946.1168957-1-suhui@nfschina.com Signed-off-by: Christian Brauner --- fs/stat.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/stat.c b/fs/stat.c index 2c0e111a098a1..f13308bfdc983 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -281,6 +281,8 @@ static int vfs_statx_path(struct path *path, int flags, struct kstat *stat, u32 request_mask) { int error = vfs_getattr(path, stat, request_mask, flags); + if (error) + return error; if (request_mask & STATX_MNT_ID_UNIQUE) { stat->mnt_id = real_mount(path->mnt)->mnt_id_unique; @@ -302,7 +304,7 @@ static int vfs_statx_path(struct path *path, int flags, struct kstat *stat, if (S_ISBLK(stat->mode)) bdev_statx(path, stat, request_mask); - return error; + return 0; } static int vfs_statx_fd(int fd, int flags, struct kstat *stat, -- GitLab From 4e7487245abcbc5a1a1aea54e4d3b33c53804bda Mon Sep 17 00:00:00 2001 From: Brahmajit Das Date: Tue, 21 Jan 2025 21:56:48 +0530 Subject: [PATCH 335/989] vboxsf: fix building with GCC 15 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building with GCC 15 results in build error fs/vboxsf/super.c:24:54: error: initializer-string for array of ‘unsigned char’ is too long [-Werror=unterminated-string-initialization] 24 | static const unsigned char VBSF_MOUNT_SIGNATURE[4] = "\000\377\376\375"; | ^~~~~~~~~~~~~~~~~~ cc1: all warnings being treated as errors Due to GCC having enabled -Werror=unterminated-string-initialization[0] by default. Separately initializing each array element of VBSF_MOUNT_SIGNATURE to ensure NUL termination, thus satisfying GCC 15 and fixing the build error. [0]: https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html#index-Wno-unterminated-string-initialization Signed-off-by: Brahmajit Das Link: https://lore.kernel.org/r/20250121162648.1408743-1-brahmajit.xyz@gmail.com Reviewed-by: Hans de Goede Signed-off-by: Christian Brauner --- fs/vboxsf/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index e95b8a48d8a02..1d94bb7841081 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -21,7 +21,8 @@ #define VBOXSF_SUPER_MAGIC 0x786f4256 /* 'VBox' little endian */ -static const unsigned char VBSF_MOUNT_SIGNATURE[4] = "\000\377\376\375"; +static const unsigned char VBSF_MOUNT_SIGNATURE[4] = { '\000', '\377', '\376', + '\375' }; static int follow_symlinks; module_param(follow_symlinks, int, 0444); -- GitLab From e52e97f09fb66fd868260d05bd6b74a9a3db39ee Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 30 Jan 2025 13:15:00 +0100 Subject: [PATCH 336/989] statmount: let unset strings be empty Just like it's normal for unset values to be zero, unset strings should be empty instead of containing random values. It seems to be a typical mistake that the mask returned by statmount is not checked, which can result in various bugs. 
With this fix, these bugs are prevented, since it is highly likely that userspace would just want to turn the missing mask case into an empty string anyway (most of the recently found cases are of this type). Link: https://lore.kernel.org/all/CAJfpegsVCPfCn2DpM8iiYSS5DpMsLB8QBUCHecoj6s0Vxf4jzg@mail.gmail.com/ Fixes: 68385d77c05b ("statmount: simplify string option retrieval") Fixes: 46eae99ef733 ("add statmount(2) syscall") Cc: stable@vger.kernel.org # v6.8 Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20250130121500.113446-1-mszeredi@redhat.com Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/namespace.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index a3ed3f2980cba..9c4d307a82cdf 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5191,39 +5191,45 @@ static int statmount_string(struct kstatmount *s, u64 flag) size_t kbufsize; struct seq_file *seq = &s->seq; struct statmount *sm = &s->sm; - u32 start = seq->count; + u32 start, *offp; + + /* Reserve an empty string at the beginning for any unset offsets */ + if (!seq->count) + seq_putc(seq, 0); + + start = seq->count; switch (flag) { case STATMOUNT_FS_TYPE: - sm->fs_type = start; + offp = &sm->fs_type; ret = statmount_fs_type(s, seq); break; case STATMOUNT_MNT_ROOT: - sm->mnt_root = start; + offp = &sm->mnt_root; ret = statmount_mnt_root(s, seq); break; case STATMOUNT_MNT_POINT: - sm->mnt_point = start; + offp = &sm->mnt_point; ret = statmount_mnt_point(s, seq); break; case STATMOUNT_MNT_OPTS: - sm->mnt_opts = start; + offp = &sm->mnt_opts; ret = statmount_mnt_opts(s, seq); break; case STATMOUNT_OPT_ARRAY: - sm->opt_array = start; + offp = &sm->opt_array; ret = statmount_opt_array(s, seq); break; case STATMOUNT_OPT_SEC_ARRAY: - sm->opt_sec_array = start; + offp = &sm->opt_sec_array; ret = statmount_opt_sec_array(s, seq); break; case STATMOUNT_FS_SUBTYPE: - sm->fs_subtype = start; + offp = &sm->fs_subtype; statmount_fs_subtype(s, seq); break; case STATMOUNT_SB_SOURCE: - sm->sb_source = start; + offp = &sm->sb_source; ret = statmount_sb_source(s, seq); break; default: @@ -5251,6 +5257,7 @@ static int statmount_string(struct kstatmount *s, u64 flag) seq->buf[seq->count++] = '\0'; sm->mask |= flag; + *offp = start; return 0; } -- GitLab From d9b3a3c70df2c2b87c83ca3f6e8ab49bd092fdbd Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 30 Jan 2025 14:56:21 +0100 Subject: [PATCH 337/989] gfs2: use lockref_init for gl_lockref Move the initialization of gl_lockref from gfs2_init_glock_once() to gfs2_glock_get(). This allows to use lockref_init() there. 
Reviewed-by: Christoph Hellwig Signed-off-by: Andreas Gruenbacher Link: https://lore.kernel.org/r/20250130135624.1899988-2-agruenba@redhat.com Signed-off-by: Christian Brauner --- fs/gfs2/glock.c | 2 +- fs/gfs2/main.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 8c4c1f871a889..b29eb71e3e29e 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1201,8 +1201,8 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, if (glops->go_instantiate) gl->gl_flags |= BIT(GLF_INSTANTIATE_NEEDED); gl->gl_name = name; + lockref_init(&gl->gl_lockref, 1); lockdep_set_subclass(&gl->gl_lockref.lock, glops->go_subclass); - gl->gl_lockref.count = 1; gl->gl_state = LM_ST_UNLOCKED; gl->gl_target = LM_ST_UNLOCKED; gl->gl_demote_state = LM_ST_EXCLUSIVE; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 04cadc02e5a6e..0727f60ad0288 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -51,7 +51,6 @@ static void gfs2_init_glock_once(void *foo) { struct gfs2_glock *gl = foo; - spin_lock_init(&gl->gl_lockref.lock); INIT_LIST_HEAD(&gl->gl_holders); INIT_LIST_HEAD(&gl->gl_lru); INIT_LIST_HEAD(&gl->gl_ail_list); -- GitLab From 34ad6fa2add2b38f2a89d28518de0142bff8fb43 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 30 Jan 2025 14:56:22 +0100 Subject: [PATCH 338/989] gfs2: switch to lockref_init(..., 1) In qd_alloc(), initialize the lockref count to 1 to cover the common case. Compensate for that in gfs2_quota_init() by adjusting the count back down to 0; this only occurs when mounting the filesystem rw. Reviewed-by: Christoph Hellwig Signed-off-by: Andreas Gruenbacher Link: https://lore.kernel.org/r/20250130135624.1899988-3-agruenba@redhat.com Signed-off-by: Christian Brauner --- fs/gfs2/quota.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 58bc5013ca49c..6ae529a5388bc 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -236,7 +236,7 @@ static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, str return NULL; qd->qd_sbd = sdp; - lockref_init(&qd->qd_lockref, 0); + lockref_init(&qd->qd_lockref, 1); qd->qd_id = qid; qd->qd_slot = -1; INIT_LIST_HEAD(&qd->qd_lru); @@ -297,7 +297,6 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, spin_lock_bucket(hash); *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); if (qd == NULL) { - new_qd->qd_lockref.count++; *qdp = new_qd; list_add(&new_qd->qd_list, &sdp->sd_quota_list); hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]); @@ -1450,6 +1449,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) if (qd == NULL) goto fail_brelse; + qd->qd_lockref.count = 0; set_bit(QDF_CHANGE, &qd->qd_flags); qd->qd_change = qc_change; qd->qd_slot = slot; -- GitLab From bb504b4d64266fa0d7460c218c85afed371db03a Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 30 Jan 2025 14:56:23 +0100 Subject: [PATCH 339/989] lockref: remove count argument of lockref_init All users of lockref_init() now initialize the count to 1, so hardcode that and remove the count argument. 
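A caller that needs a different starting count can still adjust it explicitly after initialization, which is what the gfs2 quota patch just above does for its mount-time scan; condensed, the pattern is:

  lockref_init(&qd->qd_lockref);          /* spinlock initialized, count = 1 */
  ...
  /* gfs2_quota_init() only: compensate for the hardcoded initial count */
  qd->qd_lockref.count = 0;

(Per the patch above, this adjustment only happens when mounting the filesystem rw.)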
Reviewed-by: Christoph Hellwig Signed-off-by: Andreas Gruenbacher Link: https://lore.kernel.org/r/20250130135624.1899988-4-agruenba@redhat.com Signed-off-by: Christian Brauner --- fs/dcache.c | 2 +- fs/erofs/zdata.c | 2 +- fs/gfs2/glock.c | 2 +- fs/gfs2/quota.c | 2 +- include/linux/lockref.h | 7 ++++--- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 9cc0d47da321c..7dee242b4195a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1700,7 +1700,7 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) smp_store_release(&dentry->d_name.name, dname); /* ^^^ */ dentry->d_flags = 0; - lockref_init(&dentry->d_lockref, 1); + lockref_init(&dentry->d_lockref); seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock); dentry->d_inode = NULL; dentry->d_parent = dentry; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 29f8963bb5232..d771e06db7386 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -726,7 +726,7 @@ static int z_erofs_register_pcluster(struct z_erofs_frontend *fe) if (IS_ERR(pcl)) return PTR_ERR(pcl); - lockref_init(&pcl->lockref, 1); /* one ref for this request */ + lockref_init(&pcl->lockref); /* one ref for this request */ pcl->algorithmformat = map->m_algorithmformat; pcl->length = 0; pcl->partial = true; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index b29eb71e3e29e..65c07aa957184 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1201,7 +1201,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, if (glops->go_instantiate) gl->gl_flags |= BIT(GLF_INSTANTIATE_NEEDED); gl->gl_name = name; - lockref_init(&gl->gl_lockref, 1); + lockref_init(&gl->gl_lockref); lockdep_set_subclass(&gl->gl_lockref.lock, glops->go_subclass); gl->gl_state = LM_ST_UNLOCKED; gl->gl_target = LM_ST_UNLOCKED; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 6ae529a5388bc..2298e06797ac3 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -236,7 +236,7 @@ static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, str return NULL; qd->qd_sbd = sdp; - lockref_init(&qd->qd_lockref, 1); + lockref_init(&qd->qd_lockref); qd->qd_id = qid; qd->qd_slot = -1; INIT_LIST_HEAD(&qd->qd_lru); diff --git a/include/linux/lockref.h b/include/linux/lockref.h index c39f119659ba4..676721ee878d7 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -37,12 +37,13 @@ struct lockref { /** * lockref_init - Initialize a lockref * @lockref: pointer to lockref structure - * @count: initial count + * + * Initializes @lockref->count to 1. */ -static inline void lockref_init(struct lockref *lockref, unsigned int count) +static inline void lockref_init(struct lockref *lockref) { spin_lock_init(&lockref->lock); - lockref->count = count; + lockref->count = 1; } void lockref_get(struct lockref *lockref); -- GitLab From 95101401bb50ae2cf9deee1bbf4d2b28d0dfdc26 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 3 Feb 2025 23:32:03 +0100 Subject: [PATCH 340/989] fsnotify: use accessor to set FMODE_NONOTIFY_* The FMODE_NONOTIFY_* bits are a 2-bits mode. Open coding manipulation of those bits is risky. Use an accessor file_set_fsnotify_mode() to set the mode. Rename file_set_fsnotify_mode() => file_set_fsnotify_mode_from_watchers() to make way for the simple accessor name. 
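The reason a plain bit-OR is risky for a 2-bit mode field: OR-ing a new state on top of an old one can produce the remaining, unintended bit combination instead of replacing the state. A small illustrative sketch, where the MODE_* names are hypothetical stand-ins rather than the real FMODE_NONOTIFY_* values:

  #define MODE_MASK  0x3u
  #define MODE_A     0x1u
  #define MODE_B     0x2u

  static void set_mode(unsigned int *f_mode, unsigned int mode)
  {
          *f_mode &= ~MODE_MASK;          /* clear the whole 2-bit field first */
          *f_mode |= mode;                /* then install the new state */
  }

A bare '*f_mode |= MODE_B' on a value already holding MODE_A would leave the field at MODE_A | MODE_B, a third state nobody asked for, which is exactly the class of mistake the file_set_fsnotify_mode() accessor rules out.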
Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20250203223205.861346-2-amir73il@gmail.com Signed-off-by: Christian Brauner --- drivers/tty/pty.c | 2 +- fs/notify/fsnotify.c | 18 ++++++++++++------ fs/open.c | 7 ++++--- include/linux/fs.h | 7 ++++++- include/linux/fsnotify.h | 4 ++-- 5 files changed, 25 insertions(+), 13 deletions(-) diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index df08f13052ff4..8bb1a01fef2a1 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -798,7 +798,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) nonseekable_open(inode, filp); /* We refuse fsnotify events on ptmx, since it's a shared resource */ - filp->f_mode |= FMODE_NONOTIFY; + file_set_fsnotify_mode(filp, FMODE_NONOTIFY); retval = tty_alloc_file(filp); if (retval) diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 8ee495a58d0ad..fae1b6d397ea0 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -648,7 +648,7 @@ EXPORT_SYMBOL_GPL(fsnotify); * Later, fsnotify permission hooks do not check if there are permission event * watches, but that there were permission event watches at open time. */ -void file_set_fsnotify_mode(struct file *file) +void file_set_fsnotify_mode_from_watchers(struct file *file) { struct dentry *dentry = file->f_path.dentry, *parent; struct super_block *sb = dentry->d_sb; @@ -665,7 +665,7 @@ void file_set_fsnotify_mode(struct file *file) */ if (likely(!fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT))) { - file->f_mode |= FMODE_NONOTIFY_PERM; + file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); return; } @@ -676,7 +676,7 @@ void file_set_fsnotify_mode(struct file *file) if ((!d_is_dir(dentry) && !d_is_reg(dentry)) || likely(!fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_PRE_CONTENT))) { - file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM; + file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM); return; } @@ -686,19 +686,25 @@ void file_set_fsnotify_mode(struct file *file) */ mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask); if (unlikely(fsnotify_object_watched(d_inode(dentry), mnt_mask, - FSNOTIFY_PRE_CONTENT_EVENTS))) + FSNOTIFY_PRE_CONTENT_EVENTS))) { + /* Enable pre-content events */ + file_set_fsnotify_mode(file, 0); return; + } /* Is parent watching for pre-content events on this file? */ if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) { parent = dget_parent(dentry); p_mask = fsnotify_inode_watches_children(d_inode(parent)); dput(parent); - if (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS) + if (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS) { + /* Enable pre-content events */ + file_set_fsnotify_mode(file, 0); return; + } } /* Nobody watching for pre-content events from this file */ - file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM; + file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM); } #endif diff --git a/fs/open.c b/fs/open.c index 932e5a6de63bb..3fcbfff8aede8 100644 --- a/fs/open.c +++ b/fs/open.c @@ -905,7 +905,8 @@ static int do_dentry_open(struct file *f, f->f_sb_err = file_sample_sb_err(f); if (unlikely(f->f_flags & O_PATH)) { - f->f_mode = FMODE_PATH | FMODE_OPENED | FMODE_NONOTIFY; + f->f_mode = FMODE_PATH | FMODE_OPENED; + file_set_fsnotify_mode(f, FMODE_NONOTIFY); f->f_op = &empty_fops; return 0; } @@ -938,7 +939,7 @@ static int do_dentry_open(struct file *f, * If FMODE_NONOTIFY was already set for an fanotify fd, this doesn't * change anything. 
*/ - file_set_fsnotify_mode(f); + file_set_fsnotify_mode_from_watchers(f); error = fsnotify_open_perm(f); if (error) goto cleanup_all; @@ -1122,7 +1123,7 @@ struct file *dentry_open_nonotify(const struct path *path, int flags, if (!IS_ERR(f)) { int error; - f->f_mode |= FMODE_NONOTIFY; + file_set_fsnotify_mode(f, FMODE_NONOTIFY); error = vfs_open(path, f); if (error) { fput(f); diff --git a/include/linux/fs.h b/include/linux/fs.h index be3ad155ec9f7..7620547432a84 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -222,7 +222,6 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_FSNOTIFY_HSM(mode) 0 #endif - /* * Attribute flags. These should be or-ed together to figure out what * has been changed! @@ -3140,6 +3139,12 @@ static inline void exe_file_allow_write_access(struct file *exe_file) allow_write_access(exe_file); } +static inline void file_set_fsnotify_mode(struct file *file, fmode_t mode) +{ + file->f_mode &= ~FMODE_FSNOTIFY_MASK; + file->f_mode |= mode; +} + static inline bool inode_is_open_for_write(const struct inode *inode) { return atomic_read(&inode->i_writecount) > 0; diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 1a9ef8f6784dd..6a33288bd6a1f 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -129,7 +129,7 @@ static inline int fsnotify_file(struct file *file, __u32 mask) #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS -void file_set_fsnotify_mode(struct file *file); +void file_set_fsnotify_mode_from_watchers(struct file *file); /* * fsnotify_file_area_perm - permission hook before access to file range @@ -213,7 +213,7 @@ static inline int fsnotify_open_perm(struct file *file) } #else -static inline void file_set_fsnotify_mode(struct file *file) +static inline void file_set_fsnotify_mode_from_watchers(struct file *file) { } -- GitLab From 5eb987105357cb7cfa7cf3b1e2f66d5c0977e412 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 29 Jan 2025 16:12:53 +0100 Subject: [PATCH 341/989] fs: fix adding security options to statmount.mnt_opt Prepending security options was made conditional on sb->s_op->show_options, but security options are independent of sb options. 
Fixes: 056d33137bf9 ("fs: prepend statmount.mnt_opts string with security_sb_mnt_opts()") Fixes: f9af549d1fd3 ("fs: export mount options via statmount()") Cc: stable@vger.kernel.org # v6.11 Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20250129151253.33241-1-mszeredi@redhat.com Signed-off-by: Christian Brauner --- fs/namespace.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 9c4d307a82cdf..8f1000f9f3df1 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5087,30 +5087,29 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) { struct vfsmount *mnt = s->mnt; struct super_block *sb = mnt->mnt_sb; + size_t start = seq->count; int err; - if (sb->s_op->show_options) { - size_t start = seq->count; - - err = security_sb_show_options(seq, sb); - if (err) - return err; + err = security_sb_show_options(seq, sb); + if (err) + return err; + if (sb->s_op->show_options) { err = sb->s_op->show_options(seq, mnt->mnt_root); if (err) return err; + } - if (unlikely(seq_has_overflowed(seq))) - return -EAGAIN; + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; - if (seq->count == start) - return 0; + if (seq->count == start) + return 0; - /* skip leading comma */ - memmove(seq->buf + start, seq->buf + start + 1, - seq->count - start - 1); - seq->count--; - } + /* skip leading comma */ + memmove(seq->buf + start, seq->buf + start + 1, + seq->count - start - 1); + seq->count--; return 0; } -- GitLab From 2a42754b3104d78a2bc7a2ad8844427411c76ca6 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 3 Feb 2025 23:32:04 +0100 Subject: [PATCH 342/989] fsnotify: disable notification by default for all pseudo files Most pseudo files are not applicable for fsnotify events at all, let alone to the new pre-content events. Disable notifications to all files allocated with alloc_file_pseudo() and enable legacy inotify events for the specific cases of pipe and socket, which have known users of inotify events. Pre-content events are also kept disabled for sockets and pipes. Fixes: 20bf82a898b6 ("mm: don't allow huge faults for files with pre content watches") Reported-by: Alex Williamson Closes: https://lore.kernel.org/linux-fsdevel/20250131121703.1e4d00a7.alex.williamson@redhat.com/ Suggested-by: Linus Torvalds Link: https://lore.kernel.org/linux-fsdevel/CAHk-=wi2pThSVY=zhO=ZKxViBj5QCRX-=AS2+rVknQgJnHXDFg@mail.gmail.com/ Tested-by: Alex Williamson Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20250203223205.861346-3-amir73il@gmail.com Signed-off-by: Christian Brauner --- fs/file_table.c | 11 +++++++++++ fs/open.c | 4 ++-- fs/pipe.c | 6 ++++++ net/socket.c | 5 +++++ 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index f0291a66f9db4..35b93da6c5cb1 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -375,7 +375,13 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, if (IS_ERR(file)) { ihold(inode); path_put(&path); + return file; } + /* + * Disable all fsnotify events for pseudo files by default. + * They may be enabled by caller with file_set_fsnotify_mode(). + */ + file_set_fsnotify_mode(file, FMODE_NONOTIFY); return file; } EXPORT_SYMBOL(alloc_file_pseudo); @@ -400,6 +406,11 @@ struct file *alloc_file_pseudo_noaccount(struct inode *inode, return file; } file_init_path(file, &path, fops); + /* + * Disable all fsnotify events for pseudo files by default. 
+ * They may be enabled by caller with file_set_fsnotify_mode(). + */ + file_set_fsnotify_mode(file, FMODE_NONOTIFY); return file; } EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount); diff --git a/fs/open.c b/fs/open.c index 3fcbfff8aede8..1be20de9f283a 100644 --- a/fs/open.c +++ b/fs/open.c @@ -936,8 +936,8 @@ static int do_dentry_open(struct file *f, /* * Set FMODE_NONOTIFY_* bits according to existing permission watches. - * If FMODE_NONOTIFY was already set for an fanotify fd, this doesn't - * change anything. + * If FMODE_NONOTIFY mode was already set for an fanotify fd or for a + * pseudo file, this call will not change the mode. */ file_set_fsnotify_mode_from_watchers(f); error = fsnotify_open_perm(f); diff --git a/fs/pipe.c b/fs/pipe.c index 94b59045ab44b..ce1af7592780d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -960,6 +960,12 @@ int create_pipe_files(struct file **res, int flags) res[1] = f; stream_open(inode, res[0]); stream_open(inode, res[1]); + /* + * Disable permission and pre-content events, but enable legacy + * inotify events for legacy users. + */ + file_set_fsnotify_mode(res[0], FMODE_NONOTIFY_PERM); + file_set_fsnotify_mode(res[1], FMODE_NONOTIFY_PERM); return 0; } diff --git a/net/socket.c b/net/socket.c index 262a28b59c7f0..28bae5a942341 100644 --- a/net/socket.c +++ b/net/socket.c @@ -479,6 +479,11 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) sock->file = file; file->private_data = sock; stream_open(SOCK_INODE(sock), file); + /* + * Disable permission and pre-content events, but enable legacy + * inotify events for legacy users. + */ + file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); return file; } EXPORT_SYMBOL(sock_alloc_file); -- GitLab From 2cc02059fbc79306b53a44b1f1a4444aa3c76598 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 29 Jan 2025 17:06:41 +0100 Subject: [PATCH 343/989] selftests: always check mask returned by statmount(2) STATMOUNT_MNT_OPTS can actually be missing if there are no options. This is a change of behavior since 75ead69a7173 ("fs: don't let statmount return empty strings"). The other checks shouldn't actually trigger, but add them for correctness and for easier debugging if the test fails. 
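The same defensive pattern applies to any userspace consumer of statmount(2): only dereference a string offset when the corresponding bit is set in the returned mask. Condensed from the selftest change below:

  const char *opts = "";

  if (sm->mask & STATMOUNT_MNT_OPTS)
          opts = sm->str + sm->mnt_opts;
  /* an unset bit simply means the kernel had no options to report */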
Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20250129160641.35485-1-mszeredi@redhat.com Signed-off-by: Christian Brauner --- .../filesystems/statmount/statmount_test.c | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c index 8eb6aa606a0d5..46d289611ce86 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c @@ -383,6 +383,10 @@ static void test_statmount_mnt_point(void) return; } + if (!(sm->mask & STATMOUNT_MNT_POINT)) { + ksft_test_result_fail("missing STATMOUNT_MNT_POINT in mask\n"); + return; + } if (strcmp(sm->str + sm->mnt_point, "/") != 0) { ksft_test_result_fail("unexpected mount point: '%s' != '/'\n", sm->str + sm->mnt_point); @@ -408,6 +412,10 @@ static void test_statmount_mnt_root(void) strerror(errno)); return; } + if (!(sm->mask & STATMOUNT_MNT_ROOT)) { + ksft_test_result_fail("missing STATMOUNT_MNT_ROOT in mask\n"); + return; + } mnt_root = sm->str + sm->mnt_root; last_root = strrchr(mnt_root, '/'); if (last_root) @@ -437,6 +445,10 @@ static void test_statmount_fs_type(void) strerror(errno)); return; } + if (!(sm->mask & STATMOUNT_FS_TYPE)) { + ksft_test_result_fail("missing STATMOUNT_FS_TYPE in mask\n"); + return; + } fs_type = sm->str + sm->fs_type; for (s = known_fs; s != NULL; s++) { if (strcmp(fs_type, *s) == 0) @@ -464,6 +476,11 @@ static void test_statmount_mnt_opts(void) return; } + if (!(sm->mask & STATMOUNT_MNT_BASIC)) { + ksft_test_result_fail("missing STATMOUNT_MNT_BASIC in mask\n"); + return; + } + while (getline(&line, &len, f_mountinfo) != -1) { int i; char *p, *p2; @@ -514,7 +531,10 @@ static void test_statmount_mnt_opts(void) if (p2) *p2 = '\0'; - statmount_opts = sm->str + sm->mnt_opts; + if (sm->mask & STATMOUNT_MNT_OPTS) + statmount_opts = sm->str + sm->mnt_opts; + else + statmount_opts = ""; if (strcmp(statmount_opts, p) != 0) ksft_test_result_fail( "unexpected mount options: '%s' != '%s'\n", -- GitLab From 711f9b8fbe4f4936302804e246e206f0829f628f Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 3 Feb 2025 23:32:05 +0100 Subject: [PATCH 344/989] fsnotify: disable pre-content and permission events by default After introducing pre-content events, we had a regression related to disabling huge faults on files that should never have pre-content events enabled. This happened because the default f_mode of allocated files (0) does not disable pre-content events. Pre-content events are disabled in file_set_fsnotify_mode_by_watchers() but internal files may not get to call this helper. Initialize f_mode to disable permission and pre-content events for all files and if needed they will be enabled for the callers of file_set_fsnotify_mode_by_watchers(). 
Fixes: 20bf82a898b6 ("mm: don't allow huge faults for files with pre content watches") Reported-by: Alex Williamson Closes: https://lore.kernel.org/linux-fsdevel/20250131121703.1e4d00a7.alex.williamson@redhat.com/ Tested-by: Alex Williamson Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20250203223205.861346-4-amir73il@gmail.com Signed-off-by: Christian Brauner --- fs/file_table.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/file_table.c b/fs/file_table.c index 35b93da6c5cb1..5c00dc38558da 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -194,6 +194,11 @@ static int init_file(struct file *f, int flags, const struct cred *cred) * refcount bumps we should reinitialize the reused file first. */ file_ref_init(&f->f_ref, 1); + /* + * Disable permission and pre-content events for all files by default. + * They may be enabled later by file_set_fsnotify_mode_from_watchers(). + */ + file_set_fsnotify_mode(f, FMODE_NONOTIFY_PERM); return 0; } -- GitLab From 091ee63e36e8289f9067f659a48d497911e49d6f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 4 Feb 2025 14:51:20 +0100 Subject: [PATCH 345/989] pidfs: improve ioctl handling Pidfs supports extensible and non-extensible ioctls. The extensible ioctls need to check for the ioctl number itself not just the ioctl command otherwise both backward- and forward compatibility are broken. The pidfs ioctl handler also needs to look at the type of the ioctl command to guard against cases where "[...] a daemon receives some random file descriptor from a (potentially less privileged) client and expects the FD to be of some specific type, it might call ioctl() on this FD with some type-specific command and expect the call to fail if the FD is of the wrong type; but due to the missing type check, the kernel instead performs some action that userspace didn't expect." (cf. [1]] Link: https://lore.kernel.org/r/20250204-work-pidfs-ioctl-v1-1-04987d239575@kernel.org Link: https://lore.kernel.org/r/CAG48ez2K9A5GwtgqO31u9ZL292we8ZwAA=TJwwEv7wRuJ3j4Lw@mail.gmail.com [1] Fixes: 8ce352818820 ("pidfs: check for valid ioctl commands") Acked-by: Luca Boccassi Reported-by: Jann Horn Cc: stable@vger.kernel.org # v6.13; please backport with 8ce352818820 ("pidfs: check for valid ioctl commands") Signed-off-by: Christian Brauner --- fs/pidfs.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/pidfs.c b/fs/pidfs.c index 049352f973de3..63f9699ebac36 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -287,7 +287,6 @@ static bool pidfs_ioctl_valid(unsigned int cmd) switch (cmd) { case FS_IOC_GETVERSION: case PIDFD_GET_CGROUP_NAMESPACE: - case PIDFD_GET_INFO: case PIDFD_GET_IPC_NAMESPACE: case PIDFD_GET_MNT_NAMESPACE: case PIDFD_GET_NET_NAMESPACE: @@ -300,6 +299,17 @@ static bool pidfs_ioctl_valid(unsigned int cmd) return true; } + /* Extensible ioctls require some more careful checks. */ + switch (_IOC_NR(cmd)) { + case _IOC_NR(PIDFD_GET_INFO): + /* + * Try to prevent performing a pidfd ioctl when someone + * erronously mistook the file descriptor for a pidfd. + * This is not perfect but will catch most cases. + */ + return (_IOC_TYPE(cmd) == _IOC_TYPE(PIDFD_GET_INFO)); + } + return false; } -- GitLab From 37d11cfc63604b3886308e2111d845d148ced8bc Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 4 Feb 2025 22:32:07 +0100 Subject: [PATCH 346/989] vfs: sanity check the length passed to inode_set_cached_link() This costs a strlen() call when instatianating a symlink. 
Preferably it would be hidden behind VFS_WARN_ON (or compatible), but there is no such facility at the moment. With the facility in place the call can be patched out in production kernels. In the meantime, since the cost is being paid unconditionally, use the result to a fixup the bad caller. This is not expected to persist in the long run (tm). Sample splat: bad length passed for symlink [/tmp/syz-imagegen43743633/file0/file0] (got 131109, expected 37) [rest of WARN blurp goes here] Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250204213207.337980-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/linux/fs.h b/include/linux/fs.h index 7620547432a84..2c3b2f8a621f7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -790,6 +790,19 @@ struct inode { static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen) { + int testlen; + + /* + * TODO: patch it into a debug-only check if relevant macros show up. + * In the meantime, since we are suffering strlen even on production kernels + * to find the right length, do a fixup if the wrong value got passed. + */ + testlen = strlen(link); + if (testlen != linklen) { + WARN_ONCE(1, "bad length passed for symlink [%s] (got %d, expected %d)", + link, linklen, testlen); + linklen = testlen; + } inode->i_link = link; inode->i_linklen = linklen; inode->i_opflags |= IOP_CACHED_LINK; -- GitLab From ca0f4fe7cf7183bfbdc67ca2de56ae1fc3a8db2b Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 6 Feb 2025 10:21:38 -0700 Subject: [PATCH 347/989] arm64: Handle .ARM.attributes section in linker scripts A recent LLVM commit [1] started generating an .ARM.attributes section similar to the one that exists for 32-bit, which results in orphan section warnings (or errors if CONFIG_WERROR is enabled) from the linker because it is not handled in the arm64 linker scripts. ld.lld: error: arch/arm64/kernel/vdso/vgettimeofday.o:(.ARM.attributes) is being placed in '.ARM.attributes' ld.lld: error: arch/arm64/kernel/vdso/vgetrandom.o:(.ARM.attributes) is being placed in '.ARM.attributes' ld.lld: error: vmlinux.a(lib/vsprintf.o):(.ARM.attributes) is being placed in '.ARM.attributes' ld.lld: error: vmlinux.a(lib/win_minmax.o):(.ARM.attributes) is being placed in '.ARM.attributes' ld.lld: error: vmlinux.a(lib/xarray.o):(.ARM.attributes) is being placed in '.ARM.attributes' Discard the new sections in the necessary linker scripts to resolve the warnings, as the kernel and vDSO do not need to retain it, similar to the .note.gnu.property section. 
Cc: stable@vger.kernel.org Fixes: b3e5d80d0c48 ("arm64/build: Warn on orphan section placement") Link: https://github.com/llvm/llvm-project/commit/ee99c4d4845db66c4daa2373352133f4b237c942 [1] Signed-off-by: Nathan Chancellor Link: https://lore.kernel.org/r/20250206-arm64-handle-arm-attributes-in-linker-script-v3-1-d53d169913eb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/vdso/vdso.lds.S | 1 + arch/arm64/kernel/vmlinux.lds.S | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S index 4ec32e86a8da2..47ad6944f9f08 100644 --- a/arch/arm64/kernel/vdso/vdso.lds.S +++ b/arch/arm64/kernel/vdso/vdso.lds.S @@ -41,6 +41,7 @@ SECTIONS */ /DISCARD/ : { *(.note.GNU-stack .note.gnu.property) + *(.ARM.attributes) } .note : { *(.note.*) } :text :note diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index f84c71f04d9ea..e73326bd3ff7e 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -162,6 +162,7 @@ SECTIONS /DISCARD/ : { *(.interp .dynamic) *(.dynsym .dynstr .hash .gnu.hash) + *(.ARM.attributes) } . = KIMAGE_VADDR; -- GitLab From 875d742cf5327c93cba1f11e12b08d3cce7a88d2 Mon Sep 17 00:00:00 2001 From: Radu Rendec Date: Thu, 6 Feb 2025 12:44:20 -0500 Subject: [PATCH 348/989] arm64: cacheinfo: Avoid out-of-bounds write to cacheinfo array The loop that detects/populates cache information already has a bounds check on the array size but does not account for cache levels with separate data/instructions cache. Fix this by incrementing the index for any populated leaf (instead of any populated level). Fixes: 5d425c186537 ("arm64: kernel: add support for cpu cache information") Signed-off-by: Radu Rendec Link: https://lore.kernel.org/r/20250206174420.2178724-1-rrendec@redhat.com Signed-off-by: Will Deacon --- arch/arm64/kernel/cacheinfo.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/cacheinfo.c b/arch/arm64/kernel/cacheinfo.c index d9c9218fa1fdd..309942b06c5bc 100644 --- a/arch/arm64/kernel/cacheinfo.c +++ b/arch/arm64/kernel/cacheinfo.c @@ -101,16 +101,18 @@ int populate_cache_leaves(unsigned int cpu) unsigned int level, idx; enum cache_type type; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - struct cacheinfo *this_leaf = this_cpu_ci->info_list; + struct cacheinfo *infos = this_cpu_ci->info_list; for (idx = 0, level = 1; level <= this_cpu_ci->num_levels && - idx < this_cpu_ci->num_leaves; idx++, level++) { + idx < this_cpu_ci->num_leaves; level++) { type = get_cache_type(level); if (type == CACHE_TYPE_SEPARATE) { - ci_leaf_init(this_leaf++, CACHE_TYPE_DATA, level); - ci_leaf_init(this_leaf++, CACHE_TYPE_INST, level); + if (idx + 1 >= this_cpu_ci->num_leaves) + break; + ci_leaf_init(&infos[idx++], CACHE_TYPE_DATA, level); + ci_leaf_init(&infos[idx++], CACHE_TYPE_INST, level); } else { - ci_leaf_init(this_leaf++, type, level); + ci_leaf_init(&infos[idx++], type, level); } } return 0; -- GitLab From 2813e00dcd748cef47d2bffaa04071de93fddf00 Mon Sep 17 00:00:00 2001 From: Ievgen Vovk Date: Sun, 12 Jan 2025 13:13:14 +0900 Subject: [PATCH 349/989] HID: hid-apple: Apple Magic Keyboard a3203 USB-C support Add Apple Magic Keyboard 2024 model (with USB-C port) device ID (0320) to those recognized by the hid-apple driver. Keyboard is otherwise compatible with the existing implementation for its earlier 2021 model. 
Signed-off-by: Ievgen Vovk Signed-off-by: Jiri Kosina --- drivers/hid/hid-apple.c | 5 +++++ drivers/hid/hid-ids.h | 1 + 2 files changed, 6 insertions(+) diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c index 7e1ae2a2bcc24..3c3f67d0bfcfe 100644 --- a/drivers/hid/hid-apple.c +++ b/drivers/hid/hid-apple.c @@ -474,6 +474,7 @@ static int hidinput_apple_event(struct hid_device *hid, struct input_dev *input, hid->product == USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_NUMPAD_2015) table = magic_keyboard_2015_fn_keys; else if (hid->product == USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_2021 || + hid->product == USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_2024 || hid->product == USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_FINGERPRINT_2021 || hid->product == USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_NUMPAD_2021) table = apple2021_fn_keys; @@ -1150,6 +1151,10 @@ static const struct hid_device_id apple_devices[] = { .driver_data = APPLE_HAS_FN | APPLE_ISO_TILDE_QUIRK | APPLE_RDESC_BATTERY }, { HID_BLUETOOTH_DEVICE(BT_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_2021), .driver_data = APPLE_HAS_FN | APPLE_ISO_TILDE_QUIRK }, + { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_2024), + .driver_data = APPLE_HAS_FN | APPLE_ISO_TILDE_QUIRK | APPLE_RDESC_BATTERY }, + { HID_BLUETOOTH_DEVICE(BT_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_2024), + .driver_data = APPLE_HAS_FN | APPLE_ISO_TILDE_QUIRK }, { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_FINGERPRINT_2021), .driver_data = APPLE_HAS_FN | APPLE_ISO_TILDE_QUIRK | APPLE_RDESC_BATTERY }, { HID_BLUETOOTH_DEVICE(BT_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_FINGERPRINT_2021), diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index ed1d7f9e8caf4..7e400624908e3 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -184,6 +184,7 @@ #define USB_DEVICE_ID_APPLE_IRCONTROL4 0x8242 #define USB_DEVICE_ID_APPLE_IRCONTROL5 0x8243 #define USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_2021 0x029c +#define USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_2024 0x0320 #define USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_FINGERPRINT_2021 0x029a #define USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_NUMPAD_2021 0x029f #define USB_DEVICE_ID_APPLE_TOUCHBAR_BACKLIGHT 0x8102 -- GitLab From 819083cb6eedcc8495cbf84845877bcc741b93b3 Mon Sep 17 00:00:00 2001 From: Alex Henrie Date: Thu, 16 Jan 2025 23:12:17 -0700 Subject: [PATCH 350/989] HID: apple: fix up the F6 key on the Omoton KB066 keyboard The Omoton KB066 is an Apple A1255 keyboard clone (HID product code 05ac:022c). On both keyboards, the F6 key becomes Num Lock when the Fn key is held. But unlike its Apple exemplar, when the Omoton's F6 key is pressed without Fn, it sends the usage code 0xC0301 from the reserved section of the consumer page instead of the standard F6 usage code 0x7003F from the keyboard page. The nonstandard code is translated to KEY_UNKNOWN and becomes useless on Linux. The Omoton KB066 is a pretty popular keyboard, judging from its 29,058 reviews on Amazon at time of writing, so let's account for its quirk to make it more usable. By the way, it would be nice if we could automatically set fnmode to 0 for Omoton keyboards because they handle the Fn key internally and the kernel's Fn key handling creates undesirable side effects such as making F1 and F2 always Brightness Up and Brightness Down in fnmode=1 (the default) or always F1 and F2 in fnmode=2. 
Unfortunately I don't think there's a way to identify Bluetooth keyboards more specifically than the HID product code which is obviously inaccurate. Users of Omoton keyboards will just have to set fnmode to 0 manually to get full Fn key functionality. Signed-off-by: Alex Henrie Signed-off-by: Jiri Kosina --- drivers/hid/hid-apple.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c index 3c3f67d0bfcfe..49812a76b7edd 100644 --- a/drivers/hid/hid-apple.c +++ b/drivers/hid/hid-apple.c @@ -546,6 +546,9 @@ static int hidinput_apple_event(struct hid_device *hid, struct input_dev *input, } } + if (usage->hid == 0xc0301) /* Omoton KB066 quirk */ + code = KEY_F6; + if (usage->code != code) { input_event_with_scancode(input, usage->type, code, usage->hid, value); -- GitLab From 0b43d98ff29be3144e86294486b1373b5df74c0e Mon Sep 17 00:00:00 2001 From: Tulio Fernandes Date: Wed, 5 Feb 2025 18:50:34 -0300 Subject: [PATCH 351/989] HID: hid-thrustmaster: fix stack-out-of-bounds read in usb_check_int_endpoints() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Syzbot[1] has detected a stack-out-of-bounds read of the ep_addr array from hid-thrustmaster driver. This array is passed to usb_check_int_endpoints function from usb.c core driver, which executes a for loop that iterates over the elements of the passed array. Not finding a null element at the end of the array, it tries to read the next, non-existent element, crashing the kernel. To fix this, a 0 element was added at the end of the array to break the for loop. [1] https://syzkaller.appspot.com/bug?extid=9c9179ac46169c56c1ad Reported-by: syzbot+9c9179ac46169c56c1ad@syzkaller.appspotmail.com Fixes: 50420d7c79c3 ("HID: hid-thrustmaster: Fix warning in thrustmaster_probe by adding endpoint check") Signed-off-by: Túlio Fernandes Signed-off-by: Jiri Kosina --- drivers/hid/hid-thrustmaster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/hid-thrustmaster.c b/drivers/hid/hid-thrustmaster.c index 6c3e758bbb09e..3b81468a1df29 100644 --- a/drivers/hid/hid-thrustmaster.c +++ b/drivers/hid/hid-thrustmaster.c @@ -171,7 +171,7 @@ static void thrustmaster_interrupts(struct hid_device *hdev) b_ep = ep->desc.bEndpointAddress; /* Are the expected endpoints present? */ - u8 ep_addr[1] = {b_ep}; + u8 ep_addr[2] = {b_ep, 0}; if (!usb_check_int_endpoints(usbif, ep_addr)) { hid_err(hdev, "Unexpected non-int endpoint\n"); -- GitLab From 79504249d7e27cad4a3eeb9afc6386e418728ce0 Mon Sep 17 00:00:00 2001 From: Vicki Pfau Date: Tue, 4 Feb 2025 19:55:27 -0800 Subject: [PATCH 352/989] HID: hid-steam: Move hidraw input (un)registering to work Due to an interplay between locking in the input and hid transport subsystems, attempting to register or deregister the relevant input devices during the hidraw open/close events can lead to a lock ordering issue. Though this shouldn't cause a deadlock, this commit moves the input device manipulation to deferred work to sidestep the issue. 
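The shape of the fix is the usual answer to "cannot (un)register devices from this context": record the state change under the driver's spinlock, then let a work item do the heavy (un)registration in process context. A stripped-down sketch with hypothetical names (the real handler added below is steam_work_unregister_cb()):

  struct my_dev {
          spinlock_t lock;
          bool client_opened;
          struct work_struct unregister_work;
  };

  static int my_hidraw_open(struct my_dev *d)
  {
          unsigned long flags;

          spin_lock_irqsave(&d->lock, flags);
          d->client_opened = true;
          spin_unlock_irqrestore(&d->lock, flags);

          schedule_work(&d->unregister_work);     /* input devices torn down later */
          return 0;
  }

As in the patch, the deferred work must then be flushed with cancel_work_sync() on probe failure and on removal so it cannot run against a freed device.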
Fixes: 385a4886778f6 ("HID: steam: remove input device when a hid client is running.") Signed-off-by: Vicki Pfau Signed-off-by: Jiri Kosina --- drivers/hid/hid-steam.c | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/drivers/hid/hid-steam.c b/drivers/hid/hid-steam.c index b008fd0834b94..5a17714fedea0 100644 --- a/drivers/hid/hid-steam.c +++ b/drivers/hid/hid-steam.c @@ -313,6 +313,7 @@ struct steam_device { u16 rumble_left; u16 rumble_right; unsigned int sensor_timestamp_us; + struct work_struct unregister_work; }; static int steam_recv_report(struct steam_device *steam, @@ -1072,6 +1073,31 @@ static void steam_mode_switch_cb(struct work_struct *work) } } +static void steam_work_unregister_cb(struct work_struct *work) +{ + struct steam_device *steam = container_of(work, struct steam_device, + unregister_work); + unsigned long flags; + bool connected; + bool opened; + + spin_lock_irqsave(&steam->lock, flags); + opened = steam->client_opened; + connected = steam->connected; + spin_unlock_irqrestore(&steam->lock, flags); + + if (connected) { + if (opened) { + steam_sensors_unregister(steam); + steam_input_unregister(steam); + } else { + steam_set_lizard_mode(steam, lizard_mode); + steam_input_register(steam); + steam_sensors_register(steam); + } + } +} + static bool steam_is_valve_interface(struct hid_device *hdev) { struct hid_report_enum *rep_enum; @@ -1117,8 +1143,7 @@ static int steam_client_ll_open(struct hid_device *hdev) steam->client_opened++; spin_unlock_irqrestore(&steam->lock, flags); - steam_sensors_unregister(steam); - steam_input_unregister(steam); + schedule_work(&steam->unregister_work); return 0; } @@ -1135,11 +1160,7 @@ static void steam_client_ll_close(struct hid_device *hdev) connected = steam->connected && !steam->client_opened; spin_unlock_irqrestore(&steam->lock, flags); - if (connected) { - steam_set_lizard_mode(steam, lizard_mode); - steam_input_register(steam); - steam_sensors_register(steam); - } + schedule_work(&steam->unregister_work); } static int steam_client_ll_raw_request(struct hid_device *hdev, @@ -1231,6 +1252,7 @@ static int steam_probe(struct hid_device *hdev, INIT_LIST_HEAD(&steam->list); INIT_WORK(&steam->rumble_work, steam_haptic_rumble_cb); steam->sensor_timestamp_us = 0; + INIT_WORK(&steam->unregister_work, steam_work_unregister_cb); /* * With the real steam controller interface, do not connect hidraw. @@ -1291,6 +1313,7 @@ static int steam_probe(struct hid_device *hdev, cancel_work_sync(&steam->work_connect); cancel_delayed_work_sync(&steam->mode_switch); cancel_work_sync(&steam->rumble_work); + cancel_work_sync(&steam->unregister_work); return ret; } @@ -1307,6 +1330,7 @@ static void steam_remove(struct hid_device *hdev) cancel_delayed_work_sync(&steam->mode_switch); cancel_work_sync(&steam->work_connect); cancel_work_sync(&steam->rumble_work); + cancel_work_sync(&steam->unregister_work); hid_destroy_device(steam->client_hdev); steam->client_hdev = NULL; steam->client_opened = 0; -- GitLab From b051ffa2aeb2a60e092387b6fb2af1ad42f51a3c Mon Sep 17 00:00:00 2001 From: Vicki Pfau Date: Tue, 4 Feb 2025 19:55:29 -0800 Subject: [PATCH 353/989] HID: hid-steam: Don't use cancel_delayed_work_sync in IRQ context Lockdep reported that, as steam_do_deck_input_event is called from steam_raw_event inside of an IRQ context, it can lead to issues if that IRQ occurs while the work to be cancelled is running. By using cancel_delayed_work, this issue can be avoided. 
The exact ordering of the work and the event processing is not super important, so this is safe. Fixes: cd438e57dd05 ("HID: hid-steam: Add gamepad-only mode switched to by holding options") Signed-off-by: Vicki Pfau Signed-off-by: Jiri Kosina --- drivers/hid/hid-steam.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/hid-steam.c b/drivers/hid/hid-steam.c index 5a17714fedea0..c9e65e9088b31 100644 --- a/drivers/hid/hid-steam.c +++ b/drivers/hid/hid-steam.c @@ -1617,7 +1617,7 @@ static void steam_do_deck_input_event(struct steam_device *steam, if (!(b9 & BIT(6)) && steam->did_mode_switch) { steam->did_mode_switch = false; - cancel_delayed_work_sync(&steam->mode_switch); + cancel_delayed_work(&steam->mode_switch); } else if (!steam->client_opened && (b9 & BIT(6)) && !steam->did_mode_switch) { steam->did_mode_switch = true; schedule_delayed_work(&steam->mode_switch, 45 * HZ / 100); -- GitLab From 02458fbfaa0170aabf8506f7d4ed054f02414251 Mon Sep 17 00:00:00 2001 From: Rupinderjit Singh Date: Thu, 6 Feb 2025 15:58:03 +0000 Subject: [PATCH 354/989] gpu: host1x: Fix a use of uninitialized mutex commit c8347f915e67 ("gpu: host1x: Fix boot regression for Tegra") caused a use of uninitialized mutex leading to below warning when CONFIG_DEBUG_MUTEXES and CONFIG_DEBUG_LOCK_ALLOC are enabled. [ 41.662843] ------------[ cut here ]------------ [ 41.663012] DEBUG_LOCKS_WARN_ON(lock->magic != lock) [ 41.663035] WARNING: CPU: 4 PID: 794 at kernel/locking/mutex.c:587 __mutex_lock+0x670/0x878 [ 41.663458] Modules linked in: rtw88_8822c(+) bluetooth(+) rtw88_pci rtw88_core mac80211 aquantia libarc4 crc_itu_t cfg80211 tegra194_cpufreq dwmac_tegra(+) arm_dsu_pmu stmmac_platform stmmac pcs_xpcs rfkill at24 host1x(+) tegra_bpmp_thermal ramoops reed_solomon fuse loop nfnetlink xfs mmc_block rpmb_core ucsi_ccg ina3221 crct10dif_ce xhci_tegra ghash_ce lm90 sha2_ce sha256_arm64 sha1_ce sdhci_tegra pwm_fan sdhci_pltfm sdhci gpio_keys rtc_tegra cqhci mmc_core phy_tegra_xusb i2c_tegra tegra186_gpc_dma i2c_tegra_bpmp spi_tegra114 dm_mirror dm_region_hash dm_log dm_mod [ 41.665078] CPU: 4 UID: 0 PID: 794 Comm: (udev-worker) Not tainted 6.11.0-29.31_1538613708.el10.aarch64+debug #1 [ 41.665838] Hardware name: NVIDIA NVIDIA Jetson AGX Orin Developer Kit/Jetson, BIOS 36.3.0-gcid-35594366 02/26/2024 [ 41.672555] pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 41.679636] pc : __mutex_lock+0x670/0x878 [ 41.683834] lr : __mutex_lock+0x670/0x878 [ 41.688035] sp : ffff800084b77090 [ 41.691446] x29: ffff800084b77160 x28: ffffdd4bebf7b000 x27: ffffdd4be96b1000 [ 41.698799] x26: 1fffe0002308361c x25: 1ffff0001096ee18 x24: 0000000000000000 [ 41.706149] x23: 0000000000000000 x22: 0000000000000002 x21: ffffdd4be6e3c7a0 [ 41.713500] x20: ffff800084b770f0 x19: ffff00011841b1e8 x18: 0000000000000000 [ 41.720675] x17: 0000000000000000 x16: 0000000000000000 x15: 0720072007200720 [ 41.728023] x14: 0000000000000000 x13: 0000000000000001 x12: ffff6001a96eaab3 [ 41.735375] x11: 1fffe001a96eaab2 x10: ffff6001a96eaab2 x9 : ffffdd4be4838bbc [ 41.742723] x8 : 00009ffe5691554e x7 : ffff000d4b755593 x6 : 0000000000000001 [ 41.749985] x5 : ffff000d4b755590 x4 : 1fffe0001d88f001 x3 : dfff800000000000 [ 41.756988] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff0000ec478000 [ 41.764251] Call trace: [ 41.766695] __mutex_lock+0x670/0x878 [ 41.770373] mutex_lock_nested+0x2c/0x40 [ 41.774134] host1x_intr_start+0x54/0xf8 [host1x] [ 41.778863] host1x_runtime_resume+0x150/0x228 [host1x] [ 41.783935] 
pm_generic_runtime_resume+0x84/0xc8 [ 41.788485] __rpm_callback+0xa0/0x478 [ 41.792422] rpm_callback+0x15c/0x1a8 [ 41.795922] rpm_resume+0x698/0xc08 [ 41.799597] __pm_runtime_resume+0xa8/0x140 [ 41.803621] host1x_probe+0x810/0xbc0 [host1x] [ 41.807909] platform_probe+0xcc/0x1a8 [ 41.811845] really_probe+0x188/0x800 [ 41.815347] __driver_probe_device+0x164/0x360 [ 41.819810] driver_probe_device+0x64/0x1a8 [ 41.823834] __driver_attach+0x180/0x490 [ 41.827773] bus_for_each_dev+0x104/0x1a0 [ 41.831797] driver_attach+0x44/0x68 [ 41.835296] bus_add_driver+0x23c/0x4e8 [ 41.839235] driver_register+0x15c/0x3a8 [ 41.843170] __platform_register_drivers+0xa4/0x208 [ 41.848159] tegra_host1x_init+0x4c/0xff8 [host1x] [ 41.853147] do_one_initcall+0xd4/0x380 [ 41.856997] do_init_module+0x1dc/0x698 [ 41.860758] load_module+0xc70/0x1300 [ 41.864435] __do_sys_init_module+0x1a8/0x1d0 [ 41.868721] __arm64_sys_init_module+0x74/0xb0 [ 41.873183] invoke_syscall.constprop.0+0xdc/0x1e8 [ 41.877997] do_el0_svc+0x154/0x1d0 [ 41.881671] el0_svc+0x54/0x140 [ 41.884820] el0t_64_sync_handler+0x120/0x130 [ 41.889285] el0t_64_sync+0x1a4/0x1a8 [ 41.892960] irq event stamp: 69737 [ 41.896370] hardirqs last enabled at (69737): [] _raw_spin_unlock_irqrestore+0x44/0xe8 [ 41.905739] hardirqs last disabled at (69736): [] clk_enable_lock+0x98/0x198 [ 41.914314] softirqs last enabled at (68082): [] handle_softirqs+0x4c8/0x890 [ 41.922977] softirqs last disabled at (67945): [] __do_softirq+0x1c/0x28 [ 41.931289] ---[ end trace 0000000000000000 ]--- Inside the probe function when pm_runtime_enable() is called, the PM core invokes a resume callback if the device Host1x is in a suspended state. As it can be seen in the logs above, this leads to host1x_intr_start() function call which is trying to acquire a mutex lock. But, the function host_intr_init() only gets called after the pm_runtime_enable() where mutex is initialised leading to the use of mutex prior to its initialisation. Fix this by moving the mutex initialisation prior to the runtime PM enablement function pm_runtime_enable() in probe. 
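The general rule this illustrates: once runtime PM is enabled in probe, the PM core may invoke the driver's resume callback (as the splat above shows), so every lock and data structure that callback touches must already be initialized. A hypothetical skeleton of the safe ordering, with illustrative names rather than the actual host1x code:

  struct my_host {
          struct mutex intr_mutex;
  };

  static int my_probe(struct platform_device *pdev)
  {
          struct my_host *host;

          host = devm_kzalloc(&pdev->dev, sizeof(*host), GFP_KERNEL);
          if (!host)
                  return -ENOMEM;

          mutex_init(&host->intr_mutex);  /* anything ->runtime_resume() uses goes first */
          platform_set_drvdata(pdev, host);

          pm_runtime_enable(&pdev->dev);  /* resume callback may run from this point on */
          return 0;
  }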
Fixes: c8347f915e67 ("gpu: host1x: Fix boot regression for Tegra") Signed-off-by: Rupinderjit Singh Reviewed-by: Jon Hunter Tested-by: Jon Hunter Signed-off-by: Thierry Reding Link: https://patchwork.ozlabs.org/project/linux-tegra/patch/20250206155803.201942-1-rusingh@redhat.com/ --- drivers/gpu/host1x/dev.c | 2 ++ drivers/gpu/host1x/intr.c | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/host1x/dev.c b/drivers/gpu/host1x/dev.c index 7b1d091f3c090..46cae925b0959 100644 --- a/drivers/gpu/host1x/dev.c +++ b/drivers/gpu/host1x/dev.c @@ -619,6 +619,8 @@ static int host1x_probe(struct platform_device *pdev) goto free_contexts; } + mutex_init(&host->intr_mutex); + pm_runtime_enable(&pdev->dev); err = devm_tegra_core_dev_init_opp_table_common(&pdev->dev); diff --git a/drivers/gpu/host1x/intr.c b/drivers/gpu/host1x/intr.c index b3285dd101804..f77a678949e96 100644 --- a/drivers/gpu/host1x/intr.c +++ b/drivers/gpu/host1x/intr.c @@ -104,8 +104,6 @@ int host1x_intr_init(struct host1x *host) unsigned int id; int i, err; - mutex_init(&host->intr_mutex); - for (id = 0; id < host1x_syncpt_nb_pts(host); ++id) { struct host1x_syncpt *syncpt = &host->syncpt[id]; -- GitLab From 3b32b7f638fe61e9d29290960172f4e360e38233 Mon Sep 17 00:00:00 2001 From: Su Hui Date: Sun, 19 Jan 2025 10:58:29 +0800 Subject: [PATCH 355/989] drm/panthor: avoid garbage value in panthor_ioctl_dev_query() 'priorities_info' is uninitialized, and the uninitialized value is copied to user object when calling PANTHOR_UOBJ_SET(). Using memset to initialize 'priorities_info' to avoid this garbage value problem. Fixes: f70000ef2352 ("drm/panthor: Add DEV_QUERY_GROUP_PRIORITIES_INFO dev query") Signed-off-by: Su Hui Reviewed-by: Dan Carpenter Reviewed-by: Boris Brezillon Reviewed-by: Steven Price Signed-off-by: Boris Brezillon Link: https://patchwork.freedesktop.org/patch/msgid/20250119025828.1168419-1-suhui@nfschina.com --- drivers/gpu/drm/panthor/panthor_drv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c index d5dcd3d1b33a0..08136e790ca0a 100644 --- a/drivers/gpu/drm/panthor/panthor_drv.c +++ b/drivers/gpu/drm/panthor/panthor_drv.c @@ -802,6 +802,7 @@ static void panthor_query_group_priorities_info(struct drm_file *file, { int prio; + memset(arg, 0, sizeof(*arg)); for (prio = PANTHOR_GROUP_PRIORITY_REALTIME; prio >= 0; prio--) { if (!group_priority_permit(file, prio)) arg->allowed_mask |= BIT(prio); -- GitLab From 511121a48bbd12df4ae50a099a8936e833df8c46 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 5 Feb 2025 19:42:01 +0100 Subject: [PATCH 356/989] MAINTAINERS: Move Pavel to kernel.org address I need to filter my emails better, switch to pavel@kernel.org address to help with that. 
Signed-off-by: Pavel Machek Signed-off-by: Linus Torvalds --- CREDITS | 6 ++---- MAINTAINERS | 10 +++++----- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/CREDITS b/CREDITS index 1f9f0f078b4ae..53d11a46fd698 100644 --- a/CREDITS +++ b/CREDITS @@ -2515,11 +2515,9 @@ D: SLS distribution D: Initial implementation of VC's, pty's and select() N: Pavel Machek -E: pavel@ucw.cz +E: pavel@kernel.org P: 4096R/92DFCE96 4FA7 9EEF FCD4 C44F C585 B8C7 C060 2241 92DF CE96 -D: Softcursor for vga, hypertech cdrom support, vcsa bugfix, nbd, -D: sun4/330 port, capabilities for elf, speedup for rm on ext2, USB, -D: work on suspend-to-ram/disk, killing duplicates from ioctl32, +D: NBD, Sun4/330 port, USB, work on suspend-to-ram/disk, D: Altera SoCFPGA and Nokia N900 support. S: Czech Republic diff --git a/MAINTAINERS b/MAINTAINERS index 873aa2cce4d7f..157818de0b554 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9418,7 +9418,7 @@ F: fs/freevxfs/ FREEZER M: "Rafael J. Wysocki" -M: Pavel Machek +M: Pavel Machek L: linux-pm@vger.kernel.org S: Supported F: Documentation/power/freezing-of-tasks.rst @@ -10253,7 +10253,7 @@ F: drivers/video/fbdev/hgafb.c HIBERNATION (aka Software Suspend, aka swsusp) M: "Rafael J. Wysocki" -M: Pavel Machek +M: Pavel Machek L: linux-pm@vger.kernel.org S: Supported B: https://bugzilla.kernel.org @@ -13124,8 +13124,8 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/har F: scripts/leaking_addresses.pl LED SUBSYSTEM -M: Pavel Machek M: Lee Jones +M: Pavel Machek L: linux-leds@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/lee/leds.git @@ -16823,7 +16823,7 @@ F: include/linux/tick.h F: kernel/time/tick*.* NOKIA N900 CAMERA SUPPORT (ET8EK8 SENSOR, AD5820 FOCUS) -M: Pavel Machek +M: Pavel Machek M: Sakari Ailus L: linux-media@vger.kernel.org S: Maintained @@ -22849,7 +22849,7 @@ F: drivers/sh/ SUSPEND TO RAM M: "Rafael J. Wysocki" M: Len Brown -M: Pavel Machek +M: Pavel Machek L: linux-pm@vger.kernel.org S: Supported B: https://bugzilla.kernel.org -- GitLab From 1b3291f00013c86a9bb349d6158a9a7a4f0334fe Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Fri, 7 Feb 2025 03:21:46 +0900 Subject: [PATCH 357/989] MAINTAINERS: Remove myself I no longer have any faith left in the kernel development process or community management approach. Apple/ARM platform development will continue downstream. If I feel like sending some patches upstream in the future myself for whatever subtree I may, or I may not. Anyone who feels like fighting the upstreaming fight themselves is welcome to do so. Signed-off-by: Hector Martin Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 157818de0b554..20c9e08712150 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2209,7 +2209,6 @@ F: sound/soc/codecs/cs42l84.* F: sound/soc/codecs/ssm3515.c ARM/APPLE MACHINE SUPPORT -M: Hector Martin M: Sven Peter R: Alyssa Rosenzweig L: asahi@lists.linux.dev -- GitLab From f354fc88a72ae83dacd68370f6fa040e5733bcfe Mon Sep 17 00:00:00 2001 From: WangYuli Date: Fri, 7 Feb 2025 15:08:55 +0800 Subject: [PATCH 358/989] kbuild: install-extmod-build: add missing quotation marks for CC variable While attempting to build a Debian packages with CC="ccache gcc", I saw the following error as builddeb builds linux-headers-$KERNELVERSION: make HOSTCC=ccache gcc VPATH= srcroot=. 
-f ./scripts/Makefile.build obj=debian/linux-headers-6.14.0-rc1/usr/src/linux-headers-6.14.0-rc1/scripts make[6]: *** No rule to make target 'gcc'. Stop. Upon investigation, it seems that one instance of $(CC) variable reference in ./scripts/package/install-extmod-build was missing quotation marks, causing the above error. Add the missing quotation marks around $(CC) to fix build. Fixes: 5f73e7d0386d ("kbuild: refactor cross-compiling linux-headers package") Co-developed-by: Mingcong Bai Signed-off-by: Mingcong Bai Tested-by: WangYuli Signed-off-by: WangYuli Signed-off-by: Masahiro Yamada --- scripts/package/install-extmod-build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/package/install-extmod-build b/scripts/package/install-extmod-build index bb6e23c1174ec..b724626ea0ca0 100755 --- a/scripts/package/install-extmod-build +++ b/scripts/package/install-extmod-build @@ -63,7 +63,7 @@ if [ "${CC}" != "${HOSTCC}" ]; then # Clear VPATH and srcroot because the source files reside in the output # directory. # shellcheck disable=SC2016 # $(MAKE), $(CC), and $(build) will be expanded by Make - "${MAKE}" run-command KBUILD_RUN_COMMAND='+$(MAKE) HOSTCC=$(CC) VPATH= srcroot=. $(build)='"${destdir}"/scripts + "${MAKE}" run-command KBUILD_RUN_COMMAND='+$(MAKE) HOSTCC="$(CC)" VPATH= srcroot=. $(build)='"${destdir}"/scripts rm -f "${destdir}/scripts/Kbuild" fi -- GitLab From 595170d4b660640289003d1881a9a6ef8ded6865 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Feb 2025 22:28:41 -0500 Subject: [PATCH 359/989] bcachefs: Fix marking reflink pointers to missing indirect extents reflink pointers to missing indirect extents aren't deleted, they just have an error bit set - in case the indirect extent somehow reappears. fsck/mark and sweep thus needs to ignore these errors. Also, they can be marked AUTOFIX now. Reported-by: Roland Vet Signed-off-by: Kent Overstreet --- fs/bcachefs/reflink.c | 2 ++ fs/bcachefs/sb-errors_format.h | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 93ba4f4e47cae..376fd0a6e868c 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -381,6 +381,8 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, not_found: if (flags & BTREE_TRIGGER_check_repair) { ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false); + if (ret == -BCH_ERR_missing_indirect_extent) + ret = 0; if (ret) goto err; } diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index ea0a18364751d..b86ec013d7d70 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -180,9 +180,9 @@ enum bch_fsck_flags { x(ptr_crc_nonce_mismatch, 162, 0) \ x(ptr_stripe_redundant, 163, 0) \ x(reservation_key_nr_replicas_invalid, 164, 0) \ - x(reflink_v_refcount_wrong, 165, 0) \ + x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \ x(reflink_v_pos_bad, 292, 0) \ - x(reflink_p_to_missing_reflink_v, 166, 0) \ + x(reflink_p_to_missing_reflink_v, 166, FSCK_AUTOFIX) \ x(reflink_refcount_underflow, 293, 0) \ x(stripe_pos_bad, 167, 0) \ x(stripe_val_size_bad, 168, 0) \ -- GitLab From 01af106a076352182b2916b143fc50272600bd81 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 21 Jan 2025 05:40:51 +0000 Subject: [PATCH 360/989] btrfs: fix two misuses of folio_shift() It is meaningless to shift a byte count by folio_shift(). The folio index is in units of PAGE_SIZE, not folio_size(). 
We can use folio_contains() to make this work for arbitrary-order folios, so remove the assertion that the folios are of order 0. Reviewed-by: Qu Wenruo Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d9f856358704f..6f64ee16744d6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -523,8 +523,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) u64 end; u32 len; - /* For now only order 0 folios are supported for data. */ - ASSERT(folio_order(folio) == 0); btrfs_debug(fs_info, "%s: bi_sector=%llu, err=%d, mirror=%u", __func__, bio->bi_iter.bi_sector, bio->bi_status, @@ -552,7 +550,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) if (likely(uptodate)) { loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> folio_shift(folio); /* * Zero out the remaining part if this range straddles @@ -561,9 +558,11 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) * Here we should only zero the range inside the folio, * not touch anything else. * - * NOTE: i_size is exclusive while end is inclusive. + * NOTE: i_size is exclusive while end is inclusive and + * folio_contains() takes PAGE_SIZE units. */ - if (folio_index(folio) == end_index && i_size <= end) { + if (folio_contains(folio, i_size >> PAGE_SHIFT) && + i_size <= end) { u32 zero_start = max(offset_in_folio(folio, i_size), offset_in_folio(folio, start)); u32 zero_len = offset_in_folio(folio, end) + 1 - @@ -956,7 +955,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, return ret; } - if (folio->index == last_byte >> folio_shift(folio)) { + if (folio_contains(folio, last_byte >> PAGE_SHIFT)) { size_t zero_offset = offset_in_folio(folio, last_byte); if (zero_offset) { -- GitLab From cb827db50a88aebec516151681adb6db10b688ee Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 6 Feb 2025 08:30:51 +0000 Subject: [PATCH 361/989] net: fib_rules: annotate data-races around rule->[io]ifindex rule->iifindex and rule->oifindex can be read without holding RTNL. Add READ_ONCE()/WRITE_ONCE() annotations where needed. 
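The annotations follow the usual pattern for a field written under RTNL but read from a lockless fast path: the writer publishes with WRITE_ONCE() and each lockless reader samples the field once with READ_ONCE(), which documents the intentional data race and keeps the compiler from tearing or re-reading the value. Condensed from the hunks below:

  /* writer, runs under RTNL */
  WRITE_ONCE(rule->iifindex, dev->ifindex);

  /* lockless reader in fib_rule_match() */
  iifindex = READ_ONCE(rule->iifindex);
  if (iifindex && iifindex != fl->flowi_iif)
          goto out;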
Fixes: 32affa5578f0 ("fib: rules: no longer hold RTNL in fib_nl_dumprule()") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250206083051.2494877-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/fib_rules.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index e684ba3ebb385..94a7872ab2318 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -37,8 +37,8 @@ static const struct fib_kuid_range fib_kuid_range_unset = { bool fib_rule_matchall(const struct fib_rule *rule) { - if (rule->iifindex || rule->oifindex || rule->mark || rule->tun_id || - rule->flags) + if (READ_ONCE(rule->iifindex) || READ_ONCE(rule->oifindex) || + rule->mark || rule->tun_id || rule->flags) return false; if (rule->suppress_ifgroup != -1 || rule->suppress_prefixlen != -1) return false; @@ -261,12 +261,14 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) { - int ret = 0; + int iifindex, oifindex, ret = 0; - if (rule->iifindex && (rule->iifindex != fl->flowi_iif)) + iifindex = READ_ONCE(rule->iifindex); + if (iifindex && (iifindex != fl->flowi_iif)) goto out; - if (rule->oifindex && (rule->oifindex != fl->flowi_oif)) + oifindex = READ_ONCE(rule->oifindex); + if (oifindex && (oifindex != fl->flowi_oif)) goto out; if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask) @@ -1041,14 +1043,14 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, if (rule->iifname[0]) { if (nla_put_string(skb, FRA_IIFNAME, rule->iifname)) goto nla_put_failure; - if (rule->iifindex == -1) + if (READ_ONCE(rule->iifindex) == -1) frh->flags |= FIB_RULE_IIF_DETACHED; } if (rule->oifname[0]) { if (nla_put_string(skb, FRA_OIFNAME, rule->oifname)) goto nla_put_failure; - if (rule->oifindex == -1) + if (READ_ONCE(rule->oifindex) == -1) frh->flags |= FIB_RULE_OIF_DETACHED; } @@ -1220,10 +1222,10 @@ static void attach_rules(struct list_head *rules, struct net_device *dev) list_for_each_entry(rule, rules, list) { if (rule->iifindex == -1 && strcmp(dev->name, rule->iifname) == 0) - rule->iifindex = dev->ifindex; + WRITE_ONCE(rule->iifindex, dev->ifindex); if (rule->oifindex == -1 && strcmp(dev->name, rule->oifname) == 0) - rule->oifindex = dev->ifindex; + WRITE_ONCE(rule->oifindex, dev->ifindex); } } @@ -1233,9 +1235,9 @@ static void detach_rules(struct list_head *rules, struct net_device *dev) list_for_each_entry(rule, rules, list) { if (rule->iifindex == dev->ifindex) - rule->iifindex = -1; + WRITE_ONCE(rule->iifindex, -1); if (rule->oifindex == dev->ifindex) - rule->oifindex = -1; + WRITE_ONCE(rule->oifindex, -1); } } -- GitLab From db5fd3cf8bf41b84b577b8ad5234ea95f327c9be Mon Sep 17 00:00:00 2001 From: Muhammad Adeel Date: Fri, 7 Feb 2025 14:24:32 +0000 Subject: [PATCH 362/989] cgroup: Remove steal time from usage_usec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CPU usage time is the time when user, system or both are using the CPU. Steal time is the time when CPU is waiting to be run by the Hypervisor. It should not be added to the CPU usage time, hence removing it from the usage_usec entry. 
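Put differently, after this change the root cgroup's cpu.stat reports, conceptually,

    usage_usec = user_usec + system_usec

with steal time (time a vCPU sat runnable but was not run by the hypervisor) no longer folded into the sum.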
Fixes: 936f2a70f2077 ("cgroup: add cpu.stat file to root cgroup") Acked-by: Axel Busch Acked-by: Michal Koutný Signed-off-by: Muhammad Adeel Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 5877974ece92c..aac91466279f1 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -590,7 +590,6 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat) cputime->sum_exec_runtime += user; cputime->sum_exec_runtime += sys; - cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL]; #ifdef CONFIG_SCHED_CORE bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE]; -- GitLab From 884c3a18dadfda326dffa364477cc027728219de Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Tue, 4 Feb 2025 10:25:16 -0700 Subject: [PATCH 363/989] bpf: verifier: Do not extract constant map keys for irrelevant maps Previously, we were trying to extract constant map keys for all bpf_map_lookup_elem(), regardless of map type. This is an issue if the map has a u64 key and the value is very high, as it can be interpreted as a negative signed value. This in turn is treated as an error value by check_func_arg() which causes a valid program to be incorrectly rejected. Fix by only extracting constant map keys for relevant maps. This fix works because nullness elision is only allowed for {PERCPU_}ARRAY maps, and keys for these are within u32 range. See next commit for an example via selftest. Acked-by: Eduard Zingerman Reported-by: Marc Hartmayer Reported-by: Ilya Leoshkevich Tested-by: Marc Hartmayer Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/aa868b642b026ff87ba6105ea151bc8693b35932.1738689872.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9971c03adfd5d..e9176a5ce215e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9206,6 +9206,8 @@ static s64 get_constant_map_key(struct bpf_verifier_env *env, return reg->var_off.value; } +static bool can_elide_value_nullness(enum bpf_map_type type); + static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, const struct bpf_func_proto *fn, @@ -9354,9 +9356,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL); if (err) return err; - meta->const_map_key = get_constant_map_key(env, reg, key_size); - if (meta->const_map_key < 0 && meta->const_map_key != -EOPNOTSUPP) - return meta->const_map_key; + if (can_elide_value_nullness(meta->map_ptr->map_type)) { + meta->const_map_key = get_constant_map_key(env, reg, key_size); + if (meta->const_map_key < 0 && meta->const_map_key != -EOPNOTSUPP) + return meta->const_map_key; + } break; case ARG_PTR_TO_MAP_VALUE: if (type_may_be_null(arg_type) && register_is_null(reg)) -- GitLab From 973cb1382ead401c476c82f20525e593ae84788f Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Tue, 4 Feb 2025 10:25:17 -0700 Subject: [PATCH 364/989] bpf: selftests: Test constant key extraction on irrelevant maps Test that very high constant map keys are not interpreted as an error value by the verifier. This would previously fail. 
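The sign confusion being guarded against can be reproduced in isolation with a few lines of plain C (illustrative only, not part of the selftest):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t key = 0xFFFFFFFFFFFFFFFFULL;  /* a perfectly valid u64 map key */
            int64_t  as_s64 = (int64_t)key;        /* ... but -1 once viewed as s64 */

            /* prints -1, indistinguishable from a negative error code */
            printf("%lld\n", (long long)as_s64);
            return 0;
    }
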
Acked-by: Eduard Zingerman Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/c0590b62eb9303f389b2f52c0c7e9cf22a358a30.1738689872.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_array_access.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_array_access.c b/tools/testing/selftests/bpf/progs/verifier_array_access.c index 29eb9568633ff..0a187ff725cc4 100644 --- a/tools/testing/selftests/bpf/progs/verifier_array_access.c +++ b/tools/testing/selftests/bpf/progs/verifier_array_access.c @@ -713,4 +713,19 @@ unsigned int non_stack_key_lookup(void) return val->index; } +SEC("socket") +__description("doesn't reject UINT64_MAX as s64 for irrelevant maps") +__success __retval(42) +unsigned int doesnt_reject_irrelevant_maps(void) +{ + __u64 key = 0xFFFFFFFFFFFFFFFF; + struct test_val *val; + + val = bpf_map_lookup_elem(&map_hash_48b, &key); + if (val) + return val->index; + + return 42; +} + char _license[] SEC("license") = "GPL"; -- GitLab From 7968c6581507052c1c6484ee6c5cbe07381e2dbc Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Tue, 4 Feb 2025 10:25:18 -0700 Subject: [PATCH 365/989] bpf: verifier: Disambiguate get_constant_map_key() errors Refactor get_constant_map_key() to disambiguate the constant key value from potential error values. In the case that the key is negative, it could be confused for an error. It's not currently an issue, as the verifier seems to track s32 spills as u32. So even if the program wrongly uses a negative value for an arraymap key, the verifier just thinks it's an impossibly high value which gets correctly discarded. Refactor anyways to make things cleaner and prevent potential future issues. Acked-by: Eduard Zingerman Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/dfe144259ae7cfc98aa63e1b388a14869a10632a.1738689872.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e9176a5ce215e..98354d7816789 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9149,10 +9149,11 @@ static int check_reg_const_str(struct bpf_verifier_env *env, return 0; } -/* Returns constant key value if possible, else negative error */ -static s64 get_constant_map_key(struct bpf_verifier_env *env, +/* Returns constant key value in `value` if possible, else negative error */ +static int get_constant_map_key(struct bpf_verifier_env *env, struct bpf_reg_state *key, - u32 key_size) + u32 key_size, + s64 *value) { struct bpf_func_state *state = func(env, key); struct bpf_reg_state *reg; @@ -9179,8 +9180,10 @@ static s64 get_constant_map_key(struct bpf_verifier_env *env, /* First handle precisely tracked STACK_ZERO */ for (i = off; i >= 0 && stype[i] == STACK_ZERO; i--) zero_size++; - if (zero_size >= key_size) + if (zero_size >= key_size) { + *value = 0; return 0; + } /* Check that stack contains a scalar spill of expected size */ if (!is_spilled_scalar_reg(&state->stack[spi])) @@ -9203,7 +9206,8 @@ static s64 get_constant_map_key(struct bpf_verifier_env *env, if (err < 0) return err; - return reg->var_off.value; + *value = reg->var_off.value; + return 0; } static bool can_elide_value_nullness(enum bpf_map_type type); @@ -9357,9 +9361,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, if (err) return err; if (can_elide_value_nullness(meta->map_ptr->map_type)) { - meta->const_map_key = 
get_constant_map_key(env, reg, key_size); - if (meta->const_map_key < 0 && meta->const_map_key != -EOPNOTSUPP) - return meta->const_map_key; + err = get_constant_map_key(env, reg, key_size, &meta->const_map_key); + if (err < 0) { + meta->const_map_key = -1; + if (err == -EOPNOTSUPP) + err = 0; + else + return err; + } } break; case ARG_PTR_TO_MAP_VALUE: -- GitLab From 011b0335903832facca86cd8ed05d7d8d94c9c76 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 6 Feb 2025 22:28:48 +0100 Subject: [PATCH 366/989] Revert "net: skb: introduce and use a single page frag cache" This reverts commit dbae2b062824 ("net: skb: introduce and use a single page frag cache"). The intended goal of such change was to counter a performance regression introduced by commit 3226b158e67c ("net: avoid 32 x truesize under-estimation for tiny skbs"). Unfortunately, the blamed commit introduces another regression for the virtio_net driver. Such a driver calls napi_alloc_skb() with a tiny size, so that the whole head frag could fit a 512-byte block. The single page frag cache uses a 1K fragment for such allocation, and the additional overhead, under small UDP packets flood, makes the page allocator a bottleneck. Thanks to commit bf9f1baa279f ("net: add dedicated kmem_cache for typical/small skb->head"), this revert does not re-introduce the original regression. Actually, in the relevant test on top of this revert, I measure a small but noticeable positive delta, just above noise level. The revert itself required some additional mangling due to the introduction of the SKB_HEAD_ALIGN() helper and local lock infra in the affected code. Suggested-by: Eric Dumazet Fixes: dbae2b062824 ("net: skb: introduce and use a single page frag cache") Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/e649212fde9f0fdee23909ca0d14158d32bb7425.1738877290.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - net/core/dev.c | 17 +++++++ net/core/skbuff.c | 103 ++------------------------------------ 3 files changed, 22 insertions(+), 99 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c0a86afb85daa..365f0e2098d13 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4115,7 +4115,6 @@ void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); -void napi_get_frags_check(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); static inline void napi_free_frags(struct napi_struct *napi) diff --git a/net/core/dev.c b/net/core/dev.c index b91658e8aedb4..55e356a68db66 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6920,6 +6920,23 @@ netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) list_add_rcu(&napi->dev_list, higher); /* adds after higher */ } +/* Double check that napi_get_frags() allocates skbs with + * skb->head being backed by slab, not a page fragment. + * This is to make sure bug fixed in 3226b158e67c + * ("net: avoid 32 x truesize under-estimation for tiny skbs") + * does not accidentally come back. 
+ */ +static void napi_get_frags_check(struct napi_struct *napi) +{ + struct sk_buff *skb; + + local_bh_disable(); + skb = napi_get_frags(napi); + WARN_ON_ONCE(skb && skb->head_frag); + napi_free_frags(napi); + local_bh_enable(); +} + void netif_napi_add_weight_locked(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a441613a1e6c1..6a99c453397fc 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -220,67 +220,9 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) #define NAPI_SKB_CACHE_BULK 16 #define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) -#if PAGE_SIZE == SZ_4K - -#define NAPI_HAS_SMALL_PAGE_FRAG 1 -#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc) - -/* specialized page frag allocator using a single order 0 page - * and slicing it into 1K sized fragment. Constrained to systems - * with a very limited amount of 1K fragments fitting a single - * page - to avoid excessive truesize underestimation - */ - -struct page_frag_1k { - void *va; - u16 offset; - bool pfmemalloc; -}; - -static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp) -{ - struct page *page; - int offset; - - offset = nc->offset - SZ_1K; - if (likely(offset >= 0)) - goto use_frag; - - page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); - if (!page) - return NULL; - - nc->va = page_address(page); - nc->pfmemalloc = page_is_pfmemalloc(page); - offset = PAGE_SIZE - SZ_1K; - page_ref_add(page, offset / SZ_1K); - -use_frag: - nc->offset = offset; - return nc->va + offset; -} -#else - -/* the small page is actually unused in this build; add dummy helpers - * to please the compiler and avoid later preprocessor's conditionals - */ -#define NAPI_HAS_SMALL_PAGE_FRAG 0 -#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false - -struct page_frag_1k { -}; - -static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) -{ - return NULL; -} - -#endif - struct napi_alloc_cache { local_lock_t bh_lock; struct page_frag_cache page; - struct page_frag_1k page_small; unsigned int skb_count; void *skb_cache[NAPI_SKB_CACHE_SIZE]; }; @@ -290,23 +232,6 @@ static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; -/* Double check that napi_get_frags() allocates skbs with - * skb->head being backed by slab, not a page fragment. - * This is to make sure bug fixed in 3226b158e67c - * ("net: avoid 32 x truesize under-estimation for tiny skbs") - * does not accidentally come back. - */ -void napi_get_frags_check(struct napi_struct *napi) -{ - struct sk_buff *skb; - - local_bh_disable(); - skb = napi_get_frags(napi); - WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); - napi_free_frags(napi); - local_bh_enable(); -} - void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); @@ -813,10 +738,8 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. 
- * When the small frag allocator is available, prefer it over kmalloc - * for small fragments */ - if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || + if (len <= SKB_WITH_OVERHEAD(1024) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, @@ -826,32 +749,16 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) goto skb_success; } + len = SKB_HEAD_ALIGN(len); + if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc = this_cpu_ptr(&napi_alloc_cache); - if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { - /* we are artificially inflating the allocation size, but - * that is not as bad as it may look like, as: - * - 'len' less than GRO_MAX_HEAD makes little sense - * - On most systems, larger 'len' values lead to fragment - * size above 512 bytes - * - kmalloc would use the kmalloc-1k slab for such values - * - Builds with smaller GRO_MAX_HEAD will very likely do - * little networking, as that implies no WiFi and no - * tunnels support, and 32 bits arches. - */ - len = SZ_1K; - data = page_frag_alloc_1k(&nc->page_small, gfp_mask); - pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small); - } else { - len = SKB_HEAD_ALIGN(len); - - data = page_frag_alloc(&nc->page, len, gfp_mask); - pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); - } + data = page_frag_alloc(&nc->page, len, gfp_mask); + pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); local_unlock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!data)) -- GitLab From 8784714d7f27045c7cb72456cf66705b73fbc804 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 6 Feb 2025 02:54:31 -0800 Subject: [PATCH 367/989] bpf: Handle allocation failure in acquire_lock_state The acquire_lock_state function needs to handle possible NULL values returned by acquire_reference_state, and return -ENOMEM. Fixes: 769b0f1c8214 ("bpf: Refactor {acquire,release}_reference_state") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250206105435.2159977-24-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 98354d7816789..60611df77957a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1501,6 +1501,8 @@ static int acquire_lock_state(struct bpf_verifier_env *env, int insn_idx, enum r struct bpf_reference_state *s; s = acquire_reference_state(env, insn_idx); + if (!s) + return -ENOMEM; s->type = type; s->id = id; s->ptr = ptr; -- GitLab From 8f6629c004b193d23612641c3607e785819e97ab Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 17 Oct 2024 10:09:22 -0700 Subject: [PATCH 368/989] kbuild: Move -Wenum-enum-conversion to W=2 -Wenum-enum-conversion was strengthened in clang-19 to warn for C, which caused the kernel to move it to W=1 in commit 75b5ab134bb5 ("kbuild: Move -Wenum-{compare-conditional,enum-conversion} into W=1") because there were numerous instances that would break builds with -Werror. Unfortunately, this is not a full solution, as more and more developers, subsystems, and distributors are building with W=1 as well, so they continue to see the numerous instances of this warning. 
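For reference, the construct the warning fires on looks roughly like this (hypothetical enums, not taken from the tree):

    enum hw_cap  { HW_CAP_FOO  = 0x1 };
    enum drv_cap { DRV_CAP_BAR = 0x2 };

    static unsigned int combined_caps(void)
    {
            /* arithmetic between two different enumeration types;
             * clang >= 19 warns here even in C
             */
            return HW_CAP_FOO | DRV_CAP_BAR;
    }
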
Since the move to W=1, there have not been many new instances that have appeared through various build reports and the ones that have appeared seem to be following similar existing patterns, suggesting that most instances of this warning will not be real issues. The only alternatives for silencing this warning are adding casts (which is generally seen as an ugly practice) or refactoring the enums to macro defines or a unified enum (which may be undesirable because of type safety in other parts of the code). Move the warning to W=2, where warnings that occur frequently but may be relevant should reside. Cc: stable@vger.kernel.org Fixes: 75b5ab134bb5 ("kbuild: Move -Wenum-{compare-conditional,enum-conversion} into W=1") Link: https://lore.kernel.org/ZwRA9SOcOjjLJcpi@google.com/ Signed-off-by: Nathan Chancellor Acked-by: Arnd Bergmann Signed-off-by: Linus Torvalds --- scripts/Makefile.extrawarn | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn index eb719f6d8d536..a7003c1e66c7d 100644 --- a/scripts/Makefile.extrawarn +++ b/scripts/Makefile.extrawarn @@ -133,7 +133,6 @@ KBUILD_CFLAGS += $(call cc-disable-warning, pointer-to-enum-cast) KBUILD_CFLAGS += -Wno-tautological-constant-out-of-range-compare KBUILD_CFLAGS += $(call cc-disable-warning, unaligned-access) KBUILD_CFLAGS += -Wno-enum-compare-conditional -KBUILD_CFLAGS += -Wno-enum-enum-conversion endif endif @@ -157,6 +156,10 @@ KBUILD_CFLAGS += -Wno-missing-field-initializers KBUILD_CFLAGS += -Wno-type-limits KBUILD_CFLAGS += -Wno-shift-negative-value +ifdef CONFIG_CC_IS_CLANG +KBUILD_CFLAGS += -Wno-enum-enum-conversion +endif + ifdef CONFIG_CC_IS_GCC KBUILD_CFLAGS += -Wno-maybe-uninitialized endif -- GitLab From d0b197b6505fe3788860fc2a81b3ce53cbecc69c Mon Sep 17 00:00:00 2001 From: Reyders Morales Date: Mon, 3 Feb 2025 23:47:20 +0100 Subject: [PATCH 369/989] Documentation/networking: fix basic node example document ISO 15765-2 In the current struct sockaddr_can tp is member of can_addr. tp is not member of struct sockaddr_can. Signed-off-by: Reyders Morales Reviewed-by: Simon Horman Acked-by: Oliver Hartkopp Link: https://patch.msgid.link/20250203224720.42530-1-reyders1@gmail.com Fixes: 67711e04254c ("Documentation: networking: document ISO 15765-2") Signed-off-by: Marc Kleine-Budde --- Documentation/networking/iso15765-2.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/iso15765-2.rst b/Documentation/networking/iso15765-2.rst index 0e9d960741783..37ebb2c417cb4 100644 --- a/Documentation/networking/iso15765-2.rst +++ b/Documentation/networking/iso15765-2.rst @@ -369,8 +369,8 @@ to their default. addr.can_family = AF_CAN; addr.can_ifindex = if_nametoindex("can0"); - addr.tp.tx_id = 0x18DA42F1 | CAN_EFF_FLAG; - addr.tp.rx_id = 0x18DAF142 | CAN_EFF_FLAG; + addr.can_addr.tp.tx_id = 0x18DA42F1 | CAN_EFF_FLAG; + addr.can_addr.tp.rx_id = 0x18DAF142 | CAN_EFF_FLAG; ret = bind(s, (struct sockaddr *)&addr, sizeof(addr)); if (ret < 0) -- GitLab From 44de577e61ed239db09f0da9d436866bef9b77dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20H=C3=B6lzl?= Date: Wed, 5 Feb 2025 18:46:51 +0100 Subject: [PATCH 370/989] can: j1939: j1939_sk_send_loop(): fix unable to send messages with data length zero MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The J1939 standard requires the transmission of messages of length 0. 
For example proprietary messages are specified with a data length of 0 to 1785. The transmission of such messages is not possible. Sending results in no error being returned but no corresponding can frame being generated. Enable the transmission of zero length J1939 messages. In order to facilitate this two changes are necessary: 1) If the transmission of a new message is requested from user space the message is segmented in j1939_sk_send_loop(). Let the segmentation take into account zero length messages, do not terminate immediately, queue the corresponding skb. 2) j1939_session_skb_get_by_offset() selects the next skb to transmit for a session. Take into account that there might be zero length skbs in the queue. Signed-off-by: Alexander Hölzl Acked-by: Oleksij Rempel Link: https://patch.msgid.link/20250205174651.103238-1-alexander.hoelzl@gmx.net Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Cc: stable@vger.kernel.org [mkl: commit message rephrased] Signed-off-by: Marc Kleine-Budde --- net/can/j1939/socket.c | 4 ++-- net/can/j1939/transport.c | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 305dd72c844c7..17226b2341d03 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -1132,7 +1132,7 @@ static int j1939_sk_send_loop(struct j1939_priv *priv, struct sock *sk, todo_size = size; - while (todo_size) { + do { struct j1939_sk_buff_cb *skcb; segment_size = min_t(size_t, J1939_MAX_TP_PACKET_SIZE, @@ -1177,7 +1177,7 @@ static int j1939_sk_send_loop(struct j1939_priv *priv, struct sock *sk, todo_size -= segment_size; session->total_queued_size += segment_size; - } + } while (todo_size); switch (ret) { case 0: /* OK */ diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 95f7a7e65a73f..9b72d118d756d 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -382,8 +382,9 @@ sk_buff *j1939_session_skb_get_by_offset(struct j1939_session *session, skb_queue_walk(&session->skb_queue, do_skb) { do_skcb = j1939_skb_to_cb(do_skb); - if (offset_start >= do_skcb->offset && - offset_start < (do_skcb->offset + do_skb->len)) { + if ((offset_start >= do_skcb->offset && + offset_start < (do_skcb->offset + do_skb->len)) || + (offset_start == 0 && do_skcb->offset == 0 && do_skb->len == 0)) { skb = do_skb; } } -- GitLab From 9bd24927e3eeb85642c7baa3b28be8bea6c2a078 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Tue, 14 Jan 2025 18:21:38 +0300 Subject: [PATCH 371/989] can: ctucanfd: handle skb allocation failure If skb allocation fails, the pointer to struct can_frame is NULL. This is actually handled everywhere inside ctucan_err_interrupt() except for the only place. Add the missed NULL check. Found by Linux Verification Center (linuxtesting.org) with SVACE static analysis tool. 
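In simplified form, the pattern being completed is (sketch, not a verbatim copy of the driver):

    skb = alloc_can_err_skb(ndev, &cf);    /* on failure both skb and cf are NULL */

    /* ... bus-state bookkeeping proceeds either way ... */

    if (skb) {                             /* only touch the frame if it exists */
            cf->can_id |= CAN_ERR_CNT;
            cf->data[6] = bec.txerr;
            cf->data[7] = bec.rxerr;
    }
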
Fixes: 2dcb8e8782d8 ("can: ctucanfd: add support for CTU CAN FD open-source IP core - bus independent part.") Cc: stable@vger.kernel.org Signed-off-by: Fedor Pchelkin Acked-by: Pavel Pisa Reviewed-by: Vincent Mailhol Link: https://patch.msgid.link/20250114152138.139580-1-pchelkin@ispras.ru Signed-off-by: Marc Kleine-Budde --- drivers/net/can/ctucanfd/ctucanfd_base.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/can/ctucanfd/ctucanfd_base.c b/drivers/net/can/ctucanfd/ctucanfd_base.c index 64c349fd46007..f65c1a1e05ccd 100644 --- a/drivers/net/can/ctucanfd/ctucanfd_base.c +++ b/drivers/net/can/ctucanfd/ctucanfd_base.c @@ -867,10 +867,12 @@ static void ctucan_err_interrupt(struct net_device *ndev, u32 isr) } break; case CAN_STATE_ERROR_ACTIVE: - cf->can_id |= CAN_ERR_CNT; - cf->data[1] = CAN_ERR_CRTL_ACTIVE; - cf->data[6] = bec.txerr; - cf->data[7] = bec.rxerr; + if (skb) { + cf->can_id |= CAN_ERR_CNT; + cf->data[1] = CAN_ERR_CRTL_ACTIVE; + cf->data[6] = bec.txerr; + cf->data[7] = bec.rxerr; + } break; default: netdev_warn(ndev, "unhandled error state (%d:%s)!\n", -- GitLab From 257a2cd3eb578ee63d6bf90475dc4f4b16984139 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 12 Jan 2025 13:41:52 +0100 Subject: [PATCH 372/989] can: c_can: fix unbalanced runtime PM disable in error path Runtime PM is enabled as one of the last steps of probe(), so all earlier gotos to "exit_free_device" label were not correct and were leading to unbalanced runtime PM disable depth. Fixes: 6e2fe01dd6f9 ("can: c_can: move runtime PM enable/disable to c_can_platform") Cc: stable@vger.kernel.org Signed-off-by: Krzysztof Kozlowski Reviewed-by: Vincent Mailhol Link: https://patch.msgid.link/20250112-syscon-phandle-args-can-v1-1-314d9549906f@linaro.org Signed-off-by: Marc Kleine-Budde --- drivers/net/can/c_can/c_can_platform.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/can/c_can/c_can_platform.c b/drivers/net/can/c_can/c_can_platform.c index 6cba9717a6d87..399844809bbea 100644 --- a/drivers/net/can/c_can/c_can_platform.c +++ b/drivers/net/can/c_can/c_can_platform.c @@ -385,15 +385,16 @@ static int c_can_plat_probe(struct platform_device *pdev) if (ret) { dev_err(&pdev->dev, "registering %s failed (err=%d)\n", KBUILD_MODNAME, ret); - goto exit_free_device; + goto exit_pm_runtime; } dev_info(&pdev->dev, "%s device registered (regs=%p, irq=%d)\n", KBUILD_MODNAME, priv->base, dev->irq); return 0; -exit_free_device: +exit_pm_runtime: pm_runtime_disable(priv->device); +exit_free_device: free_c_can_dev(dev); exit: dev_err(&pdev->dev, "probe failed\n"); -- GitLab From a1ad2109ce41c9e3912dadd07ad8a9c640064ffb Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 5 Feb 2025 00:48:15 +0900 Subject: [PATCH 373/989] can: etas_es58x: fix potential NULL pointer dereference on udev->serial The driver assumed that es58x_dev->udev->serial could never be NULL. While this is true on commercially available devices, an attacker could spoof the device identity providing a NULL USB serial number. That would trigger a NULL pointer dereference. Add a check on es58x_dev->udev->serial before accessing it. 
Reported-by: yan kang Reported-by: yue sun Closes: https://lore.kernel.org/linux-can/SY8P300MB0421E0013C0EBD2AA46BA709A1F42@SY8P300MB0421.AUSP300.PROD.OUTLOOK.COM/ Fixes: 9f06631c3f1f ("can: etas_es58x: export product information through devlink_ops::info_get()") Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250204154859.9797-2-mailhol.vincent@wanadoo.fr Cc: stable@vger.kernel.org Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/etas_es58x/es58x_devlink.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/can/usb/etas_es58x/es58x_devlink.c b/drivers/net/can/usb/etas_es58x/es58x_devlink.c index eee20839d96fd..0d155eb1b9e99 100644 --- a/drivers/net/can/usb/etas_es58x/es58x_devlink.c +++ b/drivers/net/can/usb/etas_es58x/es58x_devlink.c @@ -248,7 +248,11 @@ static int es58x_devlink_info_get(struct devlink *devlink, return ret; } - return devlink_info_serial_number_put(req, es58x_dev->udev->serial); + if (es58x_dev->udev->serial) + ret = devlink_info_serial_number_put(req, + es58x_dev->udev->serial); + + return ret; } const struct devlink_ops es58x_dl_ops = { -- GitLab From f7f0adfe64de08803990dc4cbecd2849c04e314a Mon Sep 17 00:00:00 2001 From: Robin van der Gracht Date: Mon, 27 Jan 2025 13:16:44 +0100 Subject: [PATCH 374/989] can: rockchip: rkcanfd_handle_rx_fifo_overflow_int(): bail out if skb cannot be allocated Fix NULL pointer check in rkcanfd_handle_rx_fifo_overflow_int() to bail out if skb cannot be allocated. Fixes: ff60bfbaf67f ("can: rockchip_canfd: add driver for Rockchip CAN-FD controller") Cc: stable@vger.kernel.org Signed-off-by: Robin van der Gracht Link: https://patch.msgid.link/20250208-fix-rockchip-canfd-v1-1-ec533c8a9895@pengutronix.de Signed-off-by: Marc Kleine-Budde --- drivers/net/can/rockchip/rockchip_canfd-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/can/rockchip/rockchip_canfd-core.c b/drivers/net/can/rockchip/rockchip_canfd-core.c index df18c85fc0784..d9a937ba126c3 100644 --- a/drivers/net/can/rockchip/rockchip_canfd-core.c +++ b/drivers/net/can/rockchip/rockchip_canfd-core.c @@ -622,7 +622,7 @@ rkcanfd_handle_rx_fifo_overflow_int(struct rkcanfd_priv *priv) netdev_dbg(priv->ndev, "RX-FIFO overflow\n"); skb = rkcanfd_alloc_can_err_skb(priv, &cf, ×tamp); - if (skb) + if (!skb) return 0; rkcanfd_get_berr_counter_corrected(priv, &bec); -- GitLab From c8c9b1d2d5b4377c72a979f5a26e842a869aefc9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 8 Feb 2025 00:15:11 -0500 Subject: [PATCH 375/989] fgraph: Fix set_graph_notrace with setting TRACE_GRAPH_NOTRACE_BIT The code was restructured where the function graph notrace code, that would not trace a function and all its children is done by setting a NOTRACE flag when the function that is not to be traced is hit. There's a TRACE_GRAPH_NOTRACE_BIT which defines the bit in the flags and a TRACE_GRAPH_NOTRACE which is the mask with that bit set. But the restructuring used TRACE_GRAPH_NOTRACE_BIT when it should have used TRACE_GRAPH_NOTRACE. For example: # cd /sys/kernel/tracing # echo set_track_prepare stack_trace_save > set_graph_notrace # echo function_graph > current_tracer # cat trace [..] 
0) | __slab_free() { 0) | free_to_partial_list() { 0) | arch_stack_walk() { 0) | __unwind_start() { 0) 0.501 us | get_stack_info(); Where a non filter trace looks like: # echo > set_graph_notrace # cat trace 0) | free_to_partial_list() { 0) | set_track_prepare() { 0) | stack_trace_save() { 0) | arch_stack_walk() { 0) | __unwind_start() { Where the filter should look like: # cat trace 0) | free_to_partial_list() { 0) | _raw_spin_lock_irqsave() { 0) 0.350 us | preempt_count_add(); 0) 0.351 us | do_raw_spin_lock(); 0) 2.440 us | } Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250208001511.535be150@batman.local.home Fixes: b84214890a9bc ("function_graph: Move graph notrace bit to shadow stack global var") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions_graph.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 54d850997c0a1..136c750b0b4da 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -198,7 +198,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace, * returning from the function. */ if (ftrace_graph_notrace_addr(trace->func)) { - *task_var |= TRACE_GRAPH_NOTRACE_BIT; + *task_var |= TRACE_GRAPH_NOTRACE; /* * Need to return 1 to have the return called * that will clear the NOTRACE bit. -- GitLab From 3724062ca2b1364f02cf44dbea1a552227844ad1 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 14 Jan 2025 13:57:58 -0800 Subject: [PATCH 376/989] objtool: Ignore dangling jump table entries Clang sometimes leaves dangling unused jump table entries which point to the end of the function. Ignore them. Closes: https://lore.kernel.org/20250113235835.vqgvb7cdspksy5dn@jpoimboe Reported-by: Klaus Kusche Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/ee25c0b7e80113e950bd1d4c208b671d35774ff4.1736891751.git.jpoimboe@kernel.org --- tools/objtool/check.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 753dbc4f81985..3520a45ebde8b 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1975,6 +1975,14 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, reloc_addend(reloc) == pfunc->offset) break; + /* + * Clang sometimes leaves dangling unused jump table entries + * which point to the end of the function. Ignore them. + */ + if (reloc->sym->sec == pfunc->sec && + reloc_addend(reloc) == pfunc->offset + pfunc->len) + goto next; + dest_insn = find_insn(file, reloc->sym->sec, reloc_addend(reloc)); if (!dest_insn) break; @@ -1992,6 +2000,7 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, alt->insn = dest_insn; alt->next = insn->alts; insn->alts = alt; +next: prev_offset = reloc_offset(reloc); } -- GitLab From 7e501637bd5b702a2fa627e903a0025654110e1e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Feb 2025 11:12:08 +0100 Subject: [PATCH 377/989] objtool: Move dodgy linker warn to verbose The lld.ld borkage is fixed in the latest llvm release (?) but will not be backported, meaning we're stuck with broken linker for a fair while. Lets not spam all clang build logs and move warning to verbose. 
Signed-off-by: Peter Zijlstra (Intel) --- tools/objtool/check.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 3520a45ebde8b..497cb8dfb3eb3 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2273,7 +2273,7 @@ static int read_annotate(struct objtool_file *file, if (sec->sh.sh_entsize != 8) { static bool warned = false; - if (!warned) { + if (!warned && opts.verbose) { WARN("%s: dodgy linker, sh_entsize != 8", sec->name); warned = true; } -- GitLab From bcc6244e13b4d4903511a1ea84368abf925031c0 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 29 Jan 2025 20:53:03 +0100 Subject: [PATCH 378/989] sched: Clarify wake_up_q()'s write to task->wake_q.next Clarify that wake_up_q() does an atomic write to task->wake_q.next, after which a concurrent __wake_q_add() can immediately overwrite task->wake_q.next again. Signed-off-by: Jann Horn Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250129-sched-wakeup-prettier-v1-1-2f51f5f663fa@google.com --- kernel/sched/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3e5a6bf587f91..8931d9b1e895e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1055,9 +1055,10 @@ void wake_up_q(struct wake_q_head *head) struct task_struct *task; task = container_of(node, struct task_struct, wake_q); - /* Task can safely be re-inserted now: */ node = node->next; - task->wake_q.next = NULL; + /* pairs with cmpxchg_relaxed() in __wake_q_add() */ + WRITE_ONCE(task->wake_q.next, NULL); + /* Task can safely be re-inserted now. */ /* * wake_up_process() executes a full barrier, which pairs with -- GitLab From 469c76a83bb9f6b2c7b2989c46617c4fe01fee79 Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Wed, 29 Jan 2025 08:05:14 +0000 Subject: [PATCH 379/989] perf/x86/rapl: Fix the error checking order After the commit b4943b8bfc41 ("perf/x86/rapl: Add core energy counter support for AMD CPUs"), the default "perf record"/"perf top" command is broken in systems where there isn't a PMU registered for type PERF_TYPE_RAW. This is due to the change in order of error checks in rapl_pmu_event_init() Due to which we return -EINVAL instead of -ENOENT, when we reach here from the fallback loop in perf_init_event(). Move the "PMU and event type match" back to the beginning of the function so that we return -ENOENT early on. 
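The ordering matters because of how the core probes PMUs; a minimal sketch of the rule (illustrative function, using the real struct perf_event/pmu fields):

    /* perf_init_event() walks the registered PMUs and only moves on to the
     * next one when event_init() returns -ENOENT, so "is this event even
     * ours?" must be decided before any validation that returns -EINVAL.
     */
    static int sketch_event_init(struct perf_event *event)
    {
            if (event->attr.type != event->pmu->type)
                    return -ENOENT;         /* not ours: let the core keep looking */

            if (event->attr.sample_period)
                    return -EINVAL;         /* ours, but the request is invalid */

            return 0;
    }
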
Closes: https://lore.kernel.org/all/uv7mz6vew2bzgre5jdpmwldxljp5djzmuiksqdcdwipfm4zm7w@ribobcretidk/ Fixes: b4943b8bfc41 ("perf/x86/rapl: Add core energy counter support for AMD CPUs") Reported-by: Koichiro Den Signed-off-by: Dhananjay Ugwekar Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250129080513.30353-1-dhananjay.ugwekar@amd.com --- arch/x86/events/rapl.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index d3bb3865c1b1f..4952faf03e82d 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -370,6 +370,10 @@ static int rapl_pmu_event_init(struct perf_event *event) unsigned int rapl_pmu_idx; struct rapl_pmus *rapl_pmus; + /* only look at RAPL events */ + if (event->attr.type != event->pmu->type) + return -ENOENT; + /* unsupported modes and filters */ if (event->attr.sample_period) /* no sampling */ return -EINVAL; @@ -387,10 +391,6 @@ static int rapl_pmu_event_init(struct perf_event *event) rapl_pmus_scope = rapl_pmus->pmu.scope; if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) { - /* only look at RAPL package events */ - if (event->attr.type != rapl_pmus_pkg->pmu.type) - return -ENOENT; - cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1); if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) return -EINVAL; @@ -398,10 +398,6 @@ static int rapl_pmu_event_init(struct perf_event *event) bit = cfg - 1; event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr; } else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) { - /* only look at RAPL core events */ - if (event->attr.type != rapl_pmus_core->pmu.type) - return -ENOENT; - cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1); if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) return -EINVAL; -- GitLab From 0a5561501397e2bbd0fb0e300eb489f72a90597a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 29 Jan 2025 07:48:18 -0800 Subject: [PATCH 380/989] perf/x86/intel: Clean up PEBS-via-PT on hybrid The PEBS-via-PT feature is exposed for the e-core of some hybrid platforms, e.g., ADL and MTL. But it never works. $ dmesg | grep PEBS [ 1.793888] core: cpu_atom PMU driver: PEBS-via-PT $ perf record -c 1000 -e '{intel_pt/branch=0/, cpu_atom/cpu-cycles,aux-output/pp}' -C8 Error: The sys_perf_event_open() syscall returned with 22 (Invalid argument) for event (cpu_atom/cpu-cycles,aux-output/pp). "dmesg | grep -i perf" may provide additional information. The "PEBS-via-PT" is printed if the corresponding bit of per-PMU capabilities is set. Since the feature is supported by the e-core HW, perf sets the bit for e-core. However, for Intel PT, if a feature is not supported on all CPUs, it is not supported at all. The PEBS-via-PT event cannot be created successfully. The PEBS-via-PT is no longer enumerated on the latest hybrid platform. It will be deprecated on future platforms with Arch PEBS. Let's remove it from the existing hybrid platforms. 
Fixes: d9977c43bff8 ("perf/x86: Register hybrid PMUs") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250129154820.3755948-2-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 10 ---------- arch/x86/events/intel/ds.c | 10 +++++++++- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7601196d1d18e..966f7832497de 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4941,11 +4941,6 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) else pmu->intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS); - if (pmu->intel_cap.pebs_output_pt_available) - pmu->pmu.capabilities |= PERF_PMU_CAP_AUX_OUTPUT; - else - pmu->pmu.capabilities &= ~PERF_PMU_CAP_AUX_OUTPUT; - intel_pmu_check_event_constraints(pmu->event_constraints, pmu->cntr_mask64, pmu->fixed_cntr_mask64, @@ -5023,9 +5018,6 @@ static bool init_hybrid_pmu(int cpu) pr_info("%s PMU driver: ", pmu->name); - if (pmu->intel_cap.pebs_output_pt_available) - pr_cont("PEBS-via-PT "); - pr_cont("\n"); x86_pmu_show_pmu_cap(&pmu->pmu); @@ -6370,11 +6362,9 @@ static __always_inline int intel_pmu_init_hybrid(enum hybrid_pmu_type pmus) pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities; if (pmu->pmu_type & hybrid_small_tiny) { pmu->intel_cap.perf_metrics = 0; - pmu->intel_cap.pebs_output_pt_available = 1; pmu->mid_ack = true; } else if (pmu->pmu_type & hybrid_big) { pmu->intel_cap.perf_metrics = 1; - pmu->intel_cap.pebs_output_pt_available = 0; pmu->late_ack = true; } } diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index ba74e11983280..c2e2eae7309c3 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2578,7 +2578,15 @@ void __init intel_ds_init(void) } pr_cont("PEBS fmt4%c%s, ", pebs_type, pebs_qual); - if (!is_hybrid() && x86_pmu.intel_cap.pebs_output_pt_available) { + /* + * The PEBS-via-PT is not supported on hybrid platforms, + * because not all CPUs of a hybrid machine support it. + * The global x86_pmu.intel_cap, which only contains the + * common capabilities, is used to check the availability + * of the feature. The per-PMU pebs_output_pt_available + * in a hybrid machine should be ignored. + */ + if (x86_pmu.intel_cap.pebs_output_pt_available) { pr_cont("PEBS-via-PT, "); x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_AUX_OUTPUT; } -- GitLab From 47a973fd75639fe80d59f9e1860113bb2a0b112b Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 29 Jan 2025 07:48:19 -0800 Subject: [PATCH 381/989] perf/x86/intel: Fix ARCH_PERFMON_NUM_COUNTER_LEAF The EAX of the CPUID Leaf 023H enumerates the mask of valid sub-leaves. To tell the availability of the sub-leaf 1 (enumerate the counter mask), perf should check the bit 1 (0x2) of EAS, rather than bit 0 (0x1). The error is not user-visible on bare metal. Because the sub-leaf 0 and the sub-leaf 1 are always available. However, it may bring issues in a virtualization environment when a VMM only enumerates the sub-leaf 0. Introduce the cpuid35_e?x to replace the macros, which makes the implementation style consistent. 
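In other words, CPUID.23H:EAX is a bitmap of valid sub-leaves, and the counters sub-leaf is announced by bit 1, not bit 0 (small self-contained sketch based on the layout described above):

    #include <stdbool.h>

    /* bit 0 - sub-leaf 0 itself, bit 1 - counters sub-leaf */
    static bool cntr_subleaf_present(unsigned int cpuid_23h_eax)
    {
            return cpuid_23h_eax & 0x2;     /* test bit 1 (0x2), not bit 0 (0x1) */
    }
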
Fixes: eb467aaac21e ("perf/x86/intel: Support Architectural PerfMon Extension leaf") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250129154820.3755948-3-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 18 ++++++++++-------- arch/x86/include/asm/perf_event.h | 28 +++++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 966f7832497de..f3d5b718f93f3 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4905,20 +4905,22 @@ static inline bool intel_pmu_broken_perf_cap(void) static void update_pmu_cap(struct x86_hybrid_pmu *pmu) { - unsigned int sub_bitmaps, eax, ebx, ecx, edx; + unsigned int cntr, fixed_cntr, ecx, edx; + union cpuid35_eax eax; + union cpuid35_ebx ebx; - cpuid(ARCH_PERFMON_EXT_LEAF, &sub_bitmaps, &ebx, &ecx, &edx); + cpuid(ARCH_PERFMON_EXT_LEAF, &eax.full, &ebx.full, &ecx, &edx); - if (ebx & ARCH_PERFMON_EXT_UMASK2) + if (ebx.split.umask2) pmu->config_mask |= ARCH_PERFMON_EVENTSEL_UMASK2; - if (ebx & ARCH_PERFMON_EXT_EQ) + if (ebx.split.eq) pmu->config_mask |= ARCH_PERFMON_EVENTSEL_EQ; - if (sub_bitmaps & ARCH_PERFMON_NUM_COUNTER_LEAF_BIT) { + if (eax.split.cntr_subleaf) { cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF, - &eax, &ebx, &ecx, &edx); - pmu->cntr_mask64 = eax; - pmu->fixed_cntr_mask64 = ebx; + &cntr, &fixed_cntr, &ecx, &edx); + pmu->cntr_mask64 = cntr; + pmu->fixed_cntr_mask64 = fixed_cntr; } if (!intel_pmu_broken_perf_cap()) { diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 1ac79f3616456..0ba8d20f2d1d5 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -188,11 +188,33 @@ union cpuid10_edx { * detection/enumeration details: */ #define ARCH_PERFMON_EXT_LEAF 0x00000023 -#define ARCH_PERFMON_EXT_UMASK2 0x1 -#define ARCH_PERFMON_EXT_EQ 0x2 -#define ARCH_PERFMON_NUM_COUNTER_LEAF_BIT 0x1 #define ARCH_PERFMON_NUM_COUNTER_LEAF 0x1 +union cpuid35_eax { + struct { + unsigned int leaf0:1; + /* Counters Sub-Leaf */ + unsigned int cntr_subleaf:1; + /* Auto Counter Reload Sub-Leaf */ + unsigned int acr_subleaf:1; + /* Events Sub-Leaf */ + unsigned int events_subleaf:1; + unsigned int reserved:28; + } split; + unsigned int full; +}; + +union cpuid35_ebx { + struct { + /* UnitMask2 Supported */ + unsigned int umask2:1; + /* EQ-bit Supported */ + unsigned int eq:1; + unsigned int reserved:30; + } split; + unsigned int full; +}; + /* * Intel Architectural LBR CPUID detection/enumeration details: */ -- GitLab From c631a2de7ae48d50434bdc205d901423f8577c65 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Jan 2025 17:07:21 -0800 Subject: [PATCH 382/989] perf/x86/intel: Ensure LBRs are disabled when a CPU is starting Explicitly clear DEBUGCTL.LBR when a CPU is starting, prior to purging the LBR MSRs themselves, as at least one system has been found to transfer control to the kernel with LBRs enabled (it's unclear whether it's a BIOS flaw or a CPU goof). Because the kernel preserves the original DEBUGCTL, even when toggling LBRs, leaving DEBUGCTL.LBR as is results in running with LBRs enabled at all times. 
Closes: https://lore.kernel.org/all/c9d8269bff69f6359731d758e3b1135dedd7cc61.camel@redhat.com Reported-by: Maxim Levitsky Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Maxim Levitsky Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250131010721.470503-1-seanjc@google.com --- arch/x86/events/intel/core.c | 5 ++++- arch/x86/include/asm/msr-index.h | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index f3d5b718f93f3..e86333eee2668 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5042,8 +5042,11 @@ static void intel_pmu_cpu_starting(int cpu) init_debug_store_on_cpu(cpu); /* - * Deal with CPUs that don't clear their LBRs on power-up. + * Deal with CPUs that don't clear their LBRs on power-up, and that may + * even boot with LBRs enabled. */ + if (!static_cpu_has(X86_FEATURE_ARCH_LBR) && x86_pmu.lbr_nr) + msr_clear_bit(MSR_IA32_DEBUGCTLMSR, DEBUGCTLMSR_LBR_BIT); intel_pmu_lbr_reset(); cpuc->lbr_sel = NULL; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 9a71880eec070..72765b2fe0d87 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -395,7 +395,8 @@ #define MSR_IA32_PASID_VALID BIT_ULL(31) /* DEBUGCTLMSR bits (others vary by model): */ -#define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ +#define DEBUGCTLMSR_LBR_BIT 0 /* last branch recording */ +#define DEBUGCTLMSR_LBR (1UL << DEBUGCTLMSR_LBR_BIT) #define DEBUGCTLMSR_BTF_SHIFT 1 #define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */ #define DEBUGCTLMSR_BUS_LOCK_DETECT (1UL << 2) -- GitLab From 9ab127a18018fb06bd42a54ed38bb7b8c449d686 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 27 Jan 2025 08:10:02 +0100 Subject: [PATCH 383/989] drm/hisilicon/hibmc: select CONFIG_DRM_DISPLAY_DP_HELPER Without the DP helper code, the newly added displayport support causes a link failure: x86_64-linux-ld: drivers/gpu/drm/hisilicon/hibmc/dp/dp_aux.o: in function `hibmc_dp_aux_init': dp_aux.c:(.text+0x37e): undefined reference to `drm_dp_aux_init' x86_64-linux-ld: drivers/gpu/drm/hisilicon/hibmc/dp/dp_link.o: in function `hibmc_dp_link_set_pattern': dp_link.c:(.text+0xae): undefined reference to `drm_dp_dpcd_write' x86_64-linux-ld: drivers/gpu/drm/hisilicon/hibmc/dp/dp_link.o: in function `hibmc_dp_link_get_adjust_train': dp_link.c:(.text+0x121): undefined reference to `drm_dp_get_adjust_request_voltage' x86_64-linux-ld: dp_link.c:(.text+0x12e): undefined reference to `drm_dp_get_adjust_request_pre_emphasis' x86_64-linux-ld: drivers/gpu/drm/hisilicon/hibmc/dp/dp_link.o: in function `hibmc_dp_link_training': dp_link.c:(.text+0x2b0): undefined reference to `drm_dp_dpcd_write' x86_64-linux-ld: dp_link.c:(.text+0x2e3): undefined reference to `drm_dp_dpcd_write' Add both DRM_DISPLAY_DP_HELPER and DRM_DISPLAY_HELPER, which is in turn required by the former. 
Fixes: 0ab6ea261c1f ("drm/hisilicon/hibmc: add dp module in hibmc") Signed-off-by: Arnd Bergmann Reviewed-by: Dmitry Baryshkov Link: https://patchwork.freedesktop.org/patch/msgid/20250127071059.617567-1-arnd@kernel.org Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/hisilicon/hibmc/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/hisilicon/hibmc/Kconfig b/drivers/gpu/drm/hisilicon/hibmc/Kconfig index 93b8d32e3be16..98d77d74999d5 100644 --- a/drivers/gpu/drm/hisilicon/hibmc/Kconfig +++ b/drivers/gpu/drm/hisilicon/hibmc/Kconfig @@ -4,6 +4,8 @@ config DRM_HISI_HIBMC depends on DRM && PCI depends on MMU select DRM_CLIENT_SELECTION + select DRM_DISPLAY_HELPER + select DRM_DISPLAY_DP_HELPER select DRM_KMS_HELPER select DRM_VRAM_HELPER select DRM_TTM -- GitLab From 2fa0fbeb69edd367b7c44f484e8dc5a5a1a311ef Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 7 Feb 2025 10:58:23 -1000 Subject: [PATCH 384/989] sched_ext: Implement auto local dispatching of migration disabled tasks Migration disabled tasks are special and pinned to their previous CPUs. They tripped up some unsuspecting BPF schedulers as their ->nr_cpus_allowed may not agree with the bits set in ->cpus_ptr. Make it easier for BPF schedulers by automatically dispatching them to the pinned local DSQs by default. If a BPF scheduler wants to handle migration disabled tasks explicitly, it can set SCX_OPS_ENQ_MIGRATION_DISABLED. Signed-off-by: Tejun Heo Acked-by: Andrea Righi --- kernel/sched/ext.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index a6d6d6dadde51..efdbf4d85a215 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -122,6 +122,19 @@ enum scx_ops_flags { */ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, + /* + * A migration disabled task can only execute on its current CPU. By + * default, such tasks are automatically put on the CPU's local DSQ with + * the default slice on enqueue. If this ops flag is set, they also go + * through ops.enqueue(). + * + * A migration disabled task never invokes ops.select_cpu() as it can + * only select the current CPU. Also, p->cpus_ptr will only contain its + * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr + * and thus may disagree with cpumask_weight(p->cpus_ptr). 
+ */ + SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, + /* * CPU cgroup support flags */ @@ -130,6 +143,7 @@ enum scx_ops_flags { SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | SCX_OPS_ENQ_LAST | SCX_OPS_ENQ_EXITING | + SCX_OPS_ENQ_MIGRATION_DISABLED | SCX_OPS_SWITCH_PARTIAL | SCX_OPS_HAS_CGROUP_WEIGHT, }; @@ -882,6 +896,7 @@ static bool scx_warned_zero_slice; static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); +static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled); static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); @@ -2014,6 +2029,11 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, unlikely(p->flags & PF_EXITING)) goto local; + /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */ + if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) && + is_migration_disabled(p)) + goto local; + if (!SCX_HAS_OP(enqueue)) goto global; @@ -5052,6 +5072,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) static_branch_disable(&scx_has_op[i]); static_branch_disable(&scx_ops_enq_last); static_branch_disable(&scx_ops_enq_exiting); + static_branch_disable(&scx_ops_enq_migration_disabled); static_branch_disable(&scx_ops_cpu_preempt); static_branch_disable(&scx_builtin_idle_enabled); synchronize_rcu(); @@ -5661,6 +5682,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (ops->flags & SCX_OPS_ENQ_EXITING) static_branch_enable(&scx_ops_enq_exiting); + if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED) + static_branch_enable(&scx_ops_enq_migration_disabled); if (scx_ops.cpu_acquire || scx_ops.cpu_release) static_branch_enable(&scx_ops_cpu_preempt); -- GitLab From 32966821574cd2917bd60f2554f435fe527f4702 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 7 Feb 2025 10:59:06 -1000 Subject: [PATCH 385/989] sched_ext: Fix migration disabled handling in targeted dispatches A dispatch operation that can target a specific local DSQ - scx_bpf_dsq_move_to_local() or scx_bpf_dsq_move() - checks whether the task can be migrated to the target CPU using task_can_run_on_remote_rq(). If the task can't be migrated to the targeted CPU, it is bounced through a global DSQ. task_can_run_on_remote_rq() assumes that the task is on a CPU that's different from the targeted CPU but the callers doesn't uphold the assumption and may call the function when the task is already on the target CPU. When such task has migration disabled, task_can_run_on_remote_rq() ends up returning %false incorrectly unnecessarily bouncing the task to a global DSQ. Fix it by updating the callers to only call task_can_run_on_remote_rq() when the task is on a different CPU than the target CPU. As this is a bit subtle, for clarity and documentation: - Make task_can_run_on_remote_rq() trigger SCHED_WARN_ON() if the task is on the same CPU as the target CPU. - is_migration_disabled() test in task_can_run_on_remote_rq() cannot trigger if the task is on a different CPU than the target CPU as the preceding task_allowed_on_cpu() test should fail beforehand. Convert the test into SCHED_WARN_ON(). 
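For context, a BPF scheduler that wants to handle migration-disabled tasks itself (the opt-in added in the previous patch) would set the flag in its struct_ops roughly like this (hypothetical scheduler, sketch only; sketch_enqueue()/sketch_dispatch() are placeholders):

    SEC(".struct_ops.link")
    struct sched_ext_ops sketch_ops = {
            .enqueue        = (void *)sketch_enqueue,
            .dispatch       = (void *)sketch_dispatch,
            .flags          = SCX_OPS_ENQ_MIGRATION_DISABLED,
            .name           = "sketch",
    };
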
Signed-off-by: Tejun Heo Fixes: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()") Fixes: 0366017e0973 ("sched_ext: Use task_can_run_on_remote_rq() test in dispatch_to_local_dsq()") Cc: stable@vger.kernel.org # v6.12+ --- kernel/sched/ext.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index efdbf4d85a215..e01144340d679 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2333,12 +2333,16 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, * * - The BPF scheduler is bypassed while the rq is offline and we can always say * no to the BPF scheduler initiated migrations while offline. + * + * The caller must ensure that @p and @rq are on different CPUs. */ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { int cpu = cpu_of(rq); + SCHED_WARN_ON(task_cpu(p) == cpu); + /* * We don't require the BPF scheduler to avoid dispatching to offline * CPUs mostly for convenience but also because CPUs can go offline @@ -2352,8 +2356,11 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, return false; } - if (unlikely(is_migration_disabled(p))) - return false; + /* + * If @p has migration disabled, @p->cpus_ptr only contains its current + * CPU and the above task_allowed_on_cpu() test should have failed. + */ + SCHED_WARN_ON(is_migration_disabled(p)); if (!scx_rq_online(rq)) return false; @@ -2457,7 +2464,8 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, if (dst_dsq->id == SCX_DSQ_LOCAL) { dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); - if (!task_can_run_on_remote_rq(p, dst_rq, true)) { + if (src_rq != dst_rq && + unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { dst_dsq = find_global_dsq(p); dst_rq = src_rq; } @@ -2611,7 +2619,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, } #ifdef CONFIG_SMP - if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { + if (src_rq != dst_rq && + unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { dispatch_enqueue(find_global_dsq(p), p, enq_flags | SCX_ENQ_CLEAR_OPSS); return; -- GitLab From ee9d1619ef6e4a3412a13788256cb8c3e5efbe3d Mon Sep 17 00:00:00 2001 From: Long Li Date: Fri, 7 Feb 2025 13:46:28 -0800 Subject: [PATCH 386/989] MAINTAINERS: update maintainer for Microsoft MANA RDMA driver Ajay is no longer working on the MANA RDMA driver. Konstantin Taranov has made significant contributions to implementing RC QP in both kernel and user-mode. He will take the responsibility of fixing bugs, reviewing patches and developing new features for MANA RDMA driver. 
Signed-off-by: Long Li Link: https://patch.msgid.link/1738964792-21140-1-git-send-email-longli@linuxonhyperv.com Signed-off-by: Leon Romanovsky --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 896a307fa0654..c51503268a026 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15689,7 +15689,7 @@ F: include/uapi/linux/cciss*.h MICROSOFT MANA RDMA DRIVER M: Long Li -M: Ajay Sharma +M: Konstantin Taranov L: linux-rdma@vger.kernel.org S: Supported F: drivers/infiniband/hw/mana/ -- GitLab From 9747c0c7791d4a5a62018a0c9c563dd2e6f6c1c0 Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Sat, 8 Feb 2025 18:59:30 +0800 Subject: [PATCH 387/989] RDMA/hns: Fix mbox timing out by adding retry mechanism If a QP is modified to error state and a flush CQE process is triggered, the subsequent QP destruction mbox can still be successfully posted but will be blocked in HW until the flush CQE process finishes. This causes further mbox posting timeouts in driver. The blocking time is related to QP depth. Considering an extreme case where SQ depth and RQ depth are both 32K, the blocking time can reach about 135ms. This patch adds a retry mechanism for mbox posting. For each try, FW waits 15ms for HW to complete the previous mbox, otherwise return a timeout error code to driver. Counting other time consumption in FW, set 8 tries for mbox posting and a 5ms time gap before each retry to increase to a sufficient timeout limit. Fixes: 0425e3e6e0c7 ("RDMA/hns: Support flush cqe for hip08 in kernel space") Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20250208105930.522796-1-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 64 ++++++++++++++++------ drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 2 + 2 files changed, 50 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index dded339802b33..160e8927d364e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1286,10 +1286,8 @@ static u32 hns_roce_cmdq_tx_timeout(u16 opcode, u32 tx_timeout) return tx_timeout; } -static void hns_roce_wait_csq_done(struct hns_roce_dev *hr_dev, u16 opcode) +static void hns_roce_wait_csq_done(struct hns_roce_dev *hr_dev, u32 tx_timeout) { - struct hns_roce_v2_priv *priv = hr_dev->priv; - u32 tx_timeout = hns_roce_cmdq_tx_timeout(opcode, priv->cmq.tx_timeout); u32 timeout = 0; do { @@ -1299,8 +1297,9 @@ static void hns_roce_wait_csq_done(struct hns_roce_dev *hr_dev, u16 opcode) } while (++timeout < tx_timeout); } -static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, - struct hns_roce_cmq_desc *desc, int num) +static int __hns_roce_cmq_send_one(struct hns_roce_dev *hr_dev, + struct hns_roce_cmq_desc *desc, + int num, u32 tx_timeout) { struct hns_roce_v2_priv *priv = hr_dev->priv; struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq; @@ -1309,8 +1308,6 @@ static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, int ret; int i; - spin_lock_bh(&csq->lock); - tail = csq->head; for (i = 0; i < num; i++) { @@ -1324,22 +1321,17 @@ static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_CMDS_CNT]); - hns_roce_wait_csq_done(hr_dev, le16_to_cpu(desc->opcode)); + hns_roce_wait_csq_done(hr_dev, tx_timeout); if (hns_roce_cmq_csq_done(hr_dev)) { ret = 0; for (i = 0; i < num; i++) { /* check the result of hardware write back */ - desc[i] = 
csq->desc[tail++]; + desc_ret = le16_to_cpu(csq->desc[tail++].retval); if (tail == csq->desc_num) tail = 0; - - desc_ret = le16_to_cpu(desc[i].retval); if (likely(desc_ret == CMD_EXEC_SUCCESS)) continue; - dev_err_ratelimited(hr_dev->dev, - "Cmdq IO error, opcode = 0x%x, return = 0x%x.\n", - desc->opcode, desc_ret); ret = hns_roce_cmd_err_convert_errno(desc_ret); } } else { @@ -1354,14 +1346,54 @@ static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, ret = -EAGAIN; } - spin_unlock_bh(&csq->lock); - if (ret) atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_CMDS_ERR_CNT]); return ret; } +static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, + struct hns_roce_cmq_desc *desc, int num) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq; + u16 opcode = le16_to_cpu(desc->opcode); + u32 tx_timeout = hns_roce_cmdq_tx_timeout(opcode, priv->cmq.tx_timeout); + u8 try_cnt = HNS_ROCE_OPC_POST_MB_TRY_CNT; + u32 rsv_tail; + int ret; + int i; + + while (try_cnt) { + try_cnt--; + + spin_lock_bh(&csq->lock); + rsv_tail = csq->head; + ret = __hns_roce_cmq_send_one(hr_dev, desc, num, tx_timeout); + if (opcode == HNS_ROCE_OPC_POST_MB && ret == -ETIME && + try_cnt) { + spin_unlock_bh(&csq->lock); + mdelay(HNS_ROCE_OPC_POST_MB_RETRY_GAP_MSEC); + continue; + } + + for (i = 0; i < num; i++) { + desc[i] = csq->desc[rsv_tail++]; + if (rsv_tail == csq->desc_num) + rsv_tail = 0; + } + spin_unlock_bh(&csq->lock); + break; + } + + if (ret) + dev_err_ratelimited(hr_dev->dev, + "Cmdq IO error, opcode = 0x%x, return = %d.\n", + opcode, ret); + + return ret; +} + static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, struct hns_roce_cmq_desc *desc, int num) { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index cbdbc9edbce6e..91a5665465ffb 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -230,6 +230,8 @@ enum hns_roce_opcode_type { }; #define HNS_ROCE_OPC_POST_MB_TIMEOUT 35000 +#define HNS_ROCE_OPC_POST_MB_TRY_CNT 8 +#define HNS_ROCE_OPC_POST_MB_RETRY_GAP_MSEC 5 struct hns_roce_cmdq_tx_timeout_map { u16 opcode; u32 tx_timeout; -- GitLab From c53fbdb60fb61fd6bda2bc0dc89837966625c5dc Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 7 Feb 2025 14:54:37 +0000 Subject: [PATCH 388/989] KVM: arm64: Improve error handling from check_host_shared_guest() The check_host_shared_guest() path expects to find a last-level valid PTE in the guest's stage-2 page-table. However, it checks the PTE's level before its validity, which makes it hard for callers to figure out what went wrong. To make error handling simpler, check the PTE's validity first. 
Signed-off-by: Quentin Perret Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250207145438.1333475-2-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 7ad7b133b81a8..41847c04b270f 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -943,10 +943,10 @@ static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ip ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level); if (ret) return ret; - if (level != KVM_PGTABLE_LAST_LEVEL) - return -E2BIG; if (!kvm_pte_valid(pte)) return -ENOENT; + if (level != KVM_PGTABLE_LAST_LEVEL) + return -E2BIG; state = guest_get_page_state(pte, ipa); if (state != PKVM_PAGE_SHARED_BORROWED) -- GitLab From eabc7aaef7a553b64bf6e631ce04526af6c8d104 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 7 Feb 2025 14:54:38 +0000 Subject: [PATCH 389/989] KVM: arm64: Simplify np-guest hypercalls When the handling of a guest stage-2 permission fault races with an MMU notifier, the faulting page might be gone from the guest's stage-2 by the point we attempt to call (p)kvm_pgtable_stage2_relax_perms(). In the normal KVM case, this leads to returning -EAGAIN which user_mem_abort() handles correctly by simply re-entering the guest. However, the pKVM hypercall implementation has additional logic to check the page state using __check_host_shared_guest() which gets confused with absence of a page mapped at the requested IPA and returns -ENOENT, hence breaking user_mem_abort() and hilarity ensues. Luckily, several of the hypercalls for managing the stage-2 page-table of NP guests have no effect on the pKVM ownership tracking (wrprotect, test_clear_young, mkyoung, and crucially relax_perms), so the extra state checking logic is in fact not strictly necessary. So, to fix the discrepancy between standard KVM and pKVM, let's just drop the superfluous __check_host_shared_guest() logic from those hypercalls and make the extra state checking a debug assertion dependent on CONFIG_NVHE_EL2_DEBUG as we already do for other transitions. 
Signed-off-by: Quentin Perret Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250207145438.1333475-3-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 69 +++++++++++++++------------ 1 file changed, 38 insertions(+), 31 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 41847c04b270f..4c2f6a6a2efe1 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -998,63 +998,73 @@ int __pkvm_host_unshare_guest(u64 gfn, struct pkvm_hyp_vm *vm) return ret; } -int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot) +static void assert_host_shared_guest(struct pkvm_hyp_vm *vm, u64 ipa) { - struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); - u64 ipa = hyp_pfn_to_phys(gfn); u64 phys; int ret; - if (prot & ~KVM_PGTABLE_PROT_RWX) - return -EINVAL; + if (!IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) + return; host_lock_component(); guest_lock_component(vm); ret = __check_host_shared_guest(vm, &phys, ipa); - if (!ret) - ret = kvm_pgtable_stage2_relax_perms(&vm->pgt, ipa, prot, 0); guest_unlock_component(vm); host_unlock_component(); - return ret; + WARN_ON(ret && ret != -ENOENT); } -int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *vm) +int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot) { + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); u64 ipa = hyp_pfn_to_phys(gfn); - u64 phys; int ret; - host_lock_component(); - guest_lock_component(vm); + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; - ret = __check_host_shared_guest(vm, &phys, ipa); - if (!ret) - ret = kvm_pgtable_stage2_wrprotect(&vm->pgt, ipa, PAGE_SIZE); + if (prot & ~KVM_PGTABLE_PROT_RWX) + return -EINVAL; + assert_host_shared_guest(vm, ipa); + guest_lock_component(vm); + ret = kvm_pgtable_stage2_relax_perms(&vm->pgt, ipa, prot, 0); guest_unlock_component(vm); - host_unlock_component(); return ret; } -int __pkvm_host_test_clear_young_guest(u64 gfn, bool mkold, struct pkvm_hyp_vm *vm) +int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *vm) { u64 ipa = hyp_pfn_to_phys(gfn); - u64 phys; int ret; - host_lock_component(); + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; + + assert_host_shared_guest(vm, ipa); guest_lock_component(vm); + ret = kvm_pgtable_stage2_wrprotect(&vm->pgt, ipa, PAGE_SIZE); + guest_unlock_component(vm); - ret = __check_host_shared_guest(vm, &phys, ipa); - if (!ret) - ret = kvm_pgtable_stage2_test_clear_young(&vm->pgt, ipa, PAGE_SIZE, mkold); + return ret; +} +int __pkvm_host_test_clear_young_guest(u64 gfn, bool mkold, struct pkvm_hyp_vm *vm) +{ + u64 ipa = hyp_pfn_to_phys(gfn); + int ret; + + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; + + assert_host_shared_guest(vm, ipa); + guest_lock_component(vm); + ret = kvm_pgtable_stage2_test_clear_young(&vm->pgt, ipa, PAGE_SIZE, mkold); guest_unlock_component(vm); - host_unlock_component(); return ret; } @@ -1063,18 +1073,15 @@ int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu) { struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); u64 ipa = hyp_pfn_to_phys(gfn); - u64 phys; int ret; - host_lock_component(); - guest_lock_component(vm); - - ret = __check_host_shared_guest(vm, &phys, ipa); - if (!ret) - kvm_pgtable_stage2_mkyoung(&vm->pgt, ipa, 0); + if (pkvm_hyp_vm_is_protected(vm)) + return -EPERM; + assert_host_shared_guest(vm, ipa); + guest_lock_component(vm); + 
kvm_pgtable_stage2_mkyoung(&vm->pgt, ipa, 0); guest_unlock_component(vm); - host_unlock_component(); return ret; } -- GitLab From 7585946243d614bd2cd4e13377be2c711c9539e0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 8 Feb 2025 18:54:28 +0100 Subject: [PATCH 390/989] PM: sleep: core: Restrict power.set_active propagation Commit 3775fc538f53 ("PM: sleep: core: Synchronize runtime PM status of parents and children") exposed an issue related to simple_pm_bus_pm_ops that uses pm_runtime_force_suspend() and pm_runtime_force_resume() as bus type PM callbacks for the noirq phases of system-wide suspend and resume. The problem is that pm_runtime_force_suspend() does not distinguish runtime-suspended devices from devices for which runtime PM has never been enabled, so if it sees a device with runtime PM status set to RPM_ACTIVE, it will assume that runtime PM is enabled for that device and so it will attempt to suspend it with the help of its runtime PM callbacks which may not be ready for that. As it turns out, this causes simple_pm_bus_runtime_suspend() to crash due to a NULL pointer dereference. Another problem related to the above commit and simple_pm_bus_pm_ops is that setting runtime PM status of a device handled by the latter to RPM_ACTIVE will actually prevent it from being resumed because pm_runtime_force_resume() only resumes devices with runtime PM status set to RPM_SUSPENDED. To mitigate these issues, do not allow power.set_active to propagate beyond the parent of the device with DPM_FLAG_SMART_SUSPEND set that will need to be resumed, which should be a sufficient stop-gap for the time being, but they will need to be properly addressed in the future because in general during system-wide resume it is necessary to resume all devices in a dependency chain in which at least one device is going to be resumed. Fixes: 3775fc538f53 ("PM: sleep: core: Synchronize runtime PM status of parents and children") Closes: https://lore.kernel.org/linux-pm/1c2433d4-7e0f-4395-b841-b8eac7c25651@nvidia.com/ Reported-by: Jon Hunter Tested-by: Johan Hovold Signed-off-by: Rafael J. 
Wysocki Link: https://patch.msgid.link/6137505.lOV4Wx5bFT@rjwysocki.net --- drivers/base/power/main.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index d497d448e4b2a..40e1d8d8a5893 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1191,24 +1191,18 @@ static pm_message_t resume_event(pm_message_t sleep_state) return PMSG_ON; } -static void dpm_superior_set_must_resume(struct device *dev, bool set_active) +static void dpm_superior_set_must_resume(struct device *dev) { struct device_link *link; int idx; - if (dev->parent) { + if (dev->parent) dev->parent->power.must_resume = true; - if (set_active) - dev->parent->power.set_active = true; - } idx = device_links_read_lock(); - list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) { + list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) link->supplier->power.must_resume = true; - if (set_active) - link->supplier->power.set_active = true; - } device_links_read_unlock(idx); } @@ -1287,9 +1281,12 @@ static int device_suspend_noirq(struct device *dev, pm_message_t state, bool asy dev->power.must_resume = true; if (dev->power.must_resume) { - dev->power.set_active = dev->power.set_active || - dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND); - dpm_superior_set_must_resume(dev, dev->power.set_active); + if (dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND)) { + dev->power.set_active = true; + if (dev->parent && !dev->parent->power.ignore_children) + dev->parent->power.set_active = true; + } + dpm_superior_set_must_resume(dev); } Complete: -- GitLab From a64dcfb451e254085a7daee5fe51bf22959d52d3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 9 Feb 2025 12:45:03 -0800 Subject: [PATCH 391/989] Linux 6.14-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9e0d63d9d94b9..89628e354ca7b 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 14 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Baby Opossum Posse # *DOCUMENTATION* -- GitLab From dc9c5166c3cb044f8a001e397195242fd6796eee Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 3 Feb 2025 11:14:57 +0100 Subject: [PATCH 392/989] powerpc/code-patching: Disable KASAN report during patching via temporary mm Erhard reports the following KASAN hit on Talos II (power9) with kernel 6.13: [ 12.028126] ================================================================== [ 12.028198] BUG: KASAN: user-memory-access in copy_to_kernel_nofault+0x8c/0x1a0 [ 12.028260] Write of size 8 at addr 0000187e458f2000 by task systemd/1 [ 12.028346] CPU: 87 UID: 0 PID: 1 Comm: systemd Tainted: G T 6.13.0-P9-dirty #3 [ 12.028408] Tainted: [T]=RANDSTRUCT [ 12.028446] Hardware name: T2P9D01 REV 1.01 POWER9 0x4e1202 opal:skiboot-bc106a0 PowerNV [ 12.028500] Call Trace: [ 12.028536] [c000000008dbf3b0] [c000000001656a48] dump_stack_lvl+0xbc/0x110 (unreliable) [ 12.028609] [c000000008dbf3f0] [c0000000006e2fc8] print_report+0x6b0/0x708 [ 12.028666] [c000000008dbf4e0] [c0000000006e2454] kasan_report+0x164/0x300 [ 12.028725] [c000000008dbf600] [c0000000006e54d4] kasan_check_range+0x314/0x370 [ 12.028784] [c000000008dbf640] [c0000000006e6310] __kasan_check_write+0x20/0x40 [ 12.028842] [c000000008dbf660] [c000000000578e8c] copy_to_kernel_nofault+0x8c/0x1a0 [ 12.028902] [c000000008dbf6a0] [c0000000000acfe4] __patch_instructions+0x194/0x210 [ 12.028965] [c000000008dbf6e0] 
[c0000000000ade80] patch_instructions+0x150/0x590 [ 12.029026] [c000000008dbf7c0] [c0000000001159bc] bpf_arch_text_copy+0x6c/0xe0 [ 12.029085] [c000000008dbf800] [c000000000424250] bpf_jit_binary_pack_finalize+0x40/0xc0 [ 12.029147] [c000000008dbf830] [c000000000115dec] bpf_int_jit_compile+0x3bc/0x930 [ 12.029206] [c000000008dbf990] [c000000000423720] bpf_prog_select_runtime+0x1f0/0x280 [ 12.029266] [c000000008dbfa00] [c000000000434b18] bpf_prog_load+0xbb8/0x1370 [ 12.029324] [c000000008dbfb70] [c000000000436ebc] __sys_bpf+0x5ac/0x2e00 [ 12.029379] [c000000008dbfd00] [c00000000043a228] sys_bpf+0x28/0x40 [ 12.029435] [c000000008dbfd20] [c000000000038eb4] system_call_exception+0x334/0x610 [ 12.029497] [c000000008dbfe50] [c00000000000c270] system_call_vectored_common+0xf0/0x280 [ 12.029561] --- interrupt: 3000 at 0x3fff82f5cfa8 [ 12.029608] NIP: 00003fff82f5cfa8 LR: 00003fff82f5cfa8 CTR: 0000000000000000 [ 12.029660] REGS: c000000008dbfe80 TRAP: 3000 Tainted: G T (6.13.0-P9-dirty) [ 12.029735] MSR: 900000000280f032 CR: 42004848 XER: 00000000 [ 12.029855] IRQMASK: 0 GPR00: 0000000000000169 00003fffdcf789a0 00003fff83067100 0000000000000005 GPR04: 00003fffdcf78a98 0000000000000090 0000000000000000 0000000000000008 GPR08: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR12: 0000000000000000 00003fff836ff7e0 c000000000010678 0000000000000000 GPR16: 0000000000000000 0000000000000000 00003fffdcf78f28 00003fffdcf78f90 GPR20: 0000000000000000 0000000000000000 0000000000000000 00003fffdcf78f80 GPR24: 00003fffdcf78f70 00003fffdcf78d10 00003fff835c7239 00003fffdcf78bd8 GPR28: 00003fffdcf78a98 0000000000000000 0000000000000000 000000011f547580 [ 12.030316] NIP [00003fff82f5cfa8] 0x3fff82f5cfa8 [ 12.030361] LR [00003fff82f5cfa8] 0x3fff82f5cfa8 [ 12.030405] --- interrupt: 3000 [ 12.030444] ================================================================== Commit c28c15b6d28a ("powerpc/code-patching: Use temporary mm for Radix MMU") is inspired from x86 but unlike x86 is doesn't disable KASAN reports during patching. This wasn't a problem at the begining because __patch_mem() is not instrumented. Commit 465cabc97b42 ("powerpc/code-patching: introduce patch_instructions()") use copy_to_kernel_nofault() to copy several instructions at once. But when using temporary mm the destination is not regular kernel memory but a kind of kernel-like memory located in user address space. Because it is not in kernel address space it is not covered by KASAN shadow memory. Since commit e4137f08816b ("mm, kasan, kmsan: instrument copy_from/to_kernel_nofault") KASAN reports bad accesses from copy_to_kernel_nofault(). Here a bad access to user memory is reported because KASAN detects the lack of shadow memory and the address is below TASK_SIZE. Do like x86 in commit b3fd8e83ada0 ("x86/alternatives: Use temporary mm for text poking") and disable KASAN reports during patching when using temporary mm. 
Reported-by: Erhard Furtner Close: https://lore.kernel.org/all/20250201151435.48400261@yea/ Fixes: 465cabc97b42 ("powerpc/code-patching: introduce patch_instructions()") Signed-off-by: Christophe Leroy Acked-by: Michael Ellerman Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/1c05b2a1b02ad75b981cfc45927e0b4a90441046.1738577687.git.christophe.leroy@csgroup.eu --- arch/powerpc/lib/code-patching.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index af97fbb3c257e..81c0f673eb252 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -493,7 +493,9 @@ static int __do_patch_instructions_mm(u32 *addr, u32 *code, size_t len, bool rep orig_mm = start_using_temp_mm(patching_mm); + kasan_disable_current(); err = __patch_instructions(patch_addr, code, len, repeat_instr); + kasan_enable_current(); /* context synchronisation performed by __patch_instructions */ stop_using_temp_mm(patching_mm, orig_mm); -- GitLab From 61bcc752d1b81fde3cae454ff20c1d3c359df500 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sun, 12 Jan 2025 19:24:46 +0100 Subject: [PATCH 393/989] powerpc/64s: Rewrite __real_pte() and __rpte_to_hidx() as static inline Rewrite __real_pte() and __rpte_to_hidx() as static inline in order to avoid following warnings/errors when building with 4k page size: CC arch/powerpc/mm/book3s64/hash_tlb.o arch/powerpc/mm/book3s64/hash_tlb.c: In function 'hpte_need_flush': arch/powerpc/mm/book3s64/hash_tlb.c:49:16: error: variable 'offset' set but not used [-Werror=unused-but-set-variable] 49 | int i, offset; | ^~~~~~ CC arch/powerpc/mm/book3s64/hash_native.o arch/powerpc/mm/book3s64/hash_native.c: In function 'native_flush_hash_range': arch/powerpc/mm/book3s64/hash_native.c:782:29: error: variable 'index' set but not used [-Werror=unused-but-set-variable] 782 | unsigned long hash, index, hidx, shift, slot; | ^~~~~ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501081741.AYFwybsq-lkp@intel.com/ Fixes: ff31e105464d ("powerpc/mm/hash64: Store the slot information at the right offset for hugetlb") Signed-off-by: Christophe Leroy Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/e0d340a5b7bd478ecbf245d826e6ab2778b74e06.1736706263.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/64/hash-4k.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index c3efacab4b941..aa90a048f319a 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -77,9 +77,17 @@ /* * With 4K page size the real_pte machinery is all nops. 
*/ -#define __real_pte(e, p, o) ((real_pte_t){(e)}) +static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep, int offset) +{ + return (real_pte_t){pte}; +} + #define __rpte_to_pte(r) ((r).pte) -#define __rpte_to_hidx(r,index) (pte_val(__rpte_to_pte(r)) >> H_PAGE_F_GIX_SHIFT) + +static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index) +{ + return pte_val(__rpte_to_pte(rpte)) >> H_PAGE_F_GIX_SHIFT; +} #define pte_iterate_hashed_subpages(rpte, psize, va, index, shift) \ do { \ -- GitLab From a27c6f46dcec8f697cbf15c8a10f8534c7b8a2c3 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Tue, 4 Feb 2025 00:21:22 -0800 Subject: [PATCH 394/989] RDMA/bnxt_re: Fix an issue in bnxt_re_async_notifier In the bnxt_re_async_notifier() callback, the way driver retrieves rdev pointer is wrong. The rdev pointer should be parsed from adev pointer as while registering with the L2 for ULP, driver uses the aux device pointer for the handle. Fixes: 7fea32784068 ("RDMA/bnxt_re: Add Async event handling support") Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1738657285-23968-2-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index e9e4da4dd576b..c4c3d67c42cc6 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -396,11 +396,16 @@ static void bnxt_re_dcb_wq_task(struct work_struct *work) static void bnxt_re_async_notifier(void *handle, struct hwrm_async_event_cmpl *cmpl) { - struct bnxt_re_dev *rdev = (struct bnxt_re_dev *)handle; + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(handle); struct bnxt_re_dcb_work *dcb_work; + struct bnxt_re_dev *rdev; u32 data1, data2; u16 event_id; + rdev = en_info->rdev; + if (!rdev) + return; + event_id = le16_to_cpu(cmpl->event_id); data1 = le32_to_cpu(cmpl->event_data1); data2 = le32_to_cpu(cmpl->event_data2); -- GitLab From f0df225d12fcb049429fb5bf5122afe143c2dd15 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Tue, 4 Feb 2025 00:21:23 -0800 Subject: [PATCH 395/989] RDMA/bnxt_re: Add sanity checks on rdev validity There is a possibility that ulp_irq_stop and ulp_irq_start callbacks will be called when the device is in detached state. This can cause a crash due to NULL pointer dereference as the rdev is already freed. 
Fixes: cc5b9b48d447 ("RDMA/bnxt_re: Recover the device when FW error is detected") Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1738657285-23968-3-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index c4c3d67c42cc6..89ac5c21ca7ad 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -438,6 +438,8 @@ static void bnxt_re_stop_irq(void *handle, bool reset) int indx; rdev = en_info->rdev; + if (!rdev) + return; rcfw = &rdev->rcfw; if (reset) { @@ -466,6 +468,8 @@ static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent) int indx, rc; rdev = en_info->rdev; + if (!rdev) + return; msix_ent = rdev->nqr->msix_entries; rcfw = &rdev->rcfw; if (!ent) { @@ -2438,6 +2442,7 @@ static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state) ibdev_info(&rdev->ibdev, "%s: L2 driver notified to stop en_state 0x%lx", __func__, en_dev->en_state); bnxt_re_remove_device(rdev, BNXT_RE_PRE_RECOVERY_REMOVE, adev); + bnxt_re_update_en_info_rdev(NULL, en_info, adev); mutex_unlock(&bnxt_re_mutex); return 0; -- GitLab From e2f105277411c4ebacd00d4ae1a57f693ba7d22d Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Tue, 4 Feb 2025 00:21:24 -0800 Subject: [PATCH 396/989] RDMA/bnxt_re: Fix issue in the unload path The cited comment removed the netdev notifier register call from the driver. But, it did not remove the cleanup code from the unload path. As a result, driver unload is not clean and resulted in undesired behaviour. Fixes: d3b15fcc4201 ("RDMA/bnxt_re: Remove deliver net device event") Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1738657285-23968-4-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 1 - drivers/infiniband/hw/bnxt_re/main.c | 10 ---------- 2 files changed, 11 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index b91a85a491d05..3721446c6ba4b 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -187,7 +187,6 @@ struct bnxt_re_dev { #define BNXT_RE_FLAG_ISSUE_ROCE_STATS 29 struct net_device *netdev; struct auxiliary_device *adev; - struct notifier_block nb; unsigned int version, major, minor; struct bnxt_qplib_chip_ctx *chip_ctx; struct bnxt_en_dev *en_dev; diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 89ac5c21ca7ad..a94c8c5387d9e 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1359,7 +1359,6 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct auxiliary_device *adev, return NULL; } /* Default values */ - rdev->nb.notifier_call = NULL; rdev->netdev = en_dev->net; rdev->en_dev = en_dev; rdev->adev = adev; @@ -2354,15 +2353,6 @@ static int bnxt_re_add_device(struct auxiliary_device *adev, u8 op_type) static void bnxt_re_remove_device(struct bnxt_re_dev *rdev, u8 op_type, struct auxiliary_device *aux_dev) { - if (rdev->nb.notifier_call) { - unregister_netdevice_notifier(&rdev->nb); - rdev->nb.notifier_call = NULL; - } else { - /* If notifier is null, we should have already done a - * clean up before coming here. 
- */ - return; - } bnxt_re_setup_cc(rdev, false); ib_unregister_device(&rdev->ibdev); bnxt_re_dev_uninit(rdev, op_type); -- GitLab From 8238c7bd84209c8216b1381ab0dbe6db9e203769 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 4 Feb 2025 00:21:25 -0800 Subject: [PATCH 397/989] RDMA/bnxt_re: Fix the statistics for Gen P7 VF Gen P7 VF support the extended stats and is prevented by a VF check. Fix the check to issue the FW command for GenP7 VFs also. Fixes: 1801d87b3598 ("RDMA/bnxt_re: Support new 5760X P7 devices") Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1738657285-23968-5-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/hw_counters.c | 4 ++-- drivers/infiniband/hw/bnxt_re/qplib_res.h | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.c b/drivers/infiniband/hw/bnxt_re/hw_counters.c index 3ac47f4e61229..f039aefcaf675 100644 --- a/drivers/infiniband/hw/bnxt_re/hw_counters.c +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.c @@ -348,8 +348,8 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, goto done; } bnxt_re_copy_err_stats(rdev, stats, err_s); - if (_is_ext_stats_supported(rdev->dev_attr->dev_cap_flags) && - !rdev->is_virtfn) { + if (bnxt_ext_stats_supported(rdev->chip_ctx, rdev->dev_attr->dev_cap_flags, + rdev->is_virtfn)) { rc = bnxt_re_get_ext_stat(rdev, stats); if (rc) { clear_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index be5d907a036b6..711990232de1c 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -547,6 +547,14 @@ static inline bool _is_ext_stats_supported(u16 dev_cap_flags) CREQ_QUERY_FUNC_RESP_SB_EXT_STATS; } +static inline int bnxt_ext_stats_supported(struct bnxt_qplib_chip_ctx *ctx, + u16 flags, bool virtfn) +{ + /* ext stats supported if cap flag is set AND is a PF OR a Thor2 VF */ + return (_is_ext_stats_supported(flags) && + ((virtfn && bnxt_qplib_is_chip_gen_p7(ctx)) || (!virtfn))); +} + static inline bool _is_hw_retx_supported(u16 dev_cap_flags) { return dev_cap_flags & -- GitLab From 8dbccafce3c8ae026606f5c7bc6637667d9d5595 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 10 Feb 2025 09:17:58 +0000 Subject: [PATCH 398/989] KVM: arm64: Fix __pkvm_host_mkyoung_guest() return value Don't use an uninitialised stack variable, and just return 0 on the non-error path. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202502100911.8c9DbtKD-lkp@intel.com/ Reviewed-by: Quentin Perret Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 4c2f6a6a2efe1..19c3c631708ce 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -1073,7 +1073,6 @@ int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu) { struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); u64 ipa = hyp_pfn_to_phys(gfn); - int ret; if (pkvm_hyp_vm_is_protected(vm)) return -EPERM; @@ -1083,5 +1082,5 @@ int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu) kvm_pgtable_stage2_mkyoung(&vm->pgt, ipa, 0); guest_unlock_component(vm); - return ret; + return 0; } -- GitLab From b921f66ccf5e8cf1b8a5052b35ceda454f19f5dd Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Mon, 3 Feb 2025 15:30:56 -0600 Subject: [PATCH 399/989] dt-bindings: rockchip: pmu: Ensure all properties are defined Device specific schemas should not allow undefined properties which is what 'additionalProperties: true' allows. Add the missing child nodes and fix this constraint. Signed-off-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20250203213056.13827-1-robh@kernel.org Signed-off-by: Heiko Stuebner --- Documentation/devicetree/bindings/arm/rockchip/pmu.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/arm/rockchip/pmu.yaml b/Documentation/devicetree/bindings/arm/rockchip/pmu.yaml index 932f981265ccb..52016a141227b 100644 --- a/Documentation/devicetree/bindings/arm/rockchip/pmu.yaml +++ b/Documentation/devicetree/bindings/arm/rockchip/pmu.yaml @@ -53,11 +53,17 @@ properties: reg: maxItems: 1 + power-controller: + type: object + + reboot-mode: + type: object + required: - compatible - reg -additionalProperties: true +additionalProperties: false examples: - | -- GitLab From 2c202e6c4f4dd19d2e8c1dfac9df05170aa3934f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 8 Feb 2025 08:29:15 +0900 Subject: [PATCH 400/989] ata: libahci_platform: Do not set mask_port_map when not needed Commit 8c87215dd3a2 ("ata: libahci_platform: support non-consecutive port numbers") modified ahci_platform_get_resources() to allow identifying the ports of a controller that are defined as child nodes of the controller node in order to support non-consecutive port numbers (as defined by the platform device tree). However, this commit also erroneously sets bit 0 of hpriv->mask_port_map when the platform devices tree does not define port child nodes, to match the fact that the temporary default number of ports used in that case is 1 (which is also consistent with the fact that only index 0 of hpriv->phys[] is initialized with the call to ahci_platform_get_phy(). But doing so causes ahci_platform_init_host() to initialize and probe only the first port, even if this function determines that the controller has in fact multiple ports using the capability register of the controller (through a call to ahci_nr_ports()). 
This can be seen with the ahci_mvebu driver (Armada 385 SoC) with the second port declared as "dummy": ahci-mvebu f10a8000.sata: masking port_map 0x3 -> 0x1 ahci-mvebu f10a8000.sata: AHCI vers 0001.0000, 32 command slots, 6 Gbps, platform mode ahci-mvebu f10a8000.sata: 1/2 ports implemented (port mask 0x1) ahci-mvebu f10a8000.sata: flags: 64bit ncq sntf led only pmp fbs pio slum part sxs scsi host0: ahci-mvebu scsi host1: ahci-mvebu ata1: SATA max UDMA/133 mmio [mem 0xf10a8000-0xf10a9fff] port 0x100 irq 40 lpm-pol 0 ata2: DUMMY Fix this issue by removing setting bit 0 of hpriv->mask_port_map when the platform device tree does not define port child nodes. Reported-by: Klaus Kudielka Fixes: 8c87215dd3a2 ("ata: libahci_platform: support non-consecutive port numbers") Tested-by: Klaus Kudielka Signed-off-by: Damien Le Moal Acked-by: Josua Mayer Link: https://lore.kernel.org/r/20250207232915.1439174-1-dlemoal@kernel.org Signed-off-by: Niklas Cassel --- drivers/ata/libahci_platform.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/ata/libahci_platform.c b/drivers/ata/libahci_platform.c index 53b2c7719dc51..91d44302eac92 100644 --- a/drivers/ata/libahci_platform.c +++ b/drivers/ata/libahci_platform.c @@ -651,8 +651,6 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev, * If no sub-node was found, keep this for device tree * compatibility */ - hpriv->mask_port_map |= BIT(0); - rc = ahci_platform_get_phy(hpriv, 0, dev, dev->of_node); if (rc) goto err_out; -- GitLab From 9759ae2cee7cd42b95f1c48aa3749bd02b5ddb08 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Fri, 17 Jan 2025 13:58:00 +0800 Subject: [PATCH 401/989] iommu: Fix potential memory leak in iopf_queue_remove_device() The iopf_queue_remove_device() helper removes a device from the per-iommu iopf queue when PRI is disabled on the device. It responds to all outstanding iopf's with an IOMMU_PAGE_RESP_INVALID code and detaches the device from the queue. However, it fails to release the group structure that represents a group of iopf's awaiting for a response after responding to the hardware. This can cause a memory leak if iopf_queue_remove_device() is called with pending iopf's. Fix it by calling iopf_free_group() after the iopf group is responded. Fixes: 199112327135 ("iommu: Track iopf group instead of last fault") Cc: stable@vger.kernel.org Suggested-by: Kevin Tian Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20250117055800.782462-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/io-pgfault.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 4674e618797c1..8b5926c1452ed 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -478,6 +478,7 @@ void iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) ops->page_response(dev, iopf, &resp); list_del_init(&group->pending_node); + iopf_free_group(group); } mutex_unlock(&fault_param->lock); -- GitLab From 17987453a9d997c4d0749abc52f047bfa275427a Mon Sep 17 00:00:00 2001 From: Mohan Kumar D Date: Mon, 10 Feb 2025 19:24:12 +0530 Subject: [PATCH 402/989] dmaengine: tegra210-adma: Use div_u64 for 64 bit division The ADMA base and page address are represented using a 64-bit variable. To accurately derive the exact ADMA page number provided from the DT properties, use the div_u64() to divide the address difference between adma page and base address by the page offset. 
This change fixes the below error "ERROR: modpost: "__udivdi3" [drivers/dma/tegra210-adma.ko] undefined! ld: drivers/dma/tegra210-adma.o: in function `tegra_adma_probe': tegra210-adma.c:(.text+0x12cf): undefined reference to `__udivdi3'" Fixes: 68811c928f88 ("dmaengine: tegra210-adma: Support channel page") Cc: stable@vger.kernel.org Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412250204.GCQhdKe3-lkp@intel.com/ Signed-off-by: Mohan Kumar D Reviewed-by: Jon Hunter Acked-by: Thierry Reding Link: https://lore.kernel.org/r/20250210135413.2504272-2-mkumard@nvidia.com Signed-off-by: Vinod Koul --- drivers/dma/tegra210-adma.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/dma/tegra210-adma.c b/drivers/dma/tegra210-adma.c index 6896da8ac7ef6..a0bd4822ed808 100644 --- a/drivers/dma/tegra210-adma.c +++ b/drivers/dma/tegra210-adma.c @@ -887,7 +887,8 @@ static int tegra_adma_probe(struct platform_device *pdev) const struct tegra_adma_chip_data *cdata; struct tegra_adma *tdma; struct resource *res_page, *res_base; - int ret, i, page_no; + u64 page_no, page_offset; + int ret, i; cdata = of_device_get_match_data(&pdev->dev); if (!cdata) { @@ -914,10 +915,16 @@ static int tegra_adma_probe(struct platform_device *pdev) res_base = platform_get_resource_byname(pdev, IORESOURCE_MEM, "global"); if (res_base) { - page_no = (res_page->start - res_base->start) / cdata->ch_base_offset; - if (page_no <= 0) + if (WARN_ON(res_page->start <= res_base->start)) return -EINVAL; - tdma->ch_page_no = page_no - 1; + + page_offset = res_page->start - res_base->start; + page_no = div_u64(page_offset, cdata->ch_base_offset); + + if (WARN_ON(page_no == 0)) + return -EINVAL; + + tdma->ch_page_no = lower_32_bits(page_no) - 1; tdma->base_addr = devm_ioremap_resource(&pdev->dev, res_base); if (IS_ERR(tdma->base_addr)) return PTR_ERR(tdma->base_addr); -- GitLab From 76ed9b7d177ed5aa161a824ea857619b88542de1 Mon Sep 17 00:00:00 2001 From: Mohan Kumar D Date: Mon, 10 Feb 2025 19:24:13 +0530 Subject: [PATCH 403/989] dmaengine: tegra210-adma: check for adma max page Have additional check for max channel page during the probe to cover if any offset overshoot happens due to wrong DT configuration. Fixes: 68811c928f88 ("dmaengine: tegra210-adma: Support channel page") Cc: stable@vger.kernel.org Signed-off-by: Mohan Kumar D Reviewed-by: Jon Hunter Acked-by: Thierry Reding Link: https://lore.kernel.org/r/20250210135413.2504272-3-mkumard@nvidia.com Signed-off-by: Vinod Koul --- drivers/dma/tegra210-adma.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/dma/tegra210-adma.c b/drivers/dma/tegra210-adma.c index a0bd4822ed808..801740ad8e0d9 100644 --- a/drivers/dma/tegra210-adma.c +++ b/drivers/dma/tegra210-adma.c @@ -83,7 +83,9 @@ struct tegra_adma; * @nr_channels: Number of DMA channels available. * @ch_fifo_size_mask: Mask for FIFO size field. * @sreq_index_offset: Slave channel index offset. + * @max_page: Maximum ADMA Channel Page. * @has_outstanding_reqs: If DMA channel can have outstanding requests. + * @set_global_pg_config: Global page programming. 
*/ struct tegra_adma_chip_data { unsigned int (*adma_get_burst_config)(unsigned int burst_size); @@ -99,6 +101,7 @@ struct tegra_adma_chip_data { unsigned int nr_channels; unsigned int ch_fifo_size_mask; unsigned int sreq_index_offset; + unsigned int max_page; bool has_outstanding_reqs; void (*set_global_pg_config)(struct tegra_adma *tdma); }; @@ -854,6 +857,7 @@ static const struct tegra_adma_chip_data tegra210_chip_data = { .nr_channels = 22, .ch_fifo_size_mask = 0xf, .sreq_index_offset = 2, + .max_page = 0, .has_outstanding_reqs = false, .set_global_pg_config = NULL, }; @@ -871,6 +875,7 @@ static const struct tegra_adma_chip_data tegra186_chip_data = { .nr_channels = 32, .ch_fifo_size_mask = 0x1f, .sreq_index_offset = 4, + .max_page = 4, .has_outstanding_reqs = true, .set_global_pg_config = tegra186_adma_global_page_config, }; @@ -921,7 +926,7 @@ static int tegra_adma_probe(struct platform_device *pdev) page_offset = res_page->start - res_base->start; page_no = div_u64(page_offset, cdata->ch_base_offset); - if (WARN_ON(page_no == 0)) + if (WARN_ON(page_no == 0 || page_no > cdata->max_page)) return -EINVAL; tdma->ch_page_no = lower_32_bits(page_no) - 1; -- GitLab From 1046cac109225eda0973b898e053aeb3d6c10e1d Mon Sep 17 00:00:00 2001 From: Sybil Isabel Dorsett Date: Mon, 3 Feb 2025 16:33:15 +0000 Subject: [PATCH 404/989] platform/x86: thinkpad_acpi: Fix invalid fan speed on ThinkPad X120e MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On ThinkPad X120e, fan speed is reported in ticks per revolution rather than RPM. Recalculate the fan speed value reported for ThinkPad X120e to RPM based on a 22.5 kHz clock. Based on the information on https://www.thinkwiki.org/wiki/How_to_control_fan_speed, the same problem is highly likely to be relevant to at least Edge11, but Edge11 is not addressed in this patch. 
Signed-off-by: Sybil Isabel Dorsett Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20250203163255.5525-1-sybdorsett@proton.me Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/thinkpad_acpi.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 1fcb0f99695a7..e7778ea41478e 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -7885,6 +7885,7 @@ static struct ibm_struct volume_driver_data = { #define FAN_NS_CTRL_STATUS BIT(2) /* Bit which determines control is enabled or not */ #define FAN_NS_CTRL BIT(4) /* Bit which determines control is by host or EC */ +#define FAN_CLOCK_TPM (22500*60) /* Ticks per minute for a 22.5 kHz clock */ enum { /* Fan control constants */ fan_status_offset = 0x2f, /* EC register 0x2f */ @@ -7940,6 +7941,7 @@ static int fan_watchdog_maxinterval; static bool fan_with_ns_addr; static bool ecfw_with_fan_dec_rpm; +static bool fan_speed_in_tpr; static struct mutex fan_mutex; @@ -8142,8 +8144,11 @@ static int fan_get_speed(unsigned int *speed) !acpi_ec_read(fan_rpm_offset + 1, &hi))) return -EIO; - if (likely(speed)) + if (likely(speed)) { *speed = (hi << 8) | lo; + if (fan_speed_in_tpr && *speed != 0) + *speed = FAN_CLOCK_TPM / *speed; + } break; case TPACPI_FAN_RD_TPEC_NS: if (!acpi_ec_read(fan_rpm_status_ns, &lo)) @@ -8176,8 +8181,11 @@ static int fan2_get_speed(unsigned int *speed) if (rc) return -EIO; - if (likely(speed)) + if (likely(speed)) { *speed = (hi << 8) | lo; + if (fan_speed_in_tpr && *speed != 0) + *speed = FAN_CLOCK_TPM / *speed; + } break; case TPACPI_FAN_RD_TPEC_NS: @@ -8788,6 +8796,7 @@ static const struct attribute_group fan_driver_attr_group = { #define TPACPI_FAN_NOFAN 0x0008 /* no fan available */ #define TPACPI_FAN_NS 0x0010 /* For EC with non-Standard register addresses */ #define TPACPI_FAN_DECRPM 0x0020 /* For ECFW's with RPM in register as decimal */ +#define TPACPI_FAN_TPR 0x0040 /* Fan speed is in Ticks Per Revolution */ static const struct tpacpi_quirk fan_quirk_table[] __initconst = { TPACPI_QEC_IBM('1', 'Y', TPACPI_FAN_Q1), @@ -8817,6 +8826,7 @@ static const struct tpacpi_quirk fan_quirk_table[] __initconst = { TPACPI_Q_LNV3('R', '0', 'V', TPACPI_FAN_NS), /* 11e Gen5 KL-Y */ TPACPI_Q_LNV3('N', '1', 'O', TPACPI_FAN_NOFAN), /* X1 Tablet (2nd gen) */ TPACPI_Q_LNV3('R', '0', 'Q', TPACPI_FAN_DECRPM),/* L480 */ + TPACPI_Q_LNV('8', 'F', TPACPI_FAN_TPR), /* ThinkPad x120e */ }; static int __init fan_init(struct ibm_init_struct *iibm) @@ -8887,6 +8897,8 @@ static int __init fan_init(struct ibm_init_struct *iibm) if (quirks & TPACPI_FAN_Q1) fan_quirk1_setup(); + if (quirks & TPACPI_FAN_TPR) + fan_speed_in_tpr = true; /* Try and probe the 2nd fan */ tp_features.second_fan = 1; /* needed for get_speed to work */ res = fan2_get_speed(&speed); -- GitLab From 9cff907cbf8c7fb5345918dbcc7b74a01656f34f Mon Sep 17 00:00:00 2001 From: Mark Pearson Date: Thu, 6 Feb 2025 14:39:41 -0500 Subject: [PATCH 405/989] platform/x86: thinkpad_acpi: Support for V9 DYTC platform profiles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Newer Thinkpad AMD platforms are using V9 DYTC and this changes the profiles used for PSC mode. Add support for this update. 
Tested on P14s G5 AMD Signed-off-by: Mark Pearson Link: https://lore.kernel.org/r/20250206193953.58365-1-mpearson-lenovo@squebb.ca Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/thinkpad_acpi.c | 34 ++++++++++++++++++---------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index e7778ea41478e..99cdea723d32c 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -10331,6 +10331,10 @@ static struct ibm_struct proxsensor_driver_data = { #define DYTC_MODE_PSC_BALANCE 5 /* Default mode aka balanced */ #define DYTC_MODE_PSC_PERFORM 7 /* High power mode aka performance */ +#define DYTC_MODE_PSCV9_LOWPOWER 1 /* Low power mode */ +#define DYTC_MODE_PSCV9_BALANCE 3 /* Default mode aka balanced */ +#define DYTC_MODE_PSCV9_PERFORM 4 /* High power mode aka performance */ + #define DYTC_ERR_MASK 0xF /* Bits 0-3 in cmd result are the error result */ #define DYTC_ERR_SUCCESS 1 /* CMD completed successful */ @@ -10351,6 +10355,10 @@ static int dytc_capabilities; static bool dytc_mmc_get_available; static int profile_force; +static int platform_psc_profile_lowpower = DYTC_MODE_PSC_LOWPOWER; +static int platform_psc_profile_balanced = DYTC_MODE_PSC_BALANCE; +static int platform_psc_profile_performance = DYTC_MODE_PSC_PERFORM; + static int convert_dytc_to_profile(int funcmode, int dytcmode, enum platform_profile_option *profile) { @@ -10372,19 +10380,15 @@ static int convert_dytc_to_profile(int funcmode, int dytcmode, } return 0; case DYTC_FUNCTION_PSC: - switch (dytcmode) { - case DYTC_MODE_PSC_LOWPOWER: + if (dytcmode == platform_psc_profile_lowpower) *profile = PLATFORM_PROFILE_LOW_POWER; - break; - case DYTC_MODE_PSC_BALANCE: + else if (dytcmode == platform_psc_profile_balanced) *profile = PLATFORM_PROFILE_BALANCED; - break; - case DYTC_MODE_PSC_PERFORM: + else if (dytcmode == platform_psc_profile_performance) *profile = PLATFORM_PROFILE_PERFORMANCE; - break; - default: /* Unknown mode */ + else return -EINVAL; - } + return 0; case DYTC_FUNCTION_AMT: /* For now return balanced. 
It's the closest we have to 'auto' */ @@ -10405,19 +10409,19 @@ static int convert_profile_to_dytc(enum platform_profile_option profile, int *pe if (dytc_capabilities & BIT(DYTC_FC_MMC)) *perfmode = DYTC_MODE_MMC_LOWPOWER; else if (dytc_capabilities & BIT(DYTC_FC_PSC)) - *perfmode = DYTC_MODE_PSC_LOWPOWER; + *perfmode = platform_psc_profile_lowpower; break; case PLATFORM_PROFILE_BALANCED: if (dytc_capabilities & BIT(DYTC_FC_MMC)) *perfmode = DYTC_MODE_MMC_BALANCE; else if (dytc_capabilities & BIT(DYTC_FC_PSC)) - *perfmode = DYTC_MODE_PSC_BALANCE; + *perfmode = platform_psc_profile_balanced; break; case PLATFORM_PROFILE_PERFORMANCE: if (dytc_capabilities & BIT(DYTC_FC_MMC)) *perfmode = DYTC_MODE_MMC_PERFORM; else if (dytc_capabilities & BIT(DYTC_FC_PSC)) - *perfmode = DYTC_MODE_PSC_PERFORM; + *perfmode = platform_psc_profile_performance; break; default: /* Unknown profile */ return -EOPNOTSUPP; @@ -10611,6 +10615,7 @@ static int tpacpi_dytc_profile_init(struct ibm_init_struct *iibm) if (output & BIT(DYTC_QUERY_ENABLE_BIT)) dytc_version = (output >> DYTC_QUERY_REV_BIT) & 0xF; + dbg_printk(TPACPI_DBG_INIT, "DYTC version %d\n", dytc_version); /* Check DYTC is enabled and supports mode setting */ if (dytc_version < 5) return -ENODEV; @@ -10649,6 +10654,11 @@ static int tpacpi_dytc_profile_init(struct ibm_init_struct *iibm) } } else if (dytc_capabilities & BIT(DYTC_FC_PSC)) { /* PSC MODE */ pr_debug("PSC is supported\n"); + if (dytc_version >= 9) { /* update profiles for DYTC 9 and up */ + platform_psc_profile_lowpower = DYTC_MODE_PSCV9_LOWPOWER; + platform_psc_profile_balanced = DYTC_MODE_PSCV9_BALANCE; + platform_psc_profile_performance = DYTC_MODE_PSCV9_PERFORM; + } } else { dbg_printk(TPACPI_DBG_INIT, "No DYTC support available\n"); return -ENODEV; -- GitLab From 2b9df00cded911e2ca2cfae5c45082166b24f8aa Mon Sep 17 00:00:00 2001 From: Niravkumar L Rabara Date: Mon, 10 Feb 2025 13:35:49 +0800 Subject: [PATCH 406/989] mtd: rawnand: cadence: fix error code in cadence_nand_init() Replace dma_request_channel() with dma_request_chan_by_mask() and use helper functions to return proper error code instead of fixed -EBUSY. Fixes: ec4ba01e894d ("mtd: rawnand: Add new Cadence NAND driver to MTD subsystem") Cc: stable@vger.kernel.org Signed-off-by: Niravkumar L Rabara Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/cadence-nand-controller.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/mtd/nand/raw/cadence-nand-controller.c b/drivers/mtd/nand/raw/cadence-nand-controller.c index 8d1d710e439dd..fb5f671bdb7bb 100644 --- a/drivers/mtd/nand/raw/cadence-nand-controller.c +++ b/drivers/mtd/nand/raw/cadence-nand-controller.c @@ -2904,11 +2904,10 @@ static int cadence_nand_init(struct cdns_nand_ctrl *cdns_ctrl) dma_cap_set(DMA_MEMCPY, mask); if (cdns_ctrl->caps1->has_dma) { - cdns_ctrl->dmac = dma_request_channel(mask, NULL, NULL); - if (!cdns_ctrl->dmac) { - dev_err(cdns_ctrl->dev, - "Unable to get a DMA channel\n"); - ret = -EBUSY; + cdns_ctrl->dmac = dma_request_chan_by_mask(&mask); + if (IS_ERR(cdns_ctrl->dmac)) { + ret = dev_err_probe(cdns_ctrl->dev, PTR_ERR(cdns_ctrl->dmac), + "%d: Failed to get a DMA channel\n", ret); goto disable_irq; } } -- GitLab From d76d22b5096c5b05208fd982b153b3f182350b19 Mon Sep 17 00:00:00 2001 From: Niravkumar L Rabara Date: Mon, 10 Feb 2025 13:35:50 +0800 Subject: [PATCH 407/989] mtd: rawnand: cadence: use dma_map_resource for sdma address Remap the slave DMA I/O resources to enhance driver portability. 
Using a physical address causes DMA translation failure when the ARM SMMU is enabled. Fixes: ec4ba01e894d ("mtd: rawnand: Add new Cadence NAND driver to MTD subsystem") Cc: stable@vger.kernel.org Signed-off-by: Niravkumar L Rabara Signed-off-by: Miquel Raynal --- .../mtd/nand/raw/cadence-nand-controller.c | 29 ++++++++++++++++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/drivers/mtd/nand/raw/cadence-nand-controller.c b/drivers/mtd/nand/raw/cadence-nand-controller.c index fb5f671bdb7bb..47950a0ac6d28 100644 --- a/drivers/mtd/nand/raw/cadence-nand-controller.c +++ b/drivers/mtd/nand/raw/cadence-nand-controller.c @@ -471,6 +471,8 @@ struct cdns_nand_ctrl { struct { void __iomem *virt; dma_addr_t dma; + dma_addr_t iova_dma; + u32 size; } io; int irq; @@ -1835,11 +1837,11 @@ static int cadence_nand_slave_dma_transfer(struct cdns_nand_ctrl *cdns_ctrl, } if (dir == DMA_FROM_DEVICE) { - src_dma = cdns_ctrl->io.dma; + src_dma = cdns_ctrl->io.iova_dma; dst_dma = buf_dma; } else { src_dma = buf_dma; - dst_dma = cdns_ctrl->io.dma; + dst_dma = cdns_ctrl->io.iova_dma; } tx = dmaengine_prep_dma_memcpy(cdns_ctrl->dmac, dst_dma, src_dma, len, @@ -2869,6 +2871,7 @@ cadence_nand_irq_cleanup(int irqnum, struct cdns_nand_ctrl *cdns_ctrl) static int cadence_nand_init(struct cdns_nand_ctrl *cdns_ctrl) { dma_cap_mask_t mask; + struct dma_device *dma_dev = cdns_ctrl->dmac->device; int ret; cdns_ctrl->cdma_desc = dma_alloc_coherent(cdns_ctrl->dev, @@ -2912,6 +2915,16 @@ static int cadence_nand_init(struct cdns_nand_ctrl *cdns_ctrl) } } + cdns_ctrl->io.iova_dma = dma_map_resource(dma_dev->dev, cdns_ctrl->io.dma, + cdns_ctrl->io.size, + DMA_BIDIRECTIONAL, 0); + + ret = dma_mapping_error(dma_dev->dev, cdns_ctrl->io.iova_dma); + if (ret) { + dev_err(cdns_ctrl->dev, "Failed to map I/O resource to DMA\n"); + goto dma_release_chnl; + } + nand_controller_init(&cdns_ctrl->controller); INIT_LIST_HEAD(&cdns_ctrl->chips); @@ -2922,18 +2935,22 @@ static int cadence_nand_init(struct cdns_nand_ctrl *cdns_ctrl) if (ret) { dev_err(cdns_ctrl->dev, "Failed to register MTD: %d\n", ret); - goto dma_release_chnl; + goto unmap_dma_resource; } kfree(cdns_ctrl->buf); cdns_ctrl->buf = kzalloc(cdns_ctrl->buf_size, GFP_KERNEL); if (!cdns_ctrl->buf) { ret = -ENOMEM; - goto dma_release_chnl; + goto unmap_dma_resource; } return 0; +unmap_dma_resource: + dma_unmap_resource(dma_dev->dev, cdns_ctrl->io.iova_dma, + cdns_ctrl->io.size, DMA_BIDIRECTIONAL, 0); + dma_release_chnl: if (cdns_ctrl->dmac) dma_release_channel(cdns_ctrl->dmac); @@ -2955,6 +2972,8 @@ static int cadence_nand_init(struct cdns_nand_ctrl *cdns_ctrl) static void cadence_nand_remove(struct cdns_nand_ctrl *cdns_ctrl) { cadence_nand_chips_cleanup(cdns_ctrl); + dma_unmap_resource(cdns_ctrl->dmac->device->dev, cdns_ctrl->io.iova_dma, + cdns_ctrl->io.size, DMA_BIDIRECTIONAL, 0); cadence_nand_irq_cleanup(cdns_ctrl->irq, cdns_ctrl); kfree(cdns_ctrl->buf); dma_free_coherent(cdns_ctrl->dev, sizeof(struct cadence_nand_cdma_desc), @@ -3019,7 +3038,9 @@ static int cadence_nand_dt_probe(struct platform_device *ofdev) cdns_ctrl->io.virt = devm_platform_get_and_ioremap_resource(ofdev, 1, &res); if (IS_ERR(cdns_ctrl->io.virt)) return PTR_ERR(cdns_ctrl->io.virt); + cdns_ctrl->io.dma = res->start; + cdns_ctrl->io.size = resource_size(res); dt->clk = devm_clk_get(cdns_ctrl->dev, "nf_clk"); if (IS_ERR(dt->clk)) -- GitLab From f37d135b42cb484bdecee93f56b9f483214ede78 Mon Sep 17 00:00:00 2001 From: Niravkumar L Rabara Date: Mon, 10 Feb 2025 13:35:51 +0800 Subject: [PATCH 408/989] mtd: 
rawnand: cadence: fix incorrect device in dma_unmap_single dma_map_single() uses the physical/bus device (DMA), but dma_unmap_single() uses the framework device (NAND controller), which is incorrect. Fix dma_unmap_single() to use the correct physical/bus device. Fixes: ec4ba01e894d ("mtd: rawnand: Add new Cadence NAND driver to MTD subsystem") Cc: stable@vger.kernel.org Signed-off-by: Niravkumar L Rabara Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/cadence-nand-controller.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/nand/raw/cadence-nand-controller.c b/drivers/mtd/nand/raw/cadence-nand-controller.c index 47950a0ac6d28..0b2db4173e723 100644 --- a/drivers/mtd/nand/raw/cadence-nand-controller.c +++ b/drivers/mtd/nand/raw/cadence-nand-controller.c @@ -1863,12 +1863,12 @@ static int cadence_nand_slave_dma_transfer(struct cdns_nand_ctrl *cdns_ctrl, dma_async_issue_pending(cdns_ctrl->dmac); wait_for_completion(&finished); - dma_unmap_single(cdns_ctrl->dev, buf_dma, len, dir); + dma_unmap_single(dma_dev->dev, buf_dma, len, dir); return 0; err_unmap: - dma_unmap_single(cdns_ctrl->dev, buf_dma, len, dir); + dma_unmap_single(dma_dev->dev, buf_dma, len, dir); err: dev_dbg(cdns_ctrl->dev, "Fall back to CPU I/O\n"); -- GitLab From fc876c9524e2a9f816f51d533ed31df789cff65a Mon Sep 17 00:00:00 2001 From: Tejas Upadhyay Date: Wed, 5 Feb 2025 10:40:42 +0530 Subject: [PATCH 409/989] drm/xe/client: bo->client does not need bos_lock bos_lock protects the list of bos used by the client; it is not required to protect bo->client, so move that assignment outside of bos_lock. Fixes: b27970f3e11c ("drm/xe: Add tracking support for bos per client") Signed-off-by: Tejas Upadhyay Reviewed-by: Himal Prasad Ghimiray Reviewed-by: Nirmoy Das Link: https://patchwork.freedesktop.org/patch/msgid/20250205051042.1991192-1-tejas.upadhyay@intel.com Signed-off-by: Nirmoy Das (cherry picked from commit f74fd53ba34551b7626193fb70c17226f06e9bf1) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_drm_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_drm_client.c b/drivers/gpu/drm/xe/xe_drm_client.c index 63f30b6df70b9..2d4874d2b9225 100644 --- a/drivers/gpu/drm/xe/xe_drm_client.c +++ b/drivers/gpu/drm/xe/xe_drm_client.c @@ -135,8 +135,8 @@ void xe_drm_client_add_bo(struct xe_drm_client *client, XE_WARN_ON(bo->client); XE_WARN_ON(!list_empty(&bo->client_link)); - spin_lock(&client->bos_lock); bo->client = xe_drm_client_get(client); + spin_lock(&client->bos_lock); list_add_tail(&bo->client_link, &client->bos_list); spin_unlock(&client->bos_lock); } -- GitLab From 53139b3f9998ea07289e7b70b909fea2264a0de9 Mon Sep 17 00:00:00 2001 From: Krzysztof Karas Date: Thu, 30 Jan 2025 09:19:31 +0000 Subject: [PATCH 410/989] drm/i915/selftests: avoid using uninitialized context There is an error path in igt_ppgtt_alloc(), which leads to the ww object being passed down to i915_gem_ww_ctx_fini() without initialization. Correct that by only putting ppgtt->vm and returning early.
Fixes: 480ae79537b2 ("drm/i915/selftests: Prepare gtt tests for obj->mm.lock removal") Signed-off-by: Krzysztof Karas Reviewed-by: Mikolaj Wasiak Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/iuaonpjc3rywmvhna6umjlvzilocn2uqsrxfxfob24e2taocbi@lkaivvfp4777 (cherry picked from commit 8d8334632ea62424233ac6529712868241d0f8df) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c index 5c397a2df70e2..5d27e1c733c52 100644 --- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c @@ -168,7 +168,7 @@ static int igt_ppgtt_alloc(void *arg) return PTR_ERR(ppgtt); if (!ppgtt->vm.allocate_va_range) - goto err_ppgtt_cleanup; + goto ppgtt_vm_put; /* * While we only allocate the page tables here and so we could @@ -236,7 +236,7 @@ static int igt_ppgtt_alloc(void *arg) goto retry; } i915_gem_ww_ctx_fini(&ww); - +ppgtt_vm_put: i915_vm_put(&ppgtt->vm); return err; } -- GitLab From fd75f371f3a1b04a33d2e750363d6ad76abf734e Mon Sep 17 00:00:00 2001 From: Christian Bruel Date: Mon, 10 Feb 2025 11:35:15 +0100 Subject: [PATCH 411/989] phy: stm32: Fix constant-value overflow assertion Rework the workaround: the lookup table values always fit into the bitfield, and the default values are defined by the hardware and cannot be 0. Guard against a false positive with a WARN_ON check to make the compiler happy: the offset range is pre-checked against the sorted imp_lookup_table values, so an overflow should not happen and would be caught by the warning and an error return. Also guard against a true positive found during the max_vswing lookup, as a max vswing value can be 802000 or 803000 microvolt depending on the current impedance. Therefore set the default impedance index.
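The reworked lookup relies on the table being sorted and on the requested value having been range-checked against it, so running off the end of the array can only happen if one of those assumptions is broken; the WARN_ON turns that into a loud, bounded failure instead of an out-of-range FIELD_PREP. A standalone sketch of the same search shape, with made-up table values rather than the real imp_lookup entries:

#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

/* sorted in descending order, like the impedance lookup table */
static const unsigned int table[] = { 6090000, 5662000, 5292000, 4968000 };

static int lookup_index(unsigned int val)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(table); i++)
		if (table[i] <= val)
			break;		/* first entry not above the request */

	if (i == ARRAY_SIZE(table))
		return -1;		/* caller pre-checked the range, so this is a bug */

	return (int)i;
}

int main(void)
{
	printf("index for 5300000: %d\n", lookup_index(5300000));	/* prints 2 */
	return 0;
}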
Fixes: 2de679ecd724 ("phy: stm32: work around constant-value overflow assertion") Signed-off-by: Christian Bruel Link: https://lore.kernel.org/r/20250210103515.2598377-1-christian.bruel@foss.st.com Signed-off-by: Vinod Koul --- drivers/phy/st/phy-stm32-combophy.c | 38 ++++++++++++++--------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/drivers/phy/st/phy-stm32-combophy.c b/drivers/phy/st/phy-stm32-combophy.c index 49e9fa90a6819..607b4d607eb5e 100644 --- a/drivers/phy/st/phy-stm32-combophy.c +++ b/drivers/phy/st/phy-stm32-combophy.c @@ -111,6 +111,7 @@ static const struct clk_impedance imp_lookup[] = { { 4204000, { 511000, 609000, 706000, 802000 } }, { 3999000, { 571000, 648000, 726000, 803000 } } }; +#define DEFAULT_IMP_INDEX 3 /* Default impedance is 50 Ohm */ static int stm32_impedance_tune(struct stm32_combophy *combophy) { @@ -119,10 +120,9 @@ static int stm32_impedance_tune(struct stm32_combophy *combophy) u8 imp_of, vswing_of; u32 max_imp = imp_lookup[0].microohm; u32 min_imp = imp_lookup[imp_size - 1].microohm; - u32 max_vswing = imp_lookup[imp_size - 1].vswing[vswing_size - 1]; + u32 max_vswing; u32 min_vswing = imp_lookup[0].vswing[0]; u32 val; - u32 regval; if (!of_property_read_u32(combophy->dev->of_node, "st,output-micro-ohms", &val)) { if (val < min_imp || val > max_imp) { @@ -130,45 +130,43 @@ static int stm32_impedance_tune(struct stm32_combophy *combophy) return -EINVAL; } - regval = 0; - for (imp_of = 0; imp_of < ARRAY_SIZE(imp_lookup); imp_of++) { - if (imp_lookup[imp_of].microohm <= val) { - regval = FIELD_PREP(STM32MP25_PCIEPRG_IMPCTRL_OHM, imp_of); + for (imp_of = 0; imp_of < ARRAY_SIZE(imp_lookup); imp_of++) + if (imp_lookup[imp_of].microohm <= val) break; - } - } + + if (WARN_ON(imp_of == ARRAY_SIZE(imp_lookup))) + return -EINVAL; dev_dbg(combophy->dev, "Set %u micro-ohms output impedance\n", imp_lookup[imp_of].microohm); regmap_update_bits(combophy->regmap, SYSCFG_PCIEPRGCR, STM32MP25_PCIEPRG_IMPCTRL_OHM, - regval); - } else { - regmap_read(combophy->regmap, SYSCFG_PCIEPRGCR, &val); - imp_of = FIELD_GET(STM32MP25_PCIEPRG_IMPCTRL_OHM, val); - } + FIELD_PREP(STM32MP25_PCIEPRG_IMPCTRL_OHM, imp_of)); + } else + imp_of = DEFAULT_IMP_INDEX; if (!of_property_read_u32(combophy->dev->of_node, "st,output-vswing-microvolt", &val)) { + max_vswing = imp_lookup[imp_of].vswing[vswing_size - 1]; + if (val < min_vswing || val > max_vswing) { dev_err(combophy->dev, "Invalid value %u for output vswing\n", val); return -EINVAL; } - regval = 0; - for (vswing_of = 0; vswing_of < ARRAY_SIZE(imp_lookup[imp_of].vswing); vswing_of++) { - if (imp_lookup[imp_of].vswing[vswing_of] >= val) { - regval = FIELD_PREP(STM32MP25_PCIEPRG_IMPCTRL_VSWING, vswing_of); + for (vswing_of = 0; vswing_of < ARRAY_SIZE(imp_lookup[imp_of].vswing); vswing_of++) + if (imp_lookup[imp_of].vswing[vswing_of] >= val) break; - } - } + + if (WARN_ON(vswing_of == ARRAY_SIZE(imp_lookup[imp_of].vswing))) + return -EINVAL; dev_dbg(combophy->dev, "Set %u microvolt swing\n", imp_lookup[imp_of].vswing[vswing_of]); regmap_update_bits(combophy->regmap, SYSCFG_PCIEPRGCR, STM32MP25_PCIEPRG_IMPCTRL_VSWING, - regval); + FIELD_PREP(STM32MP25_PCIEPRG_IMPCTRL_VSWING, vswing_of)); } return 0; -- GitLab From e2158c953c973adb49383ddea2504faf08d375b7 Mon Sep 17 00:00:00 2001 From: Kaustabh Chakraborty Date: Sun, 9 Feb 2025 00:29:30 +0530 Subject: [PATCH 412/989] phy: exynos5-usbdrd: fix MPLL_MULTIPLIER and SSC_REFCLKSEL masks in refclk In exynos5_usbdrd_{pipe3,utmi}_set_refclk(), the masks 
PHYCLKRST_MPLL_MULTIPLIER_MASK and PHYCLKRST_SSC_REFCLKSEL_MASK are not inverted when applied to the register values. Fix it. Cc: stable@vger.kernel.org Fixes: 59025887fb08 ("phy: Add new Exynos5 USB 3.0 PHY driver") Signed-off-by: Kaustabh Chakraborty Reviewed-by: Krzysztof Kozlowski Reviewed-by: Anand Moon Link: https://lore.kernel.org/r/20250209-exynos5-usbdrd-masks-v1-1-4f7f83f323d7@disroot.org Signed-off-by: Vinod Koul --- drivers/phy/samsung/phy-exynos5-usbdrd.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/phy/samsung/phy-exynos5-usbdrd.c b/drivers/phy/samsung/phy-exynos5-usbdrd.c index c421b495eb0fe..4a108fdab118c 100644 --- a/drivers/phy/samsung/phy-exynos5-usbdrd.c +++ b/drivers/phy/samsung/phy-exynos5-usbdrd.c @@ -488,9 +488,9 @@ exynos5_usbdrd_pipe3_set_refclk(struct phy_usb_instance *inst) reg |= PHYCLKRST_REFCLKSEL_EXT_REFCLK; /* FSEL settings corresponding to reference clock */ - reg &= ~PHYCLKRST_FSEL_PIPE_MASK | - PHYCLKRST_MPLL_MULTIPLIER_MASK | - PHYCLKRST_SSC_REFCLKSEL_MASK; + reg &= ~(PHYCLKRST_FSEL_PIPE_MASK | + PHYCLKRST_MPLL_MULTIPLIER_MASK | + PHYCLKRST_SSC_REFCLKSEL_MASK); switch (phy_drd->extrefclk) { case EXYNOS5_FSEL_50MHZ: reg |= (PHYCLKRST_MPLL_MULTIPLIER_50M_REF | @@ -532,9 +532,9 @@ exynos5_usbdrd_utmi_set_refclk(struct phy_usb_instance *inst) reg &= ~PHYCLKRST_REFCLKSEL_MASK; reg |= PHYCLKRST_REFCLKSEL_EXT_REFCLK; - reg &= ~PHYCLKRST_FSEL_UTMI_MASK | - PHYCLKRST_MPLL_MULTIPLIER_MASK | - PHYCLKRST_SSC_REFCLKSEL_MASK; + reg &= ~(PHYCLKRST_FSEL_UTMI_MASK | + PHYCLKRST_MPLL_MULTIPLIER_MASK | + PHYCLKRST_SSC_REFCLKSEL_MASK); reg |= PHYCLKRST_FSEL(phy_drd->extrefclk); return reg; -- GitLab From 5fb25161217370eeee86b63e47060870b67ed2b4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 28 Jan 2025 10:05:03 +1100 Subject: [PATCH 413/989] nfsd: fix uninitialised slot info when a request is retried A recent patch moved the assignment of seq->maxslots from before the test for a resent request (which ends with a goto) to after, resulting in it not being run in that case. This results in the server returning bogus "high slot id" and "target high slot id" values. The assignments to ->maxslots and ->target_maxslots need to be *after* the out: label so that the correct values are returned in replies to requests that are served from cache. Fixes: 60aa6564317d ("nfsd: allocate new session-based DRC slots on demand.") Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b7a0cfd05401d..153eeea2c7c99 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4459,10 +4459,11 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } } while (slot && --cnt > 0); } + +out: seq->maxslots = max(session->se_target_maxslots, seq->maxslots); seq->target_maxslots = session->se_target_maxslots; -out: switch (clp->cl_cb_state) { case NFSD4_CB_DOWN: seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN; -- GitLab From d9d6b74e4be989f919498798fa40df37a74b5bb0 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 28 Jan 2025 11:58:06 -0500 Subject: [PATCH 414/989] nfsd: fix __fh_verify for localio __fh_verify() added a call to svc_xprt_set_valid() to help do connection management but during LOCALIO path rqstp argument is NULL, leading to NULL pointer dereferencing and a crash. 
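Because the same verification helper is now reached both from the normal RPC path, which has a request, and from LOCALIO, which does not, the transport-related step has to treat the request pointer as optional. A tiny standalone sketch of that calling convention, with hypothetical names:

#include <stddef.h>
#include <stdio.h>

struct request { int transport_id; };

static void verify(const struct request *rq)
{
	/* rq may legitimately be NULL when the caller is a local, in-kernel path */
	if (rq)
		printf("validating transport %d\n", rq->transport_id);

	/* ... checks that do not need the transport continue either way ... */
}

int main(void)
{
	struct request r = { .transport_id = 42 };

	verify(&r);	/* remote/RPC-style caller */
	verify(NULL);	/* LOCALIO-style caller */
	return 0;
}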
Fixes: eccbbc7c00a5 ("nfsd: don't use sv_nrthreads in connection limiting calculations.") Signed-off-by: Olga Kornievskaia Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfsfh.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index bf59f83c6224e..91bf0e6d58950 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -381,8 +381,9 @@ __fh_verify(struct svc_rqst *rqstp, error = check_nfsd_access(exp, rqstp, may_bypass_gss); if (error) goto out; - - svc_xprt_set_valid(rqstp->rq_xprt); + /* During LOCALIO call to fh_verify will be called with a NULL rqstp */ + if (rqstp) + svc_xprt_set_valid(rqstp->rq_xprt); /* Finally, check access permissions. */ error = nfsd_permission(cred, exp, dentry, access); -- GitLab From 036ac2778f7b28885814c6fbc07e156ad1624d03 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Thu, 30 Jan 2025 11:01:27 -0800 Subject: [PATCH 415/989] NFSD: fix hang in nfsd4_shutdown_callback If nfs4_client is in courtesy state then there is no point to send the callback. This causes nfsd4_shutdown_callback to hang since cl_cb_inflight is not 0. This hang lasts about 15 minutes until TCP notifies NFSD that the connection was dropped. This patch modifies nfsd4_run_cb_work to skip the RPC call if nfs4_client is in courtesy state. Signed-off-by: Dai Ngo Fixes: 66af25799940 ("NFSD: add courteous server support for thread with only delegation") Cc: stable@vger.kernel.org Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 50e468bdb8d48..cf6d29828f4e5 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1583,8 +1583,11 @@ nfsd4_run_cb_work(struct work_struct *work) nfsd4_process_cb_update(cb); clnt = clp->cl_cb_client; - if (!clnt) { - /* Callback channel broken, or client killed; give up: */ + if (!clnt || clp->cl_state == NFSD4_COURTESY) { + /* + * Callback channel broken, client killed or + * nfs4_client in courtesy state; give up. + */ nfsd41_destroy_cb(cb); return; } -- GitLab From 4990d098433db18c854e75fb0f90d941eb7d479e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 10 Feb 2025 11:43:31 -0500 Subject: [PATCH 416/989] NFSD: Fix CB_GETATTR status fix Jeff says: Now that I look, 1b3e26a5ccbf is wrong. The patch on the ml was correct, but the one that got committed is different. It should be: status = decode_cb_op_status(xdr, OP_CB_GETATTR, &cb->cb_status); if (unlikely(status || cb->cb_status)) If "status" is non-zero, decoding failed (usu. BADXDR), but we also want to bail out and not decode the rest of the call if the decoded cb_status is non-zero. That's not happening here, cb_seq_status has already been checked and is non-zero, so this ends up trying to decode the rest of the CB_GETATTR reply when it doesn't exist. 
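The corrected condition encodes a general rule for this kind of reply parsing: stop when either the local decode step itself failed or the peer-reported operation status is non-zero, because in both cases nothing further was encoded. A standalone sketch of that two-level check, with hypothetical names ('ret' for the local decode result, 'op_status' for the status carried in the reply):

#include <stdio.h>

static int decode_status(const int *buf, int *op_status)
{
	if (!buf)
		return -1;		/* the decode itself failed (e.g. short buffer) */
	*op_status = *buf;		/* status the peer put into the reply */
	return 0;
}

static int decode_reply(const int *buf)
{
	int op_status = 0;
	int ret = decode_status(buf, &op_status);

	/* bail on either a local decode error or a peer-reported error */
	if (ret || op_status)
		return ret ? ret : op_status;

	/* ... only now is it safe to decode the remaining fields ... */
	return 0;
}

int main(void)
{
	int ok = 0, bad = 5;

	printf("%d %d %d\n", decode_reply(NULL), decode_reply(&bad), decode_reply(&ok));
	return 0;
}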
Reported-by: Jeff Layton Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219737 Fixes: 1b3e26a5ccbf ("NFSD: fix decoding in nfs4_xdr_dec_cb_getattr") Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index cf6d29828f4e5..484077200c5d7 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -679,7 +679,7 @@ static int nfs4_xdr_dec_cb_getattr(struct rpc_rqst *rqstp, return status; status = decode_cb_op_status(xdr, OP_CB_GETATTR, &cb->cb_status); - if (unlikely(status || cb->cb_seq_status)) + if (unlikely(status || cb->cb_status)) return status; if (xdr_stream_decode_uint32_array(xdr, bitmap, 3) < 0) return -NFSERR_BAD_XDR; -- GitLab From f3f08c3acfb8860e07a22814a344e83c99ad7398 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 10 Feb 2025 09:27:09 -1000 Subject: [PATCH 417/989] sched_ext: Fix incorrect assumption about migration disabled tasks in task_can_run_on_remote_rq() While fixing migration disabled task handling, 32966821574c ("sched_ext: Fix migration disabled handling in targeted dispatches") assumed that a migration disabled task's ->cpus_ptr would only have the pinned CPU. While this is eventually true for migration disabled tasks that are switched out, ->cpus_ptr update is performed by migrate_disable_switch() which is called right before context_switch() in __scheduler(). However, the task is enqueued earlier during pick_next_task() via put_prev_task_scx(), so there is a race window where another CPU can see the task on a DSQ. If the CPU tries to dispatch the migration disabled task while in that window, task_allowed_on_cpu() will succeed and task_can_run_on_remote_rq() will subsequently trigger SCHED_WARN(is_migration_disabled()). WARNING: CPU: 8 PID: 1837 at kernel/sched/ext.c:2466 task_can_run_on_remote_rq+0x12e/0x140 Sched_ext: layered (enabled+all), task: runnable_at=-10ms RIP: 0010:task_can_run_on_remote_rq+0x12e/0x140 ... consume_dispatch_q+0xab/0x220 scx_bpf_dsq_move_to_local+0x58/0xd0 bpf_prog_84dd17b0654b6cf0_layered_dispatch+0x290/0x1cfa bpf__sched_ext_ops_dispatch+0x4b/0xab balance_one+0x1fe/0x3b0 balance_scx+0x61/0x1d0 prev_balance+0x46/0xc0 __pick_next_task+0x73/0x1c0 __schedule+0x206/0x1730 schedule+0x3a/0x160 __do_sys_sched_yield+0xe/0x20 do_syscall_64+0xbb/0x1e0 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fix it by converting the SCHED_WARN() back to a regular failure path. Also, perform the migration disabled test before task_allowed_on_cpu() test so that BPF schedulers which fail to handle migration disabled tasks can be noticed easily. While at it, adjust scx_ops_error() message for !task_allowed_on_cpu() case for brevity and consistency. Signed-off-by: Tejun Heo Fixes: 32966821574c ("sched_ext: Fix migration disabled handling in targeted dispatches") Acked-by: Andrea Righi Reported-by: Jake Hillion --- kernel/sched/ext.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index e01144340d679..54edd0e2132a6 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2343,6 +2343,25 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, SCHED_WARN_ON(task_cpu(p) == cpu); + /* + * If @p has migration disabled, @p->cpus_ptr is updated to contain only + * the pinned CPU in migrate_disable_switch() while @p is being switched + * out. 
However, put_prev_task_scx() is called before @p->cpus_ptr is + * updated and thus another CPU may see @p on a DSQ inbetween leading to + * @p passing the below task_allowed_on_cpu() check while migration is + * disabled. + * + * Test the migration disabled state first as the race window is narrow + * and the BPF scheduler failing to check migration disabled state can + * easily be masked if task_allowed_on_cpu() is done first. + */ + if (unlikely(is_migration_disabled(p))) { + if (trigger_error) + scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d", + p->comm, p->pid, task_cpu(p), cpu); + return false; + } + /* * We don't require the BPF scheduler to avoid dispatching to offline * CPUs mostly for convenience but also because CPUs can go offline @@ -2351,17 +2370,11 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, */ if (!task_allowed_on_cpu(p, cpu)) { if (trigger_error) - scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]", - cpu_of(rq), p->comm, p->pid); + scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]", + cpu, p->comm, p->pid); return false; } - /* - * If @p has migration disabled, @p->cpus_ptr only contains its current - * CPU and the above task_allowed_on_cpu() test should have failed. - */ - SCHED_WARN_ON(is_migration_disabled(p)); - if (!scx_rq_online(rq)) return false; -- GitLab From cb6cc8ed77177c7553c2f8ac8605d32de58f43ac Mon Sep 17 00:00:00 2001 From: Furong Xu <0x1207@gmail.com> Date: Fri, 7 Feb 2025 16:56:39 +0800 Subject: [PATCH 418/989] net: stmmac: Apply new page pool parameters when SPH is enabled Commit df542f669307 ("net: stmmac: Switch to zero-copy in non-XDP RX path") makes DMA write received frame into buffer at offset of NET_SKB_PAD and sets page pool parameters to sync from offset of NET_SKB_PAD. But when Header Payload Split is enabled, the header is written at offset of NET_SKB_PAD, while the payload is written at offset of zero. Uncorrect offset parameter for the payload breaks dma coherence [1] since both CPU and DMA touch the page buffer from offset of zero which is not handled by the page pool sync parameter. And in case the DMA cannot split the received frame, for example, a large L2 frame, pp_params.max_len should grow to match the tail of entire frame. 
[1] https://lore.kernel.org/netdev/d465f277-bac7-439f-be1d-9a47dfe2d951@nvidia.com/ Reported-by: Jon Hunter Reported-by: Brad Griffis Suggested-by: Ido Schimmel Fixes: df542f669307 ("net: stmmac: Switch to zero-copy in non-XDP RX path") Signed-off-by: Furong Xu <0x1207@gmail.com> Tested-by: Jon Hunter Tested-by: Thierry Reding Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250207085639.13580-1-0x1207@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index b34ebb916b898..c0ae7db96f46f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2094,6 +2094,11 @@ static int __alloc_dma_rx_desc_resources(struct stmmac_priv *priv, pp_params.offset = stmmac_rx_offset(priv); pp_params.max_len = dma_conf->dma_buf_sz; + if (priv->sph) { + pp_params.offset = 0; + pp_params.max_len += stmmac_rx_offset(priv); + } + rx_q->page_pool = page_pool_create(&pp_params); if (IS_ERR(rx_q->page_pool)) { ret = PTR_ERR(rx_q->page_pool); -- GitLab From 48145a57d4bbe3496e8e4880b23ea6b511e6e519 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Feb 2025 13:58:33 +0000 Subject: [PATCH 419/989] ndisc: ndisc_send_redirect() must use dev_get_by_index_rcu() ndisc_send_redirect() is called under RCU protection, not RTNL. It must use dev_get_by_index_rcu() instead of __dev_get_by_index() Fixes: 2f17becfbea5 ("vrf: check the original netdevice for generating redirect") Signed-off-by: Eric Dumazet Cc: Stephen Suryaputra Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250207135841.1948589-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ndisc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index d044c67019de6..264b10a947577 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1694,7 +1694,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) bool ret; if (netif_is_l3_master(skb->dev)) { - dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif); + dev = dev_get_by_index_rcu(dev_net(skb->dev), IPCB(skb)->iif); if (!dev) return; } -- GitLab From 628e6d18930bbd21f2d4562228afe27694f66da9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Feb 2025 13:58:34 +0000 Subject: [PATCH 420/989] ndisc: use RCU protection in ndisc_alloc_skb() ndisc_alloc_skb() can be called without RTNL or RCU being held. Add RCU protection to avoid possible UAF. 
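The fixes in this group all follow the same read-side pattern: every dereference that goes through the RCU-protected per-net pointer has to sit between rcu_read_lock() and rcu_read_unlock(). A schematic, kernel-style sketch of that shape, mirroring the ndisc_alloc_skb() change below; it illustrates the pattern only and is not a drop-in function:

static void charge_to_ndisc_sk(struct sk_buff *skb, struct net_device *dev)
{
	/* the pointer obtained via dev_net_rcu() must not escape the section */
	rcu_read_lock();
	skb_set_owner_w(skb, dev_net_rcu(dev)->ipv6.ndisc_sk);
	rcu_read_unlock();
}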
Fixes: de09334b9326 ("ndisc: Introduce ndisc_alloc_skb() helper.") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250207135841.1948589-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ndisc.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 264b10a947577..90f8aa2d7af2e 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -418,15 +418,11 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, { int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - struct sock *sk = dev_net(dev)->ipv6.ndisc_sk; struct sk_buff *skb; skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC); - if (!skb) { - ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n", - __func__); + if (!skb) return NULL; - } skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; @@ -437,7 +433,9 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, /* Manually assign socket ownership as we avoid calling * sock_alloc_send_pskb() to bypass wmem buffer limits */ - skb_set_owner_w(skb, sk); + rcu_read_lock(); + skb_set_owner_w(skb, dev_net_rcu(dev)->ipv6.ndisc_sk); + rcu_read_unlock(); return skb; } -- GitLab From becbd5850c03ed33b232083dd66c6e38c0c0e569 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Feb 2025 13:58:35 +0000 Subject: [PATCH 421/989] neighbour: use RCU protection in __neigh_notify() __neigh_notify() can be called without RTNL or RCU protection. Use RCU protection to avoid potential UAF. Fixes: 426b5303eb43 ("[NETNS]: Modify the neighbour table code so it handles multiple network namespaces") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250207135841.1948589-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/neighbour.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 89656d180bc60..bd0251bd74a1f 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3447,10 +3447,12 @@ static const struct seq_operations neigh_stat_seq_ops = { static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid) { - struct net *net = dev_net(n->dev); struct sk_buff *skb; int err = -ENOBUFS; + struct net *net; + rcu_read_lock(); + net = dev_net_rcu(n->dev); skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC); if (skb == NULL) goto errout; @@ -3463,9 +3465,11 @@ static void __neigh_notify(struct neighbour *n, int type, int flags, goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); - return; + goto out; errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); +out: + rcu_read_unlock(); } void neigh_app_ns(struct neighbour *n) -- GitLab From a42b69f692165ec39db42d595f4f65a4c8f42e44 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Feb 2025 13:58:36 +0000 Subject: [PATCH 422/989] arp: use RCU protection in arp_xmit() arp_xmit() can be called without RTNL or RCU protection. Use RCU protection to avoid potential UAF. 
Fixes: 29a26a568038 ("netfilter: Pass struct net into the netfilter hooks") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250207135841.1948589-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/arp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index cb9a7ed8abd3a..f23a1ec6694cb 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -659,10 +659,12 @@ static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb */ void arp_xmit(struct sk_buff *skb) { + rcu_read_lock(); /* Send it off, maybe filter it using firewalling first. */ NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, - dev_net(skb->dev), NULL, skb, NULL, skb->dev, + dev_net_rcu(skb->dev), NULL, skb, NULL, skb->dev, arp_xmit_finish); + rcu_read_unlock(); } EXPORT_SYMBOL(arp_xmit); -- GitLab From 90b2f49a502fa71090d9f4fe29a2f51fe5dff76d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Feb 2025 13:58:37 +0000 Subject: [PATCH 423/989] openvswitch: use RCU protection in ovs_vport_cmd_fill_info() ovs_vport_cmd_fill_info() can be called without RTNL or RCU. Use RCU protection and dev_net_rcu() to avoid potential UAF. Fixes: 9354d4520342 ("openvswitch: reliable interface indentification in port dumps") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250207135841.1948589-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/openvswitch/datapath.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 225f6048867f4..5d548eda742df 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2101,6 +2101,7 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, { struct ovs_header *ovs_header; struct ovs_vport_stats vport_stats; + struct net *net_vport; int err; ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family, @@ -2117,12 +2118,15 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, nla_put_u32(skb, OVS_VPORT_ATTR_IFINDEX, vport->dev->ifindex)) goto nla_put_failure; - if (!net_eq(net, dev_net(vport->dev))) { - int id = peernet2id_alloc(net, dev_net(vport->dev), gfp); + rcu_read_lock(); + net_vport = dev_net_rcu(vport->dev); + if (!net_eq(net, net_vport)) { + int id = peernet2id_alloc(net, net_vport, GFP_ATOMIC); if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id)) - goto nla_put_failure; + goto nla_put_failure_unlock; } + rcu_read_unlock(); ovs_vport_get_stats(vport, &vport_stats); if (nla_put_64bit(skb, OVS_VPORT_ATTR_STATS, @@ -2143,6 +2147,8 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, genlmsg_end(skb, ovs_header); return 0; +nla_put_failure_unlock: + rcu_read_unlock(); nla_put_failure: err = -EMSGSIZE; error: -- GitLab From 6d0ce46a93135d96b7fa075a94a88fe0da8e8773 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Feb 2025 13:58:38 +0000 Subject: [PATCH 424/989] vrf: use RCU protection in l3mdev_l3_out() l3mdev_l3_out() can be called without RCU being held: raw_sendmsg() ip_push_pending_frames() ip_send_skb() ip_local_out() __ip_local_out() l3mdev_ip_out() Add rcu_read_lock() / rcu_read_unlock() pair to avoid a potential UAF. 
Fixes: a8e3e1a9f020 ("net: l3mdev: Add hook to output path") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250207135841.1948589-7-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/l3mdev.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 2d6141f28b530..f7fe796e8429a 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -198,10 +198,12 @@ struct sk_buff *l3mdev_l3_out(struct sock *sk, struct sk_buff *skb, u16 proto) if (netif_is_l3_slave(dev)) { struct net_device *master; + rcu_read_lock(); master = netdev_master_upper_dev_get_rcu(dev); if (master && master->l3mdev_ops->l3mdev_l3_out) skb = master->l3mdev_ops->l3mdev_l3_out(master, sk, skb, proto); + rcu_read_unlock(); } return skb; -- GitLab From ed6ae1f325d3c43966ec1b62ac1459e2b8e45640 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Feb 2025 13:58:39 +0000 Subject: [PATCH 425/989] ndisc: extend RCU protection in ndisc_send_skb() ndisc_send_skb() can be called without RTNL or RCU held. Acquire rcu_read_lock() earlier, so that we can use dev_net_rcu() and avoid a potential UAF. Fixes: 1762f7e88eb3 ("[NETNS][IPV6] ndisc - make socket control per namespace") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250207135841.1948589-8-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ndisc.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 90f8aa2d7af2e..8699d1a188dc4 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -471,16 +471,20 @@ static void ip6_nd_hdr(struct sk_buff *skb, void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, const struct in6_addr *saddr) { + struct icmp6hdr *icmp6h = icmp6_hdr(skb); struct dst_entry *dst = skb_dst(skb); - struct net *net = dev_net(skb->dev); - struct sock *sk = net->ipv6.ndisc_sk; struct inet6_dev *idev; + struct net *net; + struct sock *sk; int err; - struct icmp6hdr *icmp6h = icmp6_hdr(skb); u8 type; type = icmp6h->icmp6_type; + rcu_read_lock(); + + net = dev_net_rcu(skb->dev); + sk = net->ipv6.ndisc_sk; if (!dst) { struct flowi6 fl6; int oif = skb->dev->ifindex; @@ -488,6 +492,7 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif); dst = icmp6_dst_alloc(skb->dev, &fl6); if (IS_ERR(dst)) { + rcu_read_unlock(); kfree_skb(skb); return; } @@ -502,7 +507,6 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len); - rcu_read_lock(); idev = __in6_dev_get(dst->dev); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); -- GitLab From 087c1faa594fa07a66933d750c0b2610aa1a2946 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Feb 2025 13:58:40 +0000 Subject: [PATCH 426/989] ipv6: mcast: extend RCU protection in igmp6_send() igmp6_send() can be called without RTNL or RCU being held. Extend RCU protection so that we can safely fetch the net pointer and avoid a potential UAF. Note that we no longer can use sock_alloc_send_skb() because ipv6.igmp_sk uses GFP_KERNEL allocations which can sleep. Instead use alloc_skb() and charge the net->ipv6.igmp_sk socket under RCU protection. 
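The extra wrinkle here, compared with the other RCU conversions in this series, is the allocation: GFP_KERNEL may sleep, and sleeping is not allowed inside an RCU read-side critical section, so the skb has to be allocated before rcu_read_lock() and only the per-net dereferences happen inside the section. A schematic kernel-style sketch of that ordering, again a pattern illustration rather than a complete function:

static struct sk_buff *alloc_and_charge(struct net_device *dev, unsigned int len)
{
	struct sk_buff *skb;

	skb = alloc_skb(len, GFP_KERNEL);	/* may sleep: must stay outside the section */

	rcu_read_lock();
	if (skb)
		skb_set_owner_w(skb, dev_net_rcu(dev)->ipv6.igmp_sk);
	rcu_read_unlock();

	return skb;
}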
Fixes: b8ad0cbc58f7 ("[NETNS][IPV6] mcast - handle several network namespace") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250207135841.1948589-9-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/mcast.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 9dfdb40988b0f..81a739ebf7094 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -2165,21 +2165,21 @@ static void mld_send_cr(struct inet6_dev *idev) static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) { - struct net *net = dev_net(dev); - struct sock *sk = net->ipv6.igmp_sk; + const struct in6_addr *snd_addr, *saddr; + int err, len, payload_len, full_len; + struct in6_addr addr_buf; struct inet6_dev *idev; struct sk_buff *skb; struct mld_msg *hdr; - const struct in6_addr *snd_addr, *saddr; - struct in6_addr addr_buf; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - int err, len, payload_len, full_len; u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; - struct flowi6 fl6; struct dst_entry *dst; + struct flowi6 fl6; + struct net *net; + struct sock *sk; if (type == ICMPV6_MGM_REDUCTION) snd_addr = &in6addr_linklocal_allrouters; @@ -2190,19 +2190,21 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) payload_len = len + sizeof(ra); full_len = sizeof(struct ipv6hdr) + payload_len; - rcu_read_lock(); - IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_OUTREQUESTS); - rcu_read_unlock(); + skb = alloc_skb(hlen + tlen + full_len, GFP_KERNEL); - skb = sock_alloc_send_skb(sk, hlen + tlen + full_len, 1, &err); + rcu_read_lock(); + net = dev_net_rcu(dev); + idev = __in6_dev_get(dev); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); if (!skb) { - rcu_read_lock(); - IP6_INC_STATS(net, __in6_dev_get(dev), - IPSTATS_MIB_OUTDISCARDS); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); rcu_read_unlock(); return; } + sk = net->ipv6.igmp_sk; + skb_set_owner_w(skb, sk); + skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, hlen); @@ -2227,9 +2229,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) IPPROTO_ICMPV6, csum_partial(hdr, len, 0)); - rcu_read_lock(); - idev = __in6_dev_get(skb->dev); - icmpv6_flow_init(sk, &fl6, type, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->dev->ifindex); -- GitLab From de1d0d160f64ee76df1d364d521b2faf465a091c Mon Sep 17 00:00:00 2001 From: Artur Weber Date: Thu, 6 Feb 2025 18:46:00 +0100 Subject: [PATCH 427/989] gpio: bcm-kona: Fix GPIO lock/unlock for banks above bank 0 The GPIO lock/unlock functions clear/write a bit to the relevant register for each bank. However, due to an oversight the bit that was being written was based on the total GPIO number, not the index of the GPIO within the relevant bank, causing it to fail for any GPIO above 32 (thus any GPIO for banks above bank 0). Fix lock/unlock for these banks by using the correct bit. 
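The arithmetic behind the fix is easy to check in isolation: with 32 GPIOs per bank, the register bit for a GPIO is its index within the bank, not its global number, and a shift by a global number above 31 does not even fit a 32-bit register. A standalone sketch; the bank/bit macros here are assumptions modelled on the driver's 32-per-bank layout:

#include <stdio.h>
#include <stdint.h>

#define GPIO_PER_BANK	32
#define GPIO_BANK(g)	((g) / GPIO_PER_BANK)
#define GPIO_BIT(g)	((g) % GPIO_PER_BANK)

int main(void)
{
	unsigned int gpio = 35;
	uint32_t mask = (uint32_t)1 << GPIO_BIT(gpio);	/* correct: bit 3 of bank 1 */

	printf("gpio %u -> bank %u, bit %u, mask 0x%08x\n",
	       gpio, GPIO_BANK(gpio), GPIO_BIT(gpio), mask);
	/* prints: gpio 35 -> bank 1, bit 3, mask 0x00000008 */
	return 0;
}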
Fixes: bdb93c03c550 ("gpio: bcm281xx: Centralize register locking") Reviewed-by: Florian Fainelli Reviewed-by: Markus Mayer Signed-off-by: Artur Weber Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250206-kona-gpio-fixes-v2-1-409135eab780@gmail.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-bcm-kona.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-bcm-kona.c b/drivers/gpio/gpio-bcm-kona.c index 5321ef98f4427..77bd4ec93a231 100644 --- a/drivers/gpio/gpio-bcm-kona.c +++ b/drivers/gpio/gpio-bcm-kona.c @@ -86,11 +86,12 @@ static void bcm_kona_gpio_lock_gpio(struct bcm_kona_gpio *kona_gpio, u32 val; unsigned long flags; int bank_id = GPIO_BANK(gpio); + int bit = GPIO_BIT(gpio); raw_spin_lock_irqsave(&kona_gpio->lock, flags); val = readl(kona_gpio->reg_base + GPIO_PWD_STATUS(bank_id)); - val |= BIT(gpio); + val |= BIT(bit); bcm_kona_gpio_write_lock_regs(kona_gpio->reg_base, bank_id, val); raw_spin_unlock_irqrestore(&kona_gpio->lock, flags); @@ -102,11 +103,12 @@ static void bcm_kona_gpio_unlock_gpio(struct bcm_kona_gpio *kona_gpio, u32 val; unsigned long flags; int bank_id = GPIO_BANK(gpio); + int bit = GPIO_BIT(gpio); raw_spin_lock_irqsave(&kona_gpio->lock, flags); val = readl(kona_gpio->reg_base + GPIO_PWD_STATUS(bank_id)); - val &= ~BIT(gpio); + val &= ~BIT(bit); bcm_kona_gpio_write_lock_regs(kona_gpio->reg_base, bank_id, val); raw_spin_unlock_irqrestore(&kona_gpio->lock, flags); -- GitLab From 57f5db77a915cc29461a679a6bcae7097967be1a Mon Sep 17 00:00:00 2001 From: Artur Weber Date: Thu, 6 Feb 2025 18:46:01 +0100 Subject: [PATCH 428/989] gpio: bcm-kona: Make sure GPIO bits are unlocked when requesting IRQ The settings for all GPIOs are locked by default in bcm_kona_gpio_reset. The settings for a GPIO are unlocked when requesting it as a GPIO, but not when requesting it as an interrupt, causing the IRQ settings to not get applied. Fix this by making sure to unlock the right bits when an IRQ is requested. To avoid a situation where an IRQ being released causes a lock despite the same GPIO being used by a GPIO request or vice versa, add an unlock counter and only lock if it reaches 0. Fixes: 757651e3d60e ("gpio: bcm281xx: Add GPIO driver") Reviewed-by: Florian Fainelli Reviewed-by: Markus Mayer Signed-off-by: Artur Weber Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250206-kona-gpio-fixes-v2-2-409135eab780@gmail.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-bcm-kona.c | 67 +++++++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 12 deletions(-) diff --git a/drivers/gpio/gpio-bcm-kona.c b/drivers/gpio/gpio-bcm-kona.c index 77bd4ec93a231..17f3f210fee9d 100644 --- a/drivers/gpio/gpio-bcm-kona.c +++ b/drivers/gpio/gpio-bcm-kona.c @@ -69,6 +69,22 @@ struct bcm_kona_gpio { struct bcm_kona_gpio_bank { int id; int irq; + /* + * Used to keep track of lock/unlock operations for each GPIO in the + * bank. + * + * All GPIOs are locked by default (see bcm_kona_gpio_reset), and the + * unlock count for all GPIOs is 0 by default. Each unlock increments + * the counter, and each lock decrements the counter. + * + * The lock function only locks the GPIO once its unlock counter is + * down to 0. This is necessary because the GPIO is unlocked in two + * places in this driver: once for requested GPIOs, and once for + * requested IRQs. Since it is possible for a GPIO to be requested + * as both a GPIO and an IRQ, we need to ensure that we don't lock it + * too early. 
+ */ + u8 gpio_unlock_count[GPIO_PER_BANK]; /* Used in the interrupt handler */ struct bcm_kona_gpio *kona_gpio; }; @@ -87,14 +103,23 @@ static void bcm_kona_gpio_lock_gpio(struct bcm_kona_gpio *kona_gpio, unsigned long flags; int bank_id = GPIO_BANK(gpio); int bit = GPIO_BIT(gpio); + struct bcm_kona_gpio_bank *bank = &kona_gpio->banks[bank_id]; - raw_spin_lock_irqsave(&kona_gpio->lock, flags); + if (bank->gpio_unlock_count[bit] == 0) { + dev_err(kona_gpio->gpio_chip.parent, + "Unbalanced locks for GPIO %u\n", gpio); + return; + } - val = readl(kona_gpio->reg_base + GPIO_PWD_STATUS(bank_id)); - val |= BIT(bit); - bcm_kona_gpio_write_lock_regs(kona_gpio->reg_base, bank_id, val); + if (--bank->gpio_unlock_count[bit] == 0) { + raw_spin_lock_irqsave(&kona_gpio->lock, flags); - raw_spin_unlock_irqrestore(&kona_gpio->lock, flags); + val = readl(kona_gpio->reg_base + GPIO_PWD_STATUS(bank_id)); + val |= BIT(bit); + bcm_kona_gpio_write_lock_regs(kona_gpio->reg_base, bank_id, val); + + raw_spin_unlock_irqrestore(&kona_gpio->lock, flags); + } } static void bcm_kona_gpio_unlock_gpio(struct bcm_kona_gpio *kona_gpio, @@ -104,14 +129,19 @@ static void bcm_kona_gpio_unlock_gpio(struct bcm_kona_gpio *kona_gpio, unsigned long flags; int bank_id = GPIO_BANK(gpio); int bit = GPIO_BIT(gpio); + struct bcm_kona_gpio_bank *bank = &kona_gpio->banks[bank_id]; - raw_spin_lock_irqsave(&kona_gpio->lock, flags); + if (bank->gpio_unlock_count[bit] == 0) { + raw_spin_lock_irqsave(&kona_gpio->lock, flags); - val = readl(kona_gpio->reg_base + GPIO_PWD_STATUS(bank_id)); - val &= ~BIT(bit); - bcm_kona_gpio_write_lock_regs(kona_gpio->reg_base, bank_id, val); + val = readl(kona_gpio->reg_base + GPIO_PWD_STATUS(bank_id)); + val &= ~BIT(bit); + bcm_kona_gpio_write_lock_regs(kona_gpio->reg_base, bank_id, val); - raw_spin_unlock_irqrestore(&kona_gpio->lock, flags); + raw_spin_unlock_irqrestore(&kona_gpio->lock, flags); + } + + ++bank->gpio_unlock_count[bit]; } static int bcm_kona_gpio_get_dir(struct gpio_chip *chip, unsigned gpio) @@ -362,6 +392,7 @@ static void bcm_kona_gpio_irq_mask(struct irq_data *d) kona_gpio = irq_data_get_irq_chip_data(d); reg_base = kona_gpio->reg_base; + raw_spin_lock_irqsave(&kona_gpio->lock, flags); val = readl(reg_base + GPIO_INT_MASK(bank_id)); @@ -384,6 +415,7 @@ static void bcm_kona_gpio_irq_unmask(struct irq_data *d) kona_gpio = irq_data_get_irq_chip_data(d); reg_base = kona_gpio->reg_base; + raw_spin_lock_irqsave(&kona_gpio->lock, flags); val = readl(reg_base + GPIO_INT_MSKCLR(bank_id)); @@ -479,15 +511,26 @@ static void bcm_kona_gpio_irq_handler(struct irq_desc *desc) static int bcm_kona_gpio_irq_reqres(struct irq_data *d) { struct bcm_kona_gpio *kona_gpio = irq_data_get_irq_chip_data(d); + unsigned int gpio = d->hwirq; + + /* + * We need to unlock the GPIO before any other operations are performed + * on the relevant GPIO configuration registers + */ + bcm_kona_gpio_unlock_gpio(kona_gpio, gpio); - return gpiochip_reqres_irq(&kona_gpio->gpio_chip, d->hwirq); + return gpiochip_reqres_irq(&kona_gpio->gpio_chip, gpio); } static void bcm_kona_gpio_irq_relres(struct irq_data *d) { struct bcm_kona_gpio *kona_gpio = irq_data_get_irq_chip_data(d); + unsigned int gpio = d->hwirq; + + /* Once we no longer use it, lock the GPIO again */ + bcm_kona_gpio_lock_gpio(kona_gpio, gpio); - gpiochip_relres_irq(&kona_gpio->gpio_chip, d->hwirq); + gpiochip_relres_irq(&kona_gpio->gpio_chip, gpio); } static struct irq_chip bcm_gpio_irq_chip = { -- GitLab From 615279db222c3ac56d5c93716efd72b843295c1f Mon Sep 17 00:00:00 
2001 From: Artur Weber Date: Thu, 6 Feb 2025 18:46:02 +0100 Subject: [PATCH 429/989] gpio: bcm-kona: Add missing newline to dev_err format string Add a missing newline to the format string of the "Couldn't get IRQ for bank..." error message. Fixes: 757651e3d60e ("gpio: bcm281xx: Add GPIO driver") Reviewed-by: Florian Fainelli Reviewed-by: Markus Mayer Signed-off-by: Artur Weber Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250206-kona-gpio-fixes-v2-3-409135eab780@gmail.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-bcm-kona.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-bcm-kona.c b/drivers/gpio/gpio-bcm-kona.c index 17f3f210fee9d..64908f1a5e7f9 100644 --- a/drivers/gpio/gpio-bcm-kona.c +++ b/drivers/gpio/gpio-bcm-kona.c @@ -659,7 +659,7 @@ static int bcm_kona_gpio_probe(struct platform_device *pdev) bank->irq = platform_get_irq(pdev, i); bank->kona_gpio = kona_gpio; if (bank->irq < 0) { - dev_err(dev, "Couldn't get IRQ for bank %d", i); + dev_err(dev, "Couldn't get IRQ for bank %d\n", i); ret = -ENOENT; goto err_irq_domain; } -- GitLab From 7b07b040257c1b658ef3eca86e4b6ae02d65069c Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 7 Feb 2025 10:39:02 +0100 Subject: [PATCH 430/989] ptp: vmclock: Add .owner to vmclock_miscdev_fops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without the .owner field, the module can be unloaded while /dev/vmclock0 is open, leading to an oops. Fixes: 205032724226 ("ptp: Add support for the AMZNC10C 'vmclock' device") Cc: stable@vger.kernel.org Signed-off-by: David Woodhouse Signed-off-by: Thomas Weißschuh Signed-off-by: Paolo Abeni --- drivers/ptp/ptp_vmclock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c index 0a2cfc8ad3c54..dbc73e5382935 100644 --- a/drivers/ptp/ptp_vmclock.c +++ b/drivers/ptp/ptp_vmclock.c @@ -414,6 +414,7 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, } static const struct file_operations vmclock_miscdev_fops = { + .owner = THIS_MODULE, .mmap = vmclock_miscdev_mmap, .read = vmclock_miscdev_read, }; -- GitLab From f7d07cd4f77d77f366c8ffbb8ba8b61f614e5fce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Fri, 7 Feb 2025 10:39:03 +0100 Subject: [PATCH 431/989] ptp: vmclock: Set driver data before its usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If vmclock_ptp_register() fails during probing, vmclock_remove() is called to clean up the ptp clock and misc device. It uses dev_get_drvdata() to access the vmclock state. However the driver data is not yet set at this point. Assign the driver data earlier. 
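The ordering rule this fix restores is generic to probe routines: the per-device state has to be published with dev_set_drvdata() before the first step whose failure path, or whose registered cleanup, reads it back with dev_get_drvdata(). A schematic kernel-style sketch of that ordering, with hypothetical names, shown only to illustrate the pattern:

static int example_probe(struct platform_device *pdev)
{
	struct example_state *st;		/* hypothetical per-device state */

	st = devm_kzalloc(&pdev->dev, sizeof(*st), GFP_KERNEL);
	if (!st)
		return -ENOMEM;

	dev_set_drvdata(&pdev->dev, st);	/* publish before anything can fail */

	return example_register(st);		/* its failure path may read drvdata */
}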
Fixes: 205032724226 ("ptp: Add support for the AMZNC10C 'vmclock' device") Cc: stable@vger.kernel.org Signed-off-by: Thomas Weißschuh Reviewed-by: Mateusz Polchlopek Acked-by: Richard Cochran Reviewed-by: David Woodhouse Signed-off-by: Paolo Abeni --- drivers/ptp/ptp_vmclock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c index dbc73e5382935..1ba30a2da570f 100644 --- a/drivers/ptp/ptp_vmclock.c +++ b/drivers/ptp/ptp_vmclock.c @@ -525,6 +525,8 @@ static int vmclock_probe(struct platform_device *pdev) goto out; } + dev_set_drvdata(dev, st); + if (le32_to_cpu(st->clk->magic) != VMCLOCK_MAGIC || le32_to_cpu(st->clk->size) > resource_size(&st->res) || le16_to_cpu(st->clk->version) != 1) { @@ -588,8 +590,6 @@ static int vmclock_probe(struct platform_device *pdev) (st->miscdev.minor && st->ptp_clock) ? ", " : "", st->ptp_clock ? "PTP" : ""); - dev_set_drvdata(dev, st); - out: return ret; } -- GitLab From 39e926c3a21b25af6cae479fbb752f193240ce03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Fri, 7 Feb 2025 10:39:04 +0100 Subject: [PATCH 432/989] ptp: vmclock: Don't unregister misc device if it was not registered MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vmclock_remove() tries to detect the successful registration of the misc device based on the value of its minor value. However that check is incorrect if the misc device registration was not attempted in the first place. Always initialize the minor number, so the check works properly. Fixes: 205032724226 ("ptp: Add support for the AMZNC10C 'vmclock' device") Cc: stable@vger.kernel.org Signed-off-by: Thomas Weißschuh Acked-by: Richard Cochran Reviewed-by: David Woodhouse Signed-off-by: Paolo Abeni --- drivers/ptp/ptp_vmclock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c index 1ba30a2da570f..9b8bd626a3973 100644 --- a/drivers/ptp/ptp_vmclock.c +++ b/drivers/ptp/ptp_vmclock.c @@ -550,6 +550,8 @@ static int vmclock_probe(struct platform_device *pdev) goto out; } + st->miscdev.minor = MISC_DYNAMIC_MINOR; + /* * If the structure is big enough, it can be mapped to userspace. * Theoretically a guest OS even using larger pages could still @@ -557,7 +559,6 @@ static int vmclock_probe(struct platform_device *pdev) * cross that bridge if/when we come to it. */ if (le32_to_cpu(st->clk->size) >= PAGE_SIZE) { - st->miscdev.minor = MISC_DYNAMIC_MINOR; st->miscdev.fops = &vmclock_miscdev_fops; st->miscdev.name = st->name; -- GitLab From 9a884c3800b207bac36e27be4ec7277c78a84568 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Fri, 7 Feb 2025 10:39:05 +0100 Subject: [PATCH 433/989] ptp: vmclock: Clean up miscdev and ptp clock through devres MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most resources owned by the vmclock device are managed through devres. Only the miscdev and ptp clock are managed manually. This makes the code slightly harder to understand than necessary. Switch them over to devres and remove the now unnecessary drvdata. 
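Converting the remaining manual teardown to devres means registering it with devm_add_action_or_reset(): the action runs automatically, in reverse registration order, when probing fails later on or when the device goes away, and it is invoked immediately if the registration itself fails. A schematic sketch of that pattern with hypothetical names:

static void example_teardown(void *data)
{
	struct example_state *st = data;	/* hypothetical per-device state */

	example_release(st);			/* hypothetical release helper */
}

static int example_init(struct device *dev, struct example_state *st)
{
	/* on registration failure the action has already been executed */
	return devm_add_action_or_reset(dev, example_teardown, st);
}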
Signed-off-by: Thomas Weißschuh Acked-by: Richard Cochran Reviewed-by: David Woodhouse Signed-off-by: Paolo Abeni --- drivers/ptp/ptp_vmclock.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c index 9b8bd626a3973..09c023fb94b7e 100644 --- a/drivers/ptp/ptp_vmclock.c +++ b/drivers/ptp/ptp_vmclock.c @@ -421,10 +421,9 @@ static const struct file_operations vmclock_miscdev_fops = { /* module operations */ -static void vmclock_remove(struct platform_device *pdev) +static void vmclock_remove(void *data) { - struct device *dev = &pdev->dev; - struct vmclock_state *st = dev_get_drvdata(dev); + struct vmclock_state *st = data; if (st->ptp_clock) ptp_clock_unregister(st->ptp_clock); @@ -525,8 +524,6 @@ static int vmclock_probe(struct platform_device *pdev) goto out; } - dev_set_drvdata(dev, st); - if (le32_to_cpu(st->clk->magic) != VMCLOCK_MAGIC || le32_to_cpu(st->clk->size) > resource_size(&st->res) || le16_to_cpu(st->clk->version) != 1) { @@ -552,6 +549,10 @@ static int vmclock_probe(struct platform_device *pdev) st->miscdev.minor = MISC_DYNAMIC_MINOR; + ret = devm_add_action_or_reset(&pdev->dev, vmclock_remove, st); + if (ret) + goto out; + /* * If the structure is big enough, it can be mapped to userspace. * Theoretically a guest OS even using larger pages could still @@ -574,7 +575,6 @@ static int vmclock_probe(struct platform_device *pdev) if (IS_ERR(st->ptp_clock)) { ret = PTR_ERR(st->ptp_clock); st->ptp_clock = NULL; - vmclock_remove(pdev); goto out; } } @@ -603,7 +603,6 @@ MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids); static struct platform_driver vmclock_platform_driver = { .probe = vmclock_probe, - .remove = vmclock_remove, .driver = { .name = "vmclock", .acpi_match_table = vmclock_acpi_ids, -- GitLab From b4c1fde5ced93d9f4ad89e2c940d3fd56ad82288 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Fri, 7 Feb 2025 10:39:06 +0100 Subject: [PATCH 434/989] ptp: vmclock: Remove goto-based cleanup logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vmclock_probe() uses an "out:" label to return from the function on error. This indicates that some cleanup operation is necessary. However the label does not do anything as all resources are managed through devres, making the code slightly harder to read. Remove the label and just return directly. 
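With every resource registered through devres, an error in probe needs no unwinding of its own, so a label that merely returns is dead weight and the paths can return directly. A minimal sketch of the resulting shape, with hypothetical names:

static int example_probe_step(struct device *dev, struct example_state *st)
{
	int ret;

	ret = example_map_registers(st);	/* devm-managed: no manual undo needed */
	if (ret)
		return ret;			/* direct return instead of 'goto out' */

	return example_register_clock(st);	/* also cleaned up by devres */
}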
Signed-off-by: Thomas Weißschuh Acked-by: Richard Cochran Reviewed-by: David Woodhouse Signed-off-by: Paolo Abeni --- drivers/ptp/ptp_vmclock.c | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c index 09c023fb94b7e..b3a83b03d9c14 100644 --- a/drivers/ptp/ptp_vmclock.c +++ b/drivers/ptp/ptp_vmclock.c @@ -506,14 +506,13 @@ static int vmclock_probe(struct platform_device *pdev) if (ret) { dev_info(dev, "Failed to obtain physical address: %d\n", ret); - goto out; + return ret; } if (resource_size(&st->res) < VMCLOCK_MIN_SIZE) { dev_info(dev, "Region too small (0x%llx)\n", resource_size(&st->res)); - ret = -EINVAL; - goto out; + return -EINVAL; } st->clk = devm_memremap(dev, st->res.start, resource_size(&st->res), MEMREMAP_WB | MEMREMAP_DEC); @@ -521,37 +520,34 @@ static int vmclock_probe(struct platform_device *pdev) ret = PTR_ERR(st->clk); dev_info(dev, "failed to map shared memory\n"); st->clk = NULL; - goto out; + return ret; } if (le32_to_cpu(st->clk->magic) != VMCLOCK_MAGIC || le32_to_cpu(st->clk->size) > resource_size(&st->res) || le16_to_cpu(st->clk->version) != 1) { dev_info(dev, "vmclock magic fields invalid\n"); - ret = -EINVAL; - goto out; + return -EINVAL; } ret = ida_alloc(&vmclock_ida, GFP_KERNEL); if (ret < 0) - goto out; + return ret; st->index = ret; ret = devm_add_action_or_reset(&pdev->dev, vmclock_put_idx, st); if (ret) - goto out; + return ret; st->name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "vmclock%d", st->index); - if (!st->name) { - ret = -ENOMEM; - goto out; - } + if (!st->name) + return -ENOMEM; st->miscdev.minor = MISC_DYNAMIC_MINOR; ret = devm_add_action_or_reset(&pdev->dev, vmclock_remove, st); if (ret) - goto out; + return ret; /* * If the structure is big enough, it can be mapped to userspace. @@ -565,7 +561,7 @@ static int vmclock_probe(struct platform_device *pdev) ret = misc_register(&st->miscdev); if (ret) - goto out; + return ret; } /* If there is valid clock information, register a PTP clock */ @@ -575,15 +571,14 @@ static int vmclock_probe(struct platform_device *pdev) if (IS_ERR(st->ptp_clock)) { ret = PTR_ERR(st->ptp_clock); st->ptp_clock = NULL; - goto out; + return ret; } } if (!st->miscdev.minor && !st->ptp_clock) { /* Neither miscdev nor PTP registered */ dev_info(dev, "vmclock: Neither miscdev nor PTP available; not registering\n"); - ret = -ENODEV; - goto out; + return -ENODEV; } dev_info(dev, "%s: registered %s%s%s\n", st->name, @@ -591,8 +586,7 @@ static int vmclock_probe(struct platform_device *pdev) (st->miscdev.minor && st->ptp_clock) ? ", " : "", st->ptp_clock ? "PTP" : ""); - out: - return ret; + return 0; } static const struct acpi_device_id vmclock_acpi_ids[] = { -- GitLab From 86ede0a61f8576a84bb0a93c5d9861d2ec1cdf9a Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Sun, 9 Feb 2025 15:09:38 +0100 Subject: [PATCH 435/989] mtd: rawnand: qcom: fix broken config in qcom_param_page_type_exec Fix broken config in qcom_param_page_type_exec caused by copy-paste error from commit 0c08080fd71c ("mtd: rawnand: qcom: use FIELD_PREP and GENMASK") In qcom_param_page_type_exec the value needs to be set to nandc->regs->cfg0 instead of host->cfg0. 
This wrong configuration caused the Qcom NANDC driver to malfunction on any device that makes use of it (IPQ806x, IPQ40xx, IPQ807x, IPQ60xx) with the following error: [ 0.885369] nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xaa [ 0.885909] nand: Micron NAND 256MiB 1,8V 8-bit [ 0.892499] nand: 256 MiB, SLC, erase size: 128 KiB, page size: 2048, OOB size: 64 [ 0.896823] nand: ECC (step, strength) = (512, 8) does not fit in OOB [ 0.896836] qcom-nandc 79b0000.nand-controller: No valid ECC settings possible [ 0.910996] bam-dma-engine 7984000.dma-controller: Cannot free busy channel [ 0.918070] qcom-nandc: probe of 79b0000.nand-controller failed with error -28 Restore original configuration fix the problem and makes the driver work again. Also restore the wrongly dropped cpu_to_le32 to correctly support BE systems. Cc: stable@vger.kernel.org Fixes: 0c08080fd71c ("mtd: rawnand: qcom: use FIELD_PREP and GENMASK") Tested-by: Robert Marko # IPQ8074 and IPQ6018 Signed-off-by: Christian Marangi Reviewed-by: Manivannan Sadhasivam Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/qcom_nandc.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/mtd/nand/raw/qcom_nandc.c b/drivers/mtd/nand/raw/qcom_nandc.c index d2d2aeee42a7e..6720b547892ba 100644 --- a/drivers/mtd/nand/raw/qcom_nandc.c +++ b/drivers/mtd/nand/raw/qcom_nandc.c @@ -1881,18 +1881,18 @@ static int qcom_param_page_type_exec(struct nand_chip *chip, const struct nand_ nandc->regs->addr0 = 0; nandc->regs->addr1 = 0; - host->cfg0 = FIELD_PREP(CW_PER_PAGE_MASK, 0) | - FIELD_PREP(UD_SIZE_BYTES_MASK, 512) | - FIELD_PREP(NUM_ADDR_CYCLES_MASK, 5) | - FIELD_PREP(SPARE_SIZE_BYTES_MASK, 0); - - host->cfg1 = FIELD_PREP(NAND_RECOVERY_CYCLES_MASK, 7) | - FIELD_PREP(BAD_BLOCK_BYTE_NUM_MASK, 17) | - FIELD_PREP(CS_ACTIVE_BSY, 0) | - FIELD_PREP(BAD_BLOCK_IN_SPARE_AREA, 1) | - FIELD_PREP(WR_RD_BSY_GAP_MASK, 2) | - FIELD_PREP(WIDE_FLASH, 0) | - FIELD_PREP(DEV0_CFG1_ECC_DISABLE, 1); + nandc->regs->cfg0 = cpu_to_le32(FIELD_PREP(CW_PER_PAGE_MASK, 0) | + FIELD_PREP(UD_SIZE_BYTES_MASK, 512) | + FIELD_PREP(NUM_ADDR_CYCLES_MASK, 5) | + FIELD_PREP(SPARE_SIZE_BYTES_MASK, 0)); + + nandc->regs->cfg1 = cpu_to_le32(FIELD_PREP(NAND_RECOVERY_CYCLES_MASK, 7) | + FIELD_PREP(BAD_BLOCK_BYTE_NUM_MASK, 17) | + FIELD_PREP(CS_ACTIVE_BSY, 0) | + FIELD_PREP(BAD_BLOCK_IN_SPARE_AREA, 1) | + FIELD_PREP(WR_RD_BSY_GAP_MASK, 2) | + FIELD_PREP(WIDE_FLASH, 0) | + FIELD_PREP(DEV0_CFG1_ECC_DISABLE, 1)); if (!nandc->props->qpic_version2) nandc->regs->ecc_buf_cfg = cpu_to_le32(ECC_CFG_ECC_DISABLE); -- GitLab From db79e75460fc59b19f9c89d4b068e61cee59f37d Mon Sep 17 00:00:00 2001 From: "Chester A. Unal" Date: Fri, 24 Jan 2025 10:28:00 +0000 Subject: [PATCH 436/989] USB: serial: option: add MeiG Smart SLM828 MeiG Smart SLM828 is an LTE-A CAT6 modem with the mPCIe form factor. The "Cls=ff(vend.) Sub=10 Prot=02" and "Cls=ff(vend.) Sub=10 Prot=03" interfaces respond to AT commands. Add these interfaces. The product ID the modem uses is shared across multiple modems. Therefore, add comments to describe which interface is used for which modem. T: Bus=01 Lev=01 Prnt=05 Port=01 Cnt=01 Dev#= 6 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=2dee ProdID=4d22 Rev=05.04 S: Manufacturer=MEIG S: Product=LTE-A Module S: SerialNumber=4da7ec42 C: #Ifs= 6 Cfg#= 1 Atr=80 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) 
Sub=10 Prot=01 Driver=(none) E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=10 Prot=02 Driver=(none) E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=10 Prot=03 Driver=(none) E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=10 Prot=04 Driver=(none) E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) E: Ad=88(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=10 Prot=05 Driver=qmi_wwan E: Ad=0f(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=89(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Chester A. Unal Link: https://lore.kernel.org/20250124-for-johan-meig-slm828-v2-1-6b4cd3f6344f@arinc9.com Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 1e2ae0c6c41c7..887a1c687b52c 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -621,7 +621,10 @@ static void option_instat_callback(struct urb *urb); /* MeiG Smart Technology products */ #define MEIGSMART_VENDOR_ID 0x2dee -/* MeiG Smart SRM815/SRM825L based on Qualcomm 315 */ +/* + * MeiG Smart SLM828, SRM815, and SRM825L use the same product ID. SLM828 is + * based on Qualcomm SDX12. SRM815 and SRM825L are based on Qualcomm 315. 
+ */ #define MEIGSMART_PRODUCT_SRM825L 0x4d22 /* MeiG Smart SLM320 based on UNISOC UIS8910 */ #define MEIGSMART_PRODUCT_SLM320 0x4d41 @@ -2405,10 +2408,12 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, LUAT_PRODUCT_AIR720U, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SLM320, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SLM770A, 0xff, 0, 0) }, - { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0, 0) }, - { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x30) }, - { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x40) }, - { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x60) }, + { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0, 0) }, /* MeiG Smart SRM815 */ + { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0x10, 0x02) }, /* MeiG Smart SLM828 */ + { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0x10, 0x03) }, /* MeiG Smart SLM828 */ + { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x30) }, /* MeiG Smart SRM815 and SRM825L */ + { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x40) }, /* MeiG Smart SRM825L */ + { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x60) }, /* MeiG Smart SRM825L */ { USB_DEVICE_INTERFACE_CLASS(0x1bbb, 0x0530, 0xff), /* TCL IK512 MBIM */ .driver_info = NCTRL(1) }, { USB_DEVICE_INTERFACE_CLASS(0x1bbb, 0x0640, 0xff), /* TCL IK512 ECM */ -- GitLab From 5728c92ae112301936006c5e305677beb1a7f578 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Fri, 24 Jan 2025 13:16:44 -0600 Subject: [PATCH 437/989] mfd: syscon: Restore device_node_to_regmap() for non-syscon nodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit ba5095ebbc7a ("mfd: syscon: Allow syscon nodes without a "syscon" compatible") broke drivers which call device_node_to_regmap() on nodes without a "syscon" compatible. Restore the prior behavior for device_node_to_regmap(). This also makes using device_node_to_regmap() incompatible with of_syscon_register_regmap() again, so add kerneldoc for device_node_to_regmap() and syscon_node_to_regmap() to make it clear how and when each one should be used. Fixes: ba5095ebbc7a ("mfd: syscon: Allow syscon nodes without a "syscon" compatible") Reported-by: Vaishnav Achath Signed-off-by: Rob Herring (Arm) Reviewed-by: Daniel Golle Reviewed-by: AngeloGioacchino Del Regno Tested-by: Chen-Yu Tsai Tested-by: Nishanth Menon Tested-by: Daniel Golle Tested-by: Frank Wunderlich Tested-by: Dhruva Gole Tested-by: Nícolas F. R. A. 
Prado Tested-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20250124191644.2309790-1-robh@kernel.org Signed-off-by: Lee Jones --- drivers/mfd/syscon.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/mfd/syscon.c b/drivers/mfd/syscon.c index 226915ca3c93d..aa4a9940b569a 100644 --- a/drivers/mfd/syscon.c +++ b/drivers/mfd/syscon.c @@ -159,6 +159,7 @@ static struct syscon *of_syscon_register(struct device_node *np, bool check_res) } static struct regmap *device_node_get_regmap(struct device_node *np, + bool create_regmap, bool check_res) { struct syscon *entry, *syscon = NULL; @@ -172,7 +173,7 @@ static struct regmap *device_node_get_regmap(struct device_node *np, } if (!syscon) { - if (of_device_is_compatible(np, "syscon")) + if (create_regmap) syscon = of_syscon_register(np, check_res); else syscon = ERR_PTR(-EINVAL); @@ -233,15 +234,37 @@ int of_syscon_register_regmap(struct device_node *np, struct regmap *regmap) } EXPORT_SYMBOL_GPL(of_syscon_register_regmap); +/** + * device_node_to_regmap() - Get or create a regmap for specified device node + * @np: Device tree node + * + * Get a regmap for the specified device node. If there's not an existing + * regmap, then one is instantiated. This function should not be used if the + * device node has a custom regmap driver or has resources (clocks, resets) to + * be managed. Use syscon_node_to_regmap() instead for those cases. + * + * Return: regmap ptr on success, negative error code on failure. + */ struct regmap *device_node_to_regmap(struct device_node *np) { - return device_node_get_regmap(np, false); + return device_node_get_regmap(np, true, false); } EXPORT_SYMBOL_GPL(device_node_to_regmap); +/** + * syscon_node_to_regmap() - Get or create a regmap for specified syscon device node + * @np: Device tree node + * + * Get a regmap for the specified device node. If there's not an existing + * regmap, then one is instantiated if the node is a generic "syscon". This + * function is safe to use for a syscon registered with + * of_syscon_register_regmap(). + * + * Return: regmap ptr on success, negative error code on failure. + */ struct regmap *syscon_node_to_regmap(struct device_node *np) { - return device_node_get_regmap(np, true); + return device_node_get_regmap(np, of_device_is_compatible(np, "syscon"), true); } EXPORT_SYMBOL_GPL(syscon_node_to_regmap); -- GitLab From c979fb5ece2dc11cc9cc3d5c66f750e210bfdee2 Mon Sep 17 00:00:00 2001 From: Fabio Porcedda Date: Wed, 5 Feb 2025 18:16:45 +0100 Subject: [PATCH 438/989] USB: serial: option: add Telit Cinterion FN990B compositions Add the following Telit Cinterion FN990B40 compositions: 0x10d0: rmnet + tty (AT/NMEA) + tty (AT) + tty (AT) + tty (AT) + tty (diag) + DPL + QDSS (Qualcomm Debug SubSystem) + adb T: Bus=01 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 17 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10d0 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FN990 S: SerialNumber=43b38f19 C: #Ifs= 9 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=60 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) 
Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=88(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8a(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8b(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 6 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=80 Driver=(none) E: Ad=8c(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 7 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=70 Driver=(none) E: Ad=8d(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 8 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=usbfs E: Ad=07(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x10d1: MBIM + tty (AT/NMEA) + tty (AT) + tty (AT) + tty (AT) + tty (diag) + DPL + QDSS (Qualcomm Debug SubSystem) + adb T: Bus=01 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 16 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10d1 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FN990 S: SerialNumber=43b38f19 C: #Ifs=10 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=82(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=60 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=88(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8a(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 6 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8b(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 7 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=80 Driver=(none) E: Ad=8c(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 8 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=70 Driver=(none) E: Ad=8d(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 9 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=usbfs E: Ad=07(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x10d2: RNDIS + tty (AT/NMEA) + tty (AT) + tty (AT) + tty (AT) + tty (diag) + DPL + QDSS (Qualcomm Debug SubSystem) + adb T: Bus=01 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 18 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10d2 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FN990 S: SerialNumber=43b38f19 C: #Ifs=10 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=ef(misc ) Sub=04 Prot=01 Driver=rndis_host E: Ad=82(I) Atr=03(Int.) 
MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=rndis_host E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=60 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=88(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8a(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 6 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8b(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 7 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=80 Driver=(none) E: Ad=8c(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 8 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=70 Driver=(none) E: Ad=8d(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 9 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=usbfs E: Ad=07(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms 0x10d3: ECM + tty (AT/NMEA) + tty (AT) + tty (AT) + tty (AT) + tty (diag) + DPL + QDSS (Qualcomm Debug SubSystem) + adb T: Bus=01 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#= 20 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=10d3 Rev=05.15 S: Manufacturer=Telit Cinterion S: Product=FN990 S: SerialNumber=43b38f19 C: #Ifs=10 Cfg#= 1 Atr=e0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=06 Prot=00 Driver=cdc_ether E: Ad=82(I) Atr=03(Int.) MxPS= 16 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=cdc_ether E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=60 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=88(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8a(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 6 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=06(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8b(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 7 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=80 Driver=(none) E: Ad=8c(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 8 Alt= 0 #EPs= 1 Cls=ff(vend.) Sub=ff Prot=70 Driver=(none) E: Ad=8d(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 9 Alt= 0 #EPs= 2 Cls=ff(vend.) 
Sub=42 Prot=01 Driver=usbfs E: Ad=07(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Cc: stable@vger.kernel.org Signed-off-by: Fabio Porcedda Reviewed-by: Daniele Palmas Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 887a1c687b52c..7f6eff505085d 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1406,6 +1406,22 @@ static const struct usb_device_id option_ids[] = { .driver_info = RSVD(0) | NCTRL(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x10c8, 0xff), /* Telit FE910C04 (rmnet) */ .driver_info = RSVD(0) | NCTRL(2) | RSVD(3) | RSVD(4) }, + { USB_DEVICE_INTERFACE_PROTOCOL(TELIT_VENDOR_ID, 0x10d0, 0x60) }, /* Telit FN990B (rmnet) */ + { USB_DEVICE_INTERFACE_PROTOCOL(TELIT_VENDOR_ID, 0x10d0, 0x40) }, + { USB_DEVICE_INTERFACE_PROTOCOL(TELIT_VENDOR_ID, 0x10d0, 0x30), + .driver_info = NCTRL(5) }, + { USB_DEVICE_INTERFACE_PROTOCOL(TELIT_VENDOR_ID, 0x10d1, 0x60) }, /* Telit FN990B (MBIM) */ + { USB_DEVICE_INTERFACE_PROTOCOL(TELIT_VENDOR_ID, 0x10d1, 0x40) }, + { USB_DEVICE_INTERFACE_PROTOCOL(TELIT_VENDOR_ID, 0x10d1, 0x30), + .driver_info = NCTRL(6) }, + { USB_DEVICE_INTERFACE_PROTOCOL(TELIT_VENDOR_ID, 0x10d2, 0x60) }, /* Telit FN990B (RNDIS) */ + { USB_DEVICE_INTERFACE_PROTOCOL(TELIT_VENDOR_ID, 0x10d2, 0x40) }, + { USB_DEVICE_INTERFACE_PROTOCOL(TELIT_VENDOR_ID, 0x10d2, 0x30), + .driver_info = NCTRL(6) }, + { USB_DEVICE_INTERFACE_PROTOCOL(TELIT_VENDOR_ID, 0x10d3, 0x60) }, /* Telit FN990B (ECM) */ + { USB_DEVICE_INTERFACE_PROTOCOL(TELIT_VENDOR_ID, 0x10d3, 0x40) }, + { USB_DEVICE_INTERFACE_PROTOCOL(TELIT_VENDOR_ID, 0x10d3, 0x30), + .driver_info = NCTRL(6) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910), .driver_info = NCTRL(0) | RSVD(1) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910_DUAL_MODEM), -- GitLab From 12606fe73f33647c5e79bf666833bf0b225e649d Mon Sep 17 00:00:00 2001 From: Fabio Porcedda Date: Wed, 5 Feb 2025 18:16:47 +0100 Subject: [PATCH 439/989] USB: serial: option: fix Telit Cinterion FN990A name The correct name for FN990 is FN990A so use it in order to avoid confusion with FN990B. 
Signed-off-by: Fabio Porcedda Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 7f6eff505085d..4a59a40f750a6 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1370,15 +1370,15 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(2) | RSVD(3) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1063, 0xff), /* Telit LN920 (ECM) */ .driver_info = NCTRL(0) | RSVD(1) }, - { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1070, 0xff), /* Telit FN990 (rmnet) */ + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1070, 0xff), /* Telit FN990A (rmnet) */ .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, - { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1071, 0xff), /* Telit FN990 (MBIM) */ + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1071, 0xff), /* Telit FN990A (MBIM) */ .driver_info = NCTRL(0) | RSVD(1) }, - { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1072, 0xff), /* Telit FN990 (RNDIS) */ + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1072, 0xff), /* Telit FN990A (RNDIS) */ .driver_info = NCTRL(2) | RSVD(3) }, - { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1073, 0xff), /* Telit FN990 (ECM) */ + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1073, 0xff), /* Telit FN990A (ECM) */ .driver_info = NCTRL(0) | RSVD(1) }, - { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1075, 0xff), /* Telit FN990 (PCIe) */ + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1075, 0xff), /* Telit FN990A (PCIe) */ .driver_info = RSVD(0) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1080, 0xff), /* Telit FE990 (rmnet) */ .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, -- GitLab From 1c316eb57c11fb3dc447b04ef765459cd61c8647 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Mon, 10 Feb 2025 11:04:22 +0800 Subject: [PATCH 440/989] bcachefs: Fix use after free acc->k.data should be used with the lock hold: 00221 ========= TEST generic/187 00221 run fstests generic/187 at 2025-02-09 21:08:10 00221 spectre-v4 mitigation disabled by command-line option 00222 bcachefs (vdc): starting version 1.20: directory_size opts=errors=ro 00222 bcachefs (vdc): initializing new filesystem 00222 bcachefs (vdc): going read-write 00222 bcachefs (vdc): marking superblocks 00222 bcachefs (vdc): initializing freespace 00222 bcachefs (vdc): done initializing freespace 00222 bcachefs (vdc): reading snapshots table 00222 bcachefs (vdc): reading snapshots done 00222 bcachefs (vdc): done starting filesystem 00222 bcachefs (vdc): shutting down 00222 bcachefs (vdc): going read-only 00222 bcachefs (vdc): finished waiting for writes to stop 00223 bcachefs (vdc): flushing journal and stopping allocators, journal seq 6 00223 bcachefs (vdc): flushing journal and stopping allocators complete, journal seq 8 00223 bcachefs (vdc): clean shutdown complete, journal seq 9 00223 bcachefs (vdc): marking filesystem clean 00223 bcachefs (vdc): shutdown complete 00223 bcachefs (vdc): starting version 1.20: directory_size opts=errors=ro 00223 bcachefs (vdc): initializing new filesystem 00223 bcachefs (vdc): going read-write 00223 bcachefs (vdc): marking superblocks 00223 bcachefs (vdc): initializing freespace 00223 bcachefs (vdc): done initializing freespace 00223 bcachefs (vdc): reading snapshots table 00223 bcachefs (vdc): reading snapshots done 00223 bcachefs (vdc): done starting filesystem 00244 hrtimer: interrupt took 123350440 ns 
00264 bcachefs (vdc): shutting down 00264 bcachefs (vdc): going read-only 00264 bcachefs (vdc): finished waiting for writes to stop 00264 bcachefs (vdc): flushing journal and stopping allocators, journal seq 97 00265 bcachefs (vdc): flushing journal and stopping allocators complete, journal seq 101 00265 bcachefs (vdc): clean shutdown complete, journal seq 102 00265 bcachefs (vdc): marking filesystem clean 00265 bcachefs (vdc): shutdown complete 00265 bcachefs (vdc): starting version 1.20: directory_size opts=errors=ro 00265 bcachefs (vdc): recovering from clean shutdown, journal seq 102 00265 bcachefs (vdc): accounting_read... 00265 ================================================================== 00265 done 00265 BUG: KASAN: slab-use-after-free in bch2_fs_to_text+0x12b4/0x1728 00265 bcachefs (vdc): alloc_read... done 00265 bcachefs (vdc): stripes_read... done 00265 Read of size 4 at addr ffffff80c57eac00 by task cat/7531 00265 bcachefs (vdc): snapshots_read... done 00265 00265 CPU: 6 UID: 0 PID: 7531 Comm: cat Not tainted 6.13.0-rc3-ktest-g16fc6fa3819d #14103 00265 Hardware name: linux,dummy-virt (DT) 00265 Call trace: 00265 show_stack+0x1c/0x30 (C) 00265 dump_stack_lvl+0x6c/0x80 00265 print_report+0xf8/0x5d8 00265 kasan_report+0x90/0xd0 00265 __asan_report_load4_noabort+0x1c/0x28 00265 bch2_fs_to_text+0x12b4/0x1728 00265 bch2_fs_show+0x94/0x188 00265 sysfs_kf_seq_show+0x1a4/0x348 00265 kernfs_seq_show+0x12c/0x198 00265 seq_read_iter+0x27c/0xfd0 00265 kernfs_fop_read_iter+0x390/0x4f8 00265 vfs_read+0x480/0x7f0 00265 ksys_read+0xe0/0x1e8 00265 __arm64_sys_read+0x70/0xa8 00265 invoke_syscall.constprop.0+0x74/0x1e8 00265 do_el0_svc+0xc8/0x1c8 00265 el0_svc+0x20/0x60 00265 el0t_64_sync_handler+0x104/0x130 00265 el0t_64_sync+0x154/0x158 00265 00265 Allocated by task 7510: 00265 kasan_save_stack+0x28/0x50 00265 kasan_save_track+0x1c/0x38 00265 kasan_save_alloc_info+0x3c/0x50 00265 __kasan_kmalloc+0xac/0xb0 00265 __kmalloc_node_noprof+0x168/0x348 00265 __kvmalloc_node_noprof+0x20/0x140 00265 __bch2_darray_resize_noprof+0x90/0x1b0 00265 __bch2_accounting_mem_insert+0x76c/0xb08 00265 bch2_accounting_mem_insert+0x224/0x3b8 00265 bch2_accounting_mem_mod_locked+0x480/0xc58 00265 bch2_accounting_read+0xa94/0x3eb8 00265 bch2_run_recovery_pass+0x80/0x178 00265 bch2_run_recovery_passes+0x340/0x698 00265 bch2_fs_recovery+0x1c98/0x2bd8 00265 bch2_fs_start+0x240/0x490 00265 bch2_fs_get_tree+0xe1c/0x1458 00265 vfs_get_tree+0x7c/0x250 00265 path_mount+0xe24/0x1648 00265 __arm64_sys_mount+0x240/0x438 00265 invoke_syscall.constprop.0+0x74/0x1e8 00265 do_el0_svc+0xc8/0x1c8 00265 el0_svc+0x20/0x60 00265 el0t_64_sync_handler+0x104/0x130 00265 el0t_64_sync+0x154/0x158 00265 00265 Freed by task 7510: 00265 kasan_save_stack+0x28/0x50 00265 kasan_save_track+0x1c/0x38 00265 kasan_save_free_info+0x48/0x88 00265 __kasan_slab_free+0x48/0x60 00265 kfree+0x188/0x408 00265 kvfree+0x3c/0x50 00265 __bch2_darray_resize_noprof+0xe0/0x1b0 00265 __bch2_accounting_mem_insert+0x76c/0xb08 00265 bch2_accounting_mem_insert+0x224/0x3b8 00265 bch2_accounting_mem_mod_locked+0x480/0xc58 00265 bch2_accounting_read+0xa94/0x3eb8 00265 bch2_run_recovery_pass+0x80/0x178 00265 bch2_run_recovery_passes+0x340/0x698 00265 bch2_fs_recovery+0x1c98/0x2bd8 00265 bch2_fs_start+0x240/0x490 00265 bch2_fs_get_tree+0xe1c/0x1458 00265 vfs_get_tree+0x7c/0x250 00265 path_mount+0xe24/0x1648 00265 bcachefs (vdc): going read-write 00265 __arm64_sys_mount+0x240/0x438 00265 invoke_syscall.constprop.0+0x74/0x1e8 00265 do_el0_svc+0xc8/0x1c8 00265 el0_svc+0x20/0x60 
00265 el0t_64_sync_handler+0x104/0x130 00265 el0t_64_sync+0x154/0x158 00265 00265 The buggy address belongs to the object at ffffff80c57eac00 00265 which belongs to the cache kmalloc-128 of size 128 00265 The buggy address is located 0 bytes inside of 00265 freed 128-byte region [ffffff80c57eac00, ffffff80c57eac80) 00265 00265 The buggy address belongs to the physical page: 00265 page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1057ea 00265 head: order:1 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 00265 flags: 0x8000000000000040(head|zone=2) 00265 page_type: f5(slab) 00265 raw: 8000000000000040 ffffff80c0002800 dead000000000100 dead000000000122 00265 raw: 0000000000000000 0000000000200020 00000001f5000000 ffffff80c57a6400 00265 head: 8000000000000040 ffffff80c0002800 dead000000000100 dead000000000122 00265 head: 0000000000000000 0000000000200020 00000001f5000000 ffffff80c57a6400 00265 head: 8000000000000001 fffffffec315fa81 ffffffffffffffff 0000000000000000 00265 head: 0000000000000002 0000000000000000 00000000ffffffff 0000000000000000 00265 page dumped because: kasan: bad access detected 00265 00265 Memory state around the buggy address: 00265 ffffff80c57eab00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00265 ffffff80c57eab80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc 00265 >ffffff80c57eac00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb 00265 ^ 00265 ffffff80c57eac80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc 00265 ffffff80c57ead00: 00 00 00 00 00 00 00 00 00 00 00 00 00 fc fc fc 00265 ================================================================== 00265 Kernel panic - not syncing: kasan.fault=panic set ... 00265 CPU: 6 UID: 0 PID: 7531 Comm: cat Not tainted 6.13.0-rc3-ktest-g16fc6fa3819d #14103 00265 Hardware name: linux,dummy-virt (DT) 00265 Call trace: 00265 show_stack+0x1c/0x30 (C) 00265 dump_stack_lvl+0x30/0x80 00265 dump_stack+0x18/0x20 00265 panic+0x4d4/0x518 00265 start_report.constprop.0+0x0/0x90 00265 kasan_report+0xa0/0xd0 00265 __asan_report_load4_noabort+0x1c/0x28 00265 bch2_fs_to_text+0x12b4/0x1728 00265 bch2_fs_show+0x94/0x188 00265 sysfs_kf_seq_show+0x1a4/0x348 00265 kernfs_seq_show+0x12c/0x198 00265 seq_read_iter+0x27c/0xfd0 00265 kernfs_fop_read_iter+0x390/0x4f8 00265 vfs_read+0x480/0x7f0 00265 ksys_read+0xe0/0x1e8 00265 __arm64_sys_read+0x70/0xa8 00265 invoke_syscall.constprop.0+0x74/0x1e8 00265 do_el0_svc+0xc8/0x1c8 00265 el0_svc+0x20/0x60 00265 el0t_64_sync_handler+0x104/0x130 00265 el0t_64_sync+0x154/0x158 00265 SMP: stopping secondary CPUs 00265 Kernel Offset: disabled 00265 CPU features: 0x000,00000070,00000010,8240500b 00265 Memory Limit: none 00265 ---[ end Kernel panic - not syncing: kasan.fault=panic set ... 
]--- 00270 ========= FAILED TIMEOUT generic.187 in 1200s Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 5360cbb3ec298..f4372cafea2e9 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -210,11 +210,13 @@ static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem * static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p, u64 *v, unsigned nr) { + percpu_down_read(&c->mark_lock); struct bch_accounting_mem *acc = &c->accounting; unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &p); bch2_accounting_mem_read_counters(acc, idx, v, nr, false); + percpu_up_read(&c->mark_lock); } static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset) -- GitLab From 1e690efa72596a1163dc56709707f459221889d2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 11:34:59 -0500 Subject: [PATCH 441/989] bcachefs: Split out journal pins by btree level This lets us flush the journal to go read-only more effectively. Flushing the journal and going read-only requires halting mutually recursive processes, which strictly speaking are not guaranteed to terminate. Flushing btree node journal pins will kick off a btree node write, and btree node writes on completion must do another btree update to the parent node to update the 'sectors_written' field for that node's key. If the parent node is full and requires a split or compaction, that's going to generate a whole bunch of additional btree updates - alloc info, LRU btree, and more - which then have to be flushed, and the cycle repeats. This process will terminate much more effectively if we tweak journal reclaim to flush btree updates leaf to root: i.e., don't flush updates for a given btree node (kicking off a write, and consuming space within that node up to the next block boundary) if there might still be unflushed updates in child nodes. 
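As a rough illustration of the ordering this aims for (a simplified sketch with made-up names, not the bcachefs code itself; the actual implementation is in the diff below), journal pins for btree nodes can be bucketed by the level of the node they belong to, and reclaim then walks the buckets so that leaves are flushed before their parents:

/* Sketch only, not kernel code: bucket journal pins by the btree level of
 * their node (level 0 = leaf) and flush leaves before interior nodes, so
 * flushing a parent cannot create fresh unflushed work in its children.
 */
enum pin_bucket { PIN_LEVEL3, PIN_LEVEL2, PIN_LEVEL1, PIN_LEVEL0, PIN_OTHER, PIN_NR };

static enum pin_bucket pin_bucket_for_level(unsigned int level)
{
	/* anything deeper than three levels shares the root-most bucket */
	return level >= 3 ? PIN_LEVEL3 : (enum pin_bucket)(PIN_LEVEL0 - level);
}

static void flush_bucket(enum pin_bucket bucket)
{
	/* stand-in for flushing every pin currently filed under @bucket */
}

static void flush_all_pins(void)
{
	/* highest enum value first: misc pins, then leaves, then parents up to the root */
	for (int bucket = PIN_NR - 1; bucket >= 0; bucket--)
		flush_bucket((enum pin_bucket)bucket);
}

In the patch itself this is what the per-level JOURNAL_PIN_TYPE_btree0..btree3 values and the reversed flush loop accomplish.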
Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 37 +++++++++++++++++------------------ fs/bcachefs/journal_types.h | 5 ++++- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 6a9cefb635d63..d373cd181a7f5 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -384,12 +384,16 @@ void bch2_journal_pin_drop(struct journal *j, spin_unlock(&j->lock); } -static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) +static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin, + journal_pin_flush_fn fn) { if (fn == bch2_btree_node_flush0 || - fn == bch2_btree_node_flush1) - return JOURNAL_PIN_TYPE_btree; - else if (fn == bch2_btree_key_cache_journal_flush) + fn == bch2_btree_node_flush1) { + unsigned idx = fn == bch2_btree_node_flush1; + struct btree *b = container_of(pin, struct btree, writes[idx].journal); + + return JOURNAL_PIN_TYPE_btree0 - b->c.level; + } else if (fn == bch2_btree_key_cache_journal_flush) return JOURNAL_PIN_TYPE_key_cache; else return JOURNAL_PIN_TYPE_other; @@ -441,7 +445,7 @@ void bch2_journal_pin_copy(struct journal *j, bool reclaim = __journal_pin_drop(j, dst); - bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn)); + bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); @@ -465,7 +469,7 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, bool reclaim = __journal_pin_drop(j, pin); - bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn)); + bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); @@ -587,7 +591,7 @@ static size_t journal_flush_pins(struct journal *j, spin_lock(&j->lock); /* Pin might have been dropped or rearmed: */ if (likely(!err && !j->flush_in_progress_dropped)) - list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(flush_fn)]); + list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]); j->flush_in_progress = NULL; j->flush_in_progress_dropped = false; spin_unlock(&j->lock); @@ -869,18 +873,13 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, mutex_lock(&j->reclaim_lock); - if (journal_flush_pins_or_still_flushing(j, seq_to_flush, - BIT(JOURNAL_PIN_TYPE_key_cache)| - BIT(JOURNAL_PIN_TYPE_other))) { - *did_work = true; - goto unlock; - } - - if (journal_flush_pins_or_still_flushing(j, seq_to_flush, - BIT(JOURNAL_PIN_TYPE_btree))) { - *did_work = true; - goto unlock; - } + for (int type = JOURNAL_PIN_TYPE_NR - 1; + type >= 0; + --type) + if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) { + *did_work = true; + goto unlock; + } if (seq_to_flush > journal_cur_seq(j)) bch2_journal_entry_close(j); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index a198a81d74784..1ef3a28ed6ab3 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -53,7 +53,10 @@ struct journal_buf { */ enum journal_pin_type { - JOURNAL_PIN_TYPE_btree, + JOURNAL_PIN_TYPE_btree3, + JOURNAL_PIN_TYPE_btree2, + JOURNAL_PIN_TYPE_btree1, + JOURNAL_PIN_TYPE_btree0, JOURNAL_PIN_TYPE_key_cache, JOURNAL_PIN_TYPE_other, JOURNAL_PIN_TYPE_NR, -- GitLab From 9f734cd076931fa4d7feb5728e5cd95cde0af114 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 17:46:36 -0500 Subject: [PATCH 442/989] bcachefs: Fix 
want_new_bset() so we write until the end of the btree node want_new_bset() returns the address of a new bset to initialize if we wish to do so in a btree node - either because the previous one is too big, or because it's been written. The case for 'previous bset was written' was wrong: it's only supposed to check for if we have space in the node for one more block, but because it subtracted the header from the space available it would never initialize a new bset if we were down to the last block in a node. Fixing this results in fewer btree node splits/compactions, which fixes a bug with flushing the journal to go read-only sometimes not terminating or taking excessively long. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 7930ffea3075d..26d646e1275c0 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -278,12 +278,12 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct bt { struct bset_tree *t = bset_tree_last(b); struct btree_node_entry *bne = max(write_block(b), - (void *) btree_bkey_last(b, bset_tree_last(b))); + (void *) btree_bkey_last(b, t)); ssize_t remaining_space = __bch2_btree_u64s_remaining(b, bne->keys.start); if (unlikely(bset_written(b, bset(b, t)))) { - if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) + if (b->written + block_sectors(c) <= btree_sectors(c)) return bne; } else { if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) && -- GitLab From 6aa8a63c471eb6756aabd03f880feffe6a7af6c9 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 11 Feb 2025 15:45:16 +0100 Subject: [PATCH 443/989] USB: serial: option: drop MeiG Smart defines Several MeiG Smart modems apparently use the same product id, making the defines even less useful. Drop them in favour of using comments consistently to make the id table slightly less unwieldy. Cc: stable@vger.kernel.org Acked-by: Chester A. Unal Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 4a59a40f750a6..58bd54e8c483a 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -619,18 +619,6 @@ static void option_instat_callback(struct urb *urb); /* Luat Air72*U series based on UNISOC UIS8910 uses UNISOC's vendor ID */ #define LUAT_PRODUCT_AIR720U 0x4e00 -/* MeiG Smart Technology products */ -#define MEIGSMART_VENDOR_ID 0x2dee -/* - * MeiG Smart SLM828, SRM815, and SRM825L use the same product ID. SLM828 is - * based on Qualcomm SDX12. SRM815 and SRM825L are based on Qualcomm 315. 
- */ -#define MEIGSMART_PRODUCT_SRM825L 0x4d22 -/* MeiG Smart SLM320 based on UNISOC UIS8910 */ -#define MEIGSMART_PRODUCT_SLM320 0x4d41 -/* MeiG Smart SLM770A based on ASR1803 */ -#define MEIGSMART_PRODUCT_SLM770A 0x4d57 - /* Device flags */ /* Highest interface number which can be used with NCTRL() and RSVD() */ @@ -2366,6 +2354,14 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0a05, 0xff) }, /* Fibocom FM650-CN (NCM mode) */ { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0a06, 0xff) }, /* Fibocom FM650-CN (RNDIS mode) */ { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0a07, 0xff) }, /* Fibocom FM650-CN (MBIM mode) */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d41, 0xff, 0, 0) }, /* MeiG Smart SLM320 */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d57, 0xff, 0, 0) }, /* MeiG Smart SLM770A */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d22, 0xff, 0, 0) }, /* MeiG Smart SRM815 */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d22, 0xff, 0x10, 0x02) }, /* MeiG Smart SLM828 */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d22, 0xff, 0x10, 0x03) }, /* MeiG Smart SLM828 */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d22, 0xff, 0xff, 0x30) }, /* MeiG Smart SRM815 and SRM825L */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d22, 0xff, 0xff, 0x40) }, /* MeiG Smart SRM825L */ + { USB_DEVICE_AND_INTERFACE_INFO(0x2dee, 0x4d22, 0xff, 0xff, 0x60) }, /* MeiG Smart SRM825L */ { USB_DEVICE_INTERFACE_CLASS(0x2df3, 0x9d03, 0xff) }, /* LongSung M5710 */ { USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1404, 0xff) }, /* GosunCn GM500 RNDIS */ { USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1405, 0xff) }, /* GosunCn GM500 MBIM */ @@ -2422,14 +2418,6 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, TOZED_PRODUCT_LT70C, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, LUAT_PRODUCT_AIR720U, 0xff, 0, 0) }, - { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SLM320, 0xff, 0, 0) }, - { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SLM770A, 0xff, 0, 0) }, - { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0, 0) }, /* MeiG Smart SRM815 */ - { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0x10, 0x02) }, /* MeiG Smart SLM828 */ - { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0x10, 0x03) }, /* MeiG Smart SLM828 */ - { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x30) }, /* MeiG Smart SRM815 and SRM825L */ - { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x40) }, /* MeiG Smart SRM825L */ - { USB_DEVICE_AND_INTERFACE_INFO(MEIGSMART_VENDOR_ID, MEIGSMART_PRODUCT_SRM825L, 0xff, 0xff, 0x60) }, /* MeiG Smart SRM825L */ { USB_DEVICE_INTERFACE_CLASS(0x1bbb, 0x0530, 0xff), /* TCL IK512 MBIM */ .driver_info = NCTRL(1) }, { USB_DEVICE_INTERFACE_CLASS(0x1bbb, 0x0640, 0xff), /* TCL IK512 ECM */ -- GitLab From 35e21de48e693af1dcfdbf2dc3d73dcfa3c8f2d9 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Tue, 11 Feb 2025 16:48:06 +0100 Subject: [PATCH 444/989] regulator: core: let dt properties override driver init_data This reverts commit cd7a38c40b231350a3cd0fd774f4e6bb68c4b411. When submitting the change above, it was thought that the origin of the init_data should be a clear choice, from the driver or from DT but not both. 
It turns out some devices, such as qcom-msm8974-lge-nexus5-hammerhead, relied on the old behaviour to override the init_data provided by the driver, making it some kind of default if none is provided by the platform. Using the init_data provided by the driver when it is present broke these devices so revert the change to fixup the situation and add a comment to make things a bit more clear Reported-by: Luca Weiss Closes: https://lore.kernel.org/lkml/5857103.DvuYhMxLoT@lucaweiss.eu Fixes: cd7a38c40b23 ("regulator: core: do not silently ignore provided init_data") Signed-off-by: Jerome Brunet Link: https://patch.msgid.link/20250211-regulator-init-data-fixup-v1-1-5ce1c6cff990@baylibre.com Signed-off-by: Mark Brown --- drivers/regulator/core.c | 61 ++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 89578b91c4680..4ddf0efead682 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -5774,43 +5774,36 @@ regulator_register(struct device *dev, goto clean; } - if (config->init_data) { - /* - * Providing of_match means the framework is expected to parse - * DT to get the init_data. This would conflict with provided - * init_data, if set. Warn if it happens. - */ - if (regulator_desc->of_match) - dev_warn(dev, "Using provided init data - OF match ignored\n"); + /* + * DT may override the config->init_data provided if the platform + * needs to do so. If so, config->init_data is completely ignored. + */ + init_data = regulator_of_get_init_data(dev, regulator_desc, config, + &rdev->dev.of_node); + /* + * Sometimes not all resources are probed already so we need to take + * that into account. This happens most the time if the ena_gpiod comes + * from a gpio extender or something else. + */ + if (PTR_ERR(init_data) == -EPROBE_DEFER) { + ret = -EPROBE_DEFER; + goto clean; + } + + /* + * We need to keep track of any GPIO descriptor coming from the + * device tree until we have handled it over to the core. If the + * config that was passed in to this function DOES NOT contain + * a descriptor, and the config after this call DOES contain + * a descriptor, we definitely got one from parsing the device + * tree. + */ + if (!cfg->ena_gpiod && config->ena_gpiod) + dangling_of_gpiod = true; + if (!init_data) { init_data = config->init_data; rdev->dev.of_node = of_node_get(config->of_node); - - } else { - init_data = regulator_of_get_init_data(dev, regulator_desc, - config, - &rdev->dev.of_node); - - /* - * Sometimes not all resources are probed already so we need to - * take that into account. This happens most the time if the - * ena_gpiod comes from a gpio extender or something else. - */ - if (PTR_ERR(init_data) == -EPROBE_DEFER) { - ret = -EPROBE_DEFER; - goto clean; - } - - /* - * We need to keep track of any GPIO descriptor coming from the - * device tree until we have handled it over to the core. If the - * config that was passed in to this function DOES NOT contain a - * descriptor, and the config after this call DOES contain a - * descriptor, we definitely got one from parsing the device - * tree. 
- */ - if (!cfg->ena_gpiod && config->ena_gpiod) - dangling_of_gpiod = true; } ww_mutex_init(&rdev->mutex, ®ulator_ww_class); -- GitLab From 69ab25a74e2df53edc2de4acfce0a484bdb88155 Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Fri, 10 Jan 2025 16:29:22 -0800 Subject: [PATCH 445/989] idpf: fix handling rsc packet with a single segment Handle rsc packet with a single segment same as a multi segment rsc packet so that CHECKSUM_PARTIAL is set in the skb->ip_summed field. The current code is passing CHECKSUM_NONE resulting in TCP GRO layer doing checksum in SW and hiding the issue. This will fail when using dmabufs as payload buffers as skb frag would be unreadable. Fixes: 3a8845af66ed ("idpf: add RX splitq napi poll support") Signed-off-by: Sridhar Samudrala Reviewed-by: Przemek Kitszel Tested-by: Samuel Salin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 2fa9c36e33c9c..c9fcf8f4d7363 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -3008,8 +3008,6 @@ static int idpf_rx_rsc(struct idpf_rx_queue *rxq, struct sk_buff *skb, return -EINVAL; rsc_segments = DIV_ROUND_UP(skb->data_len, rsc_seg_len); - if (unlikely(rsc_segments == 1)) - return 0; NAPI_GRO_CB(skb)->count = rsc_segments; skb_shinfo(skb)->gso_size = rsc_seg_len; -- GitLab From 2ff66c2f9ea4e9311e9a00004348b6c465bd5d3b Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Fri, 10 Jan 2025 16:29:58 -0800 Subject: [PATCH 446/989] idpf: record rx queue in skb for RSC packets Move the call to skb_record_rx_queue in idpf_rx_process_skb_fields() so that RX queue is recorded for RSC packets too. Fixes: 90912f9f4f2d ("idpf: convert header split mode to libeth + napi_build_skb()") Signed-off-by: Sridhar Samudrala Reviewed-by: Madhu Chittim Tested-by: Samuel Salin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index c9fcf8f4d7363..9be6a6b59c4e1 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -3070,6 +3070,7 @@ idpf_rx_process_skb_fields(struct idpf_rx_queue *rxq, struct sk_buff *skb, idpf_rx_hash(rxq, skb, rx_desc, decoded); skb->protocol = eth_type_trans(skb, rxq->netdev); + skb_record_rx_queue(skb, rxq->idx); if (le16_get_bits(rx_desc->hdrlen_flags, VIRTCHNL2_RX_FLEX_DESC_ADV_RSC_M)) @@ -3078,8 +3079,6 @@ idpf_rx_process_skb_fields(struct idpf_rx_queue *rxq, struct sk_buff *skb, csum_bits = idpf_rx_splitq_extract_csum_bits(rx_desc); idpf_rx_csum(rxq, skb, csum_bits, decoded); - skb_record_rx_queue(skb, rxq->idx); - return 0; } -- GitLab From 52c11d31b5a1d1c747bb5f36cc4808e93e2348f4 Mon Sep 17 00:00:00 2001 From: Joshua Hay Date: Tue, 4 Feb 2025 18:08:11 -0800 Subject: [PATCH 447/989] idpf: call set_real_num_queues in idpf_open On initial driver load, alloc_etherdev_mqs is called with whatever max queue values are provided by the control plane. However, if the driver is loaded on a system where num_online_cpus() returns less than the max queues, the netdev will think there are more queues than are actually available. Only num_online_cpus() will be allocated, but skb_get_queue_mapping(skb) could possibly return an index beyond the range of allocated queues. 
Consequently, the packet is silently dropped and it appears as if TX is broken. Set the real number of queues during open so the netdev knows how many queues will be allocated. Fixes: 1c325aac10a8 ("idpf: configure resources for TX queues") Signed-off-by: Joshua Hay Reviewed-by: Madhu Chittim Tested-by: Samuel Salin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_lib.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c index b4fbb99bfad20..a3d6b8f198a86 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_lib.c +++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c @@ -2159,8 +2159,13 @@ static int idpf_open(struct net_device *netdev) idpf_vport_ctrl_lock(netdev); vport = idpf_netdev_to_vport(netdev); + err = idpf_set_real_num_queues(vport); + if (err) + goto unlock; + err = idpf_vport_open(vport); +unlock: idpf_vport_ctrl_unlock(netdev); return err; -- GitLab From 61fb097f9a644407b9342a8169d0edef868612d7 Mon Sep 17 00:00:00 2001 From: Piotr Kwapulinski Date: Fri, 31 Jan 2025 13:14:50 +0100 Subject: [PATCH 448/989] ixgbe: Fix possible skb NULL pointer dereference The commit c824125cbb18 ("ixgbe: Fix passing 0 to ERR_PTR in ixgbe_run_xdp()") stopped utilizing the ERR-like macros for xdp status encoding. Propagate this logic to the ixgbe_put_rx_buffer(). The commit also relaxed the skb NULL pointer check - caught by Smatch. Restore this check. Fixes: c824125cbb18 ("ixgbe: Fix passing 0 to ERR_PTR in ixgbe_run_xdp()") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/intel-wired-lan/2c7d6c31-192a-4047-bd90-9566d0e14cc0@stanley.mountain/ Acked-by: Maciej Fijalkowski Signed-off-by: Piotr Kwapulinski Reviewed-by: Simon Horman Tested-by: Saritha Sanigani (A Contingent Worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 7236f20c9a309..467f81239e12f 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2105,7 +2105,7 @@ static void ixgbe_put_rx_buffer(struct ixgbe_ring *rx_ring, /* hand second half of page back to the ring */ ixgbe_reuse_rx_page(rx_ring, rx_buffer); } else { - if (!IS_ERR(skb) && IXGBE_CB(skb)->dma == rx_buffer->dma) { + if (skb && IXGBE_CB(skb)->dma == rx_buffer->dma) { /* the page has been released from the ring */ IXGBE_CB(skb)->page_released = true; } else { -- GitLab From 7822dd4d6d4bebca5045a395e1784ef09cae2d43 Mon Sep 17 00:00:00 2001 From: Zdenek Bouska Date: Tue, 28 Jan 2025 13:26:48 +0100 Subject: [PATCH 449/989] igc: Fix HW RX timestamp when passed by ZC XDP Fixes HW RX timestamp in the following scenario: - AF_PACKET socket with enabled HW RX timestamps is created - AF_XDP socket with enabled zero copy is created - frame is forwarded to the BPF program, where the timestamp should still be readable (extracted by igc_xdp_rx_timestamp(), kfunc behind bpf_xdp_metadata_rx_timestamp()) - the frame got XDP_PASS from BPF program, redirecting to the stack - AF_PACKET socket receives the frame with HW RX timestamp Moves the skb timestamp setting from igc_dispatch_skb_zc() to igc_construct_skb_zc() so that igc_construct_skb_zc() is similar to igc_construct_skb(). 
This issue can also be reproduced by running: # tools/testing/selftests/bpf/xdp_hw_metadata enp1s0 When a frame with the wrong port 9092 (instead of 9091) is used: # echo -n xdp | nc -u -q1 192.168.10.9 9092 then the RX timestamp is missing and xdp_hw_metadata prints: skb hwtstamp is not found! With this fix or when copy mode is used: # tools/testing/selftests/bpf/xdp_hw_metadata -c enp1s0 then RX timestamp is found and xdp_hw_metadata prints: found skb hwtstamp = 1736509937.852786132 Fixes: 069b142f5819 ("igc: Add support for PTP .getcyclesx64()") Signed-off-by: Zdenek Bouska Acked-by: Vinicius Costa Gomes Reviewed-by: Simon Horman Reviewed-by: Florian Bezdeka Reviewed-by: Song Yoong Siang Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_main.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 56a35d58e7a62..21f318f12a8d6 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2701,8 +2701,9 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget) } static struct sk_buff *igc_construct_skb_zc(struct igc_ring *ring, - struct xdp_buff *xdp) + struct igc_xdp_buff *ctx) { + struct xdp_buff *xdp = &ctx->xdp; unsigned int totalsize = xdp->data_end - xdp->data_meta; unsigned int metasize = xdp->data - xdp->data_meta; struct sk_buff *skb; @@ -2721,27 +2722,28 @@ static struct sk_buff *igc_construct_skb_zc(struct igc_ring *ring, __skb_pull(skb, metasize); } + if (ctx->rx_ts) { + skb_shinfo(skb)->tx_flags |= SKBTX_HW_TSTAMP_NETDEV; + skb_hwtstamps(skb)->netdev_data = ctx->rx_ts; + } + return skb; } static void igc_dispatch_skb_zc(struct igc_q_vector *q_vector, union igc_adv_rx_desc *desc, - struct xdp_buff *xdp, - ktime_t timestamp) + struct igc_xdp_buff *ctx) { struct igc_ring *ring = q_vector->rx.ring; struct sk_buff *skb; - skb = igc_construct_skb_zc(ring, xdp); + skb = igc_construct_skb_zc(ring, ctx); if (!skb) { ring->rx_stats.alloc_failed++; set_bit(IGC_RING_FLAG_RX_ALLOC_FAILED, &ring->flags); return; } - if (timestamp) - skb_hwtstamps(skb)->hwtstamp = timestamp; - if (igc_cleanup_headers(ring, desc, skb)) return; @@ -2777,7 +2779,6 @@ static int igc_clean_rx_irq_zc(struct igc_q_vector *q_vector, const int budget) union igc_adv_rx_desc *desc; struct igc_rx_buffer *bi; struct igc_xdp_buff *ctx; - ktime_t timestamp = 0; unsigned int size; int res; @@ -2807,6 +2808,8 @@ static int igc_clean_rx_irq_zc(struct igc_q_vector *q_vector, const int budget) */ bi->xdp->data_meta += IGC_TS_HDR_LEN; size -= IGC_TS_HDR_LEN; + } else { + ctx->rx_ts = NULL; } bi->xdp->data_end = bi->xdp->data + size; @@ -2815,7 +2818,7 @@ static int igc_clean_rx_irq_zc(struct igc_q_vector *q_vector, const int budget) res = __igc_xdp_run_prog(adapter, prog, bi->xdp); switch (res) { case IGC_XDP_PASS: - igc_dispatch_skb_zc(q_vector, desc, bi->xdp, timestamp); + igc_dispatch_skb_zc(q_vector, desc, ctx); fallthrough; case IGC_XDP_CONSUMED: xsk_buff_free(bi->xdp); -- GitLab From 63f20f00d23d569e4e67859b4e8dcc9de79221cb Mon Sep 17 00:00:00 2001 From: Song Yoong Siang Date: Wed, 5 Feb 2025 10:36:03 +0800 Subject: [PATCH 450/989] igc: Set buffer type for empty frames in igc_init_empty_frame Set the buffer type to IGC_TX_BUFFER_TYPE_SKB for empty frame in the igc_init_empty_frame function. This ensures that the buffer type is correctly identified and handled during Tx ring cleanup. 
Fixes: db0b124f02ba ("igc: Enhance Qbv scheduling by using first flag bit") Cc: stable@vger.kernel.org # 6.2+ Signed-off-by: Song Yoong Siang Acked-by: Maciej Fijalkowski Reviewed-by: Simon Horman Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 21f318f12a8d6..84307bb7313e0 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -1096,6 +1096,7 @@ static int igc_init_empty_frame(struct igc_ring *ring, return -ENOMEM; } + buffer->type = IGC_TX_BUFFER_TYPE_SKB; buffer->skb = skb; buffer->protocol = 0; buffer->bytecount = skb->len; -- GitLab From fc22b06fbd2afefa1eddff69a6fd30c539cef577 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Tue, 11 Feb 2025 09:28:39 +0200 Subject: [PATCH 451/989] platform/x86: int3472: Use correct type for "polarity", call it gpio_flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Struct gpiod_lookup flags field's type is unsigned long. Thus use unsigned long for values to be assigned to that field. Similarly, also call the field gpio_flags which it really is. Signed-off-by: Sakari Ailus Reviewed-by: Hans de Goede Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250211072841.7713-2-sakari.ailus@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/int3472/discrete.c | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/drivers/platform/x86/intel/int3472/discrete.c b/drivers/platform/x86/intel/int3472/discrete.c index 31015ebe20d89..b891b064fbf7e 100644 --- a/drivers/platform/x86/intel/int3472/discrete.c +++ b/drivers/platform/x86/intel/int3472/discrete.c @@ -55,7 +55,7 @@ static void skl_int3472_log_sensor_module_name(struct int3472_discrete_device *i static int skl_int3472_fill_gpiod_lookup(struct gpiod_lookup *table_entry, struct acpi_resource_gpio *agpio, - const char *func, u32 polarity) + const char *func, unsigned long gpio_flags) { char *path = agpio->resource_source.string_ptr; struct acpi_device *adev; @@ -70,14 +70,14 @@ static int skl_int3472_fill_gpiod_lookup(struct gpiod_lookup *table_entry, if (!adev) return -ENODEV; - *table_entry = GPIO_LOOKUP(acpi_dev_name(adev), agpio->pin_table[0], func, polarity); + *table_entry = GPIO_LOOKUP(acpi_dev_name(adev), agpio->pin_table[0], func, gpio_flags); return 0; } static int skl_int3472_map_gpio_to_sensor(struct int3472_discrete_device *int3472, struct acpi_resource_gpio *agpio, - const char *func, u32 polarity) + const char *func, unsigned long gpio_flags) { int ret; @@ -87,7 +87,7 @@ static int skl_int3472_map_gpio_to_sensor(struct int3472_discrete_device *int347 } ret = skl_int3472_fill_gpiod_lookup(&int3472->gpios.table[int3472->n_sensor_gpios], - agpio, func, polarity); + agpio, func, gpio_flags); if (ret) return ret; @@ -100,7 +100,7 @@ static int skl_int3472_map_gpio_to_sensor(struct int3472_discrete_device *int347 static struct gpio_desc * skl_int3472_gpiod_get_from_temp_lookup(struct int3472_discrete_device *int3472, struct acpi_resource_gpio *agpio, - const char *func, u32 polarity) + const char *func, unsigned long gpio_flags) { struct gpio_desc *desc; int ret; @@ -111,7 +111,7 @@ skl_int3472_gpiod_get_from_temp_lookup(struct int3472_discrete_device *int3472, return ERR_PTR(-ENOMEM); lookup->dev_id = dev_name(int3472->dev); 
- ret = skl_int3472_fill_gpiod_lookup(&lookup->table[0], agpio, func, polarity); + ret = skl_int3472_fill_gpiod_lookup(&lookup->table[0], agpio, func, gpio_flags); if (ret) return ERR_PTR(ret); @@ -122,32 +122,33 @@ skl_int3472_gpiod_get_from_temp_lookup(struct int3472_discrete_device *int3472, return desc; } -static void int3472_get_func_and_polarity(u8 type, const char **func, u32 *polarity) +static void int3472_get_func_and_polarity(u8 type, const char **func, + unsigned long *gpio_flags) { switch (type) { case INT3472_GPIO_TYPE_RESET: *func = "reset"; - *polarity = GPIO_ACTIVE_LOW; + *gpio_flags = GPIO_ACTIVE_LOW; break; case INT3472_GPIO_TYPE_POWERDOWN: *func = "powerdown"; - *polarity = GPIO_ACTIVE_LOW; + *gpio_flags = GPIO_ACTIVE_LOW; break; case INT3472_GPIO_TYPE_CLK_ENABLE: *func = "clk-enable"; - *polarity = GPIO_ACTIVE_HIGH; + *gpio_flags = GPIO_ACTIVE_HIGH; break; case INT3472_GPIO_TYPE_PRIVACY_LED: *func = "privacy-led"; - *polarity = GPIO_ACTIVE_HIGH; + *gpio_flags = GPIO_ACTIVE_HIGH; break; case INT3472_GPIO_TYPE_POWER_ENABLE: *func = "power-enable"; - *polarity = GPIO_ACTIVE_HIGH; + *gpio_flags = GPIO_ACTIVE_HIGH; break; default: *func = "unknown"; - *polarity = GPIO_ACTIVE_HIGH; + *gpio_flags = GPIO_ACTIVE_HIGH; break; } } @@ -194,7 +195,7 @@ static int skl_int3472_handle_gpio_resources(struct acpi_resource *ares, struct gpio_desc *gpio; const char *err_msg; const char *func; - u32 polarity; + unsigned long gpio_flags; int ret; if (!acpi_gpio_get_io_resource(ares, &agpio)) @@ -217,7 +218,7 @@ static int skl_int3472_handle_gpio_resources(struct acpi_resource *ares, type = FIELD_GET(INT3472_GPIO_DSM_TYPE, obj->integer.value); - int3472_get_func_and_polarity(type, &func, &polarity); + int3472_get_func_and_polarity(type, &func, &gpio_flags); pin = FIELD_GET(INT3472_GPIO_DSM_PIN, obj->integer.value); /* Pin field is not really used under Windows and wraps around at 8 bits */ @@ -227,16 +228,16 @@ static int skl_int3472_handle_gpio_resources(struct acpi_resource *ares, active_value = FIELD_GET(INT3472_GPIO_DSM_SENSOR_ON_VAL, obj->integer.value); if (!active_value) - polarity ^= GPIO_ACTIVE_LOW; + gpio_flags ^= GPIO_ACTIVE_LOW; dev_dbg(int3472->dev, "%s %s pin %d active-%s\n", func, agpio->resource_source.string_ptr, agpio->pin_table[0], - str_high_low(polarity == GPIO_ACTIVE_HIGH)); + str_high_low(gpio_flags == GPIO_ACTIVE_HIGH)); switch (type) { case INT3472_GPIO_TYPE_RESET: case INT3472_GPIO_TYPE_POWERDOWN: - ret = skl_int3472_map_gpio_to_sensor(int3472, agpio, func, polarity); + ret = skl_int3472_map_gpio_to_sensor(int3472, agpio, func, gpio_flags); if (ret) err_msg = "Failed to map GPIO pin to sensor\n"; @@ -244,7 +245,7 @@ static int skl_int3472_handle_gpio_resources(struct acpi_resource *ares, case INT3472_GPIO_TYPE_CLK_ENABLE: case INT3472_GPIO_TYPE_PRIVACY_LED: case INT3472_GPIO_TYPE_POWER_ENABLE: - gpio = skl_int3472_gpiod_get_from_temp_lookup(int3472, agpio, func, polarity); + gpio = skl_int3472_gpiod_get_from_temp_lookup(int3472, agpio, func, gpio_flags); if (IS_ERR(gpio)) { ret = PTR_ERR(gpio); err_msg = "Failed to get GPIO\n"; -- GitLab From 569617dbbd06286fb73f3f1c2ac91e51d863c7de Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Tue, 11 Feb 2025 09:28:40 +0200 Subject: [PATCH 452/989] platform/x86: int3472: Call "reset" GPIO "enable" for INT347E MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The DT bindings for ov7251 specify "enable" GPIO (xshutdown in documentation) but the int3472 indiscriminately provides this as a 
"reset" GPIO to sensor drivers. Take this into account by assigning it as "enable" with active high polarity for INT347E devices, i.e. ov7251. "reset" with active low polarity remains the default GPIO name for other devices. Signed-off-by: Sakari Ailus Reviewed-by: Hans de Goede Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250211072841.7713-3-sakari.ailus@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/int3472/discrete.c | 52 +++++++++++++++++-- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/intel/int3472/discrete.c b/drivers/platform/x86/intel/int3472/discrete.c index b891b064fbf7e..092252eb95a87 100644 --- a/drivers/platform/x86/intel/int3472/discrete.c +++ b/drivers/platform/x86/intel/int3472/discrete.c @@ -2,6 +2,7 @@ /* Author: Dan Scally */ #include +#include #include #include #include @@ -122,10 +123,53 @@ skl_int3472_gpiod_get_from_temp_lookup(struct int3472_discrete_device *int3472, return desc; } -static void int3472_get_func_and_polarity(u8 type, const char **func, - unsigned long *gpio_flags) +/** + * struct int3472_gpio_map - Map GPIOs to whatever is expected by the + * sensor driver (as in DT bindings) + * @hid: The ACPI HID of the device without the instance number e.g. INT347E + * @type_from: The GPIO type from ACPI ?SDT + * @type_to: The assigned GPIO type, typically same as @type_from + * @func: The function, e.g. "enable" + * @polarity_low: GPIO_ACTIVE_LOW true if the @polarity_low is true, + * GPIO_ACTIVE_HIGH otherwise + */ +struct int3472_gpio_map { + const char *hid; + u8 type_from; + u8 type_to; + bool polarity_low; + const char *func; +}; + +static const struct int3472_gpio_map int3472_gpio_map[] = { + { "INT347E", INT3472_GPIO_TYPE_RESET, INT3472_GPIO_TYPE_RESET, false, "enable" }, +}; + +static void int3472_get_func_and_polarity(struct acpi_device *adev, u8 *type, + const char **func, unsigned long *gpio_flags) { - switch (type) { + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(int3472_gpio_map); i++) { + /* + * Map the firmware-provided GPIO to whatever a driver expects + * (as in DT bindings). First check if the type matches with the + * GPIO map, then further check that the device _HID matches. + */ + if (*type != int3472_gpio_map[i].type_from) + continue; + + if (!acpi_dev_hid_uid_match(adev, int3472_gpio_map[i].hid, NULL)) + continue; + + *type = int3472_gpio_map[i].type_to; + *gpio_flags = int3472_gpio_map[i].polarity_low ? 
+ GPIO_ACTIVE_LOW : GPIO_ACTIVE_HIGH; + *func = int3472_gpio_map[i].func; + return; + } + + switch (*type) { case INT3472_GPIO_TYPE_RESET: *func = "reset"; *gpio_flags = GPIO_ACTIVE_LOW; @@ -218,7 +262,7 @@ static int skl_int3472_handle_gpio_resources(struct acpi_resource *ares, type = FIELD_GET(INT3472_GPIO_DSM_TYPE, obj->integer.value); - int3472_get_func_and_polarity(type, &func, &gpio_flags); + int3472_get_func_and_polarity(int3472->sensor, &type, &func, &gpio_flags); pin = FIELD_GET(INT3472_GPIO_DSM_PIN, obj->integer.value); /* Pin field is not really used under Windows and wraps around at 8 bits */ -- GitLab From 318e8c339c9a0891c389298bb328ed0762a9935e Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 5 Feb 2025 14:04:41 +0000 Subject: [PATCH 453/989] x86/cpu/kvm: SRSO: Fix possible missing IBPB on VM-Exit In [1] the meaning of the synthetic IBPB flags has been redefined for a better separation of concerns: - ENTRY_IBPB -- issue IBPB on entry only - IBPB_ON_VMEXIT -- issue IBPB on VM-Exit only and the Retbleed mitigations have been updated to match this new semantics. Commit [2] was merged shortly before [1], and their interaction was not handled properly. This resulted in IBPB not being triggered on VM-Exit in all SRSO mitigation configs requesting an IBPB there. Specifically, an IBPB on VM-Exit is triggered only when X86_FEATURE_IBPB_ON_VMEXIT is set. However: - X86_FEATURE_IBPB_ON_VMEXIT is not set for "spec_rstack_overflow=ibpb", because before [1] having X86_FEATURE_ENTRY_IBPB was enough. Hence, an IBPB is triggered on entry but the expected IBPB on VM-exit is not. - X86_FEATURE_IBPB_ON_VMEXIT is not set also when "spec_rstack_overflow=ibpb-vmexit" if X86_FEATURE_ENTRY_IBPB is already set. That's because before [1] this was effectively redundant. Hence, e.g. a "retbleed=ibpb spec_rstack_overflow=bpb-vmexit" config mistakenly reports the machine still vulnerable to SRSO, despite an IBPB being triggered both on entry and VM-Exit, because of the Retbleed selected mitigation config. - UNTRAIN_RET_VM won't still actually do anything unless CONFIG_MITIGATION_IBPB_ENTRY is set. For "spec_rstack_overflow=ibpb", enable IBPB on both entry and VM-Exit and clear X86_FEATURE_RSB_VMEXIT which is made superfluous by X86_FEATURE_IBPB_ON_VMEXIT. This effectively makes this mitigation option similar to the one for 'retbleed=ibpb', thus re-order the code for the RETBLEED_MITIGATION_IBPB option to be less confusing by having all features enabling before the disabling of the not needed ones. For "spec_rstack_overflow=ibpb-vmexit", guard this mitigation setting with CONFIG_MITIGATION_IBPB_ENTRY to ensure UNTRAIN_RET_VM sequence is effectively compiled in. Drop instead the CONFIG_MITIGATION_SRSO guard, since none of the SRSO compile cruft is required in this configuration. Also, check only that the required microcode is present to effectively enabled the IBPB on VM-Exit. Finally, update the KConfig description for CONFIG_MITIGATION_IBPB_ENTRY to list also all SRSO config settings enabled by this guard. 
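A condensed sketch of the intended outcome may help while reading the hunks below. This is not the actual srso_select_mitigation() control flow; the helper name and the vmexit_only parameter are invented for illustration, while the IS_ENABLED() guard, the setup_*_cpu_cap() calls and the X86_FEATURE_* flags are the ones the patch touches:

static void __init srso_ibpb_outcome_sketch(bool vmexit_only, bool has_microcode)
{
	if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) || !has_microcode)
		return;	/* neither ibpb nor ibpb-vmexit can be honoured */

	/* Both variants now issue an IBPB on VM-Exit... */
	setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
	/* ...which already invalidates the RSB, so RSB stuffing is redundant. */
	setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);

	/* spec_rstack_overflow=ibpb additionally issues an IBPB on kernel entry. */
	if (!vmexit_only)
		setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
}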
Fixes: 864bcaa38ee4 ("x86/cpu/kvm: Provide UNTRAIN_RET_VM") [1] Fixes: d893832d0e1e ("x86/srso: Add IBPB on VMEXIT") [2] Reported-by: Yosry Ahmed Signed-off-by: Patrick Bellasi Reviewed-by: Borislav Petkov (AMD) Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 3 ++- arch/x86/kernel/cpu/bugs.c | 21 ++++++++++++++------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 87198d957e2f1..be2c311f5118d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2599,7 +2599,8 @@ config MITIGATION_IBPB_ENTRY depends on CPU_SUP_AMD && X86_64 default y help - Compile the kernel with support for the retbleed=ibpb mitigation. + Compile the kernel with support for the retbleed=ibpb and + spec_rstack_overflow={ibpb,ibpb-vmexit} mitigations. config MITIGATION_IBRS_ENTRY bool "Enable IBRS on kernel entry" diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 5a505aa654899..a5d0998d76049 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1115,6 +1115,8 @@ static void __init retbleed_select_mitigation(void) case RETBLEED_MITIGATION_IBPB: setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); + mitigate_smt = true; /* * IBPB on entry already obviates the need for @@ -1124,9 +1126,6 @@ static void __init retbleed_select_mitigation(void) setup_clear_cpu_cap(X86_FEATURE_UNRET); setup_clear_cpu_cap(X86_FEATURE_RETHUNK); - setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); - mitigate_smt = true; - /* * There is no need for RSB filling: entry_ibpb() ensures * all predictions, including the RSB, are invalidated, @@ -2646,6 +2645,7 @@ static void __init srso_select_mitigation(void) if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { if (has_microcode) { setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); srso_mitigation = SRSO_MITIGATION_IBPB; /* @@ -2655,6 +2655,13 @@ static void __init srso_select_mitigation(void) */ setup_clear_cpu_cap(X86_FEATURE_UNRET); setup_clear_cpu_cap(X86_FEATURE_RETHUNK); + + /* + * There is no need for RSB filling: entry_ibpb() ensures + * all predictions, including the RSB, are invalidated, + * regardless of IBPB implementation. + */ + setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); } } else { pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); @@ -2663,8 +2670,8 @@ static void __init srso_select_mitigation(void) ibpb_on_vmexit: case SRSO_CMD_IBPB_ON_VMEXIT: - if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) { - if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) { + if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { + if (has_microcode) { setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; @@ -2676,8 +2683,8 @@ static void __init srso_select_mitigation(void) setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); } } else { - pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); - } + pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); + } break; default: break; -- GitLab From 8d1d1e8d3345b56d3d8a64f845962c71468cd776 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 30 Jan 2025 14:56:07 +0100 Subject: [PATCH 454/989] s390/configs: Remove CONFIG_LSM s390 defconfig does not have BPF LSM, resulting in systemd[1]: bpf-restrict-fs: BPF LSM hook not enabled in the kernel, BPF LSM not supported. with the respective kernels. 
The other architectures do not explicitly set it, and the default values have BPF in them, so just drop it. Reported-by: Marc Hartmayer Acked-by: Vasily Gorbik Signed-off-by: Ilya Leoshkevich Signed-off-by: Vasily Gorbik --- arch/s390/configs/debug_defconfig | 1 - arch/s390/configs/defconfig | 1 - arch/s390/configs/zfcpdump_defconfig | 1 - 3 files changed, 3 deletions(-) diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index d6beec5292a00..44f01a4bc810f 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -740,7 +740,6 @@ CONFIG_IMA=y CONFIG_IMA_DEFAULT_HASH_SHA256=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_APPRAISE=y -CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_CRYPTO_USER=m # CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 8cfbfb10bba8c..8bcd37edd3c97 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -725,7 +725,6 @@ CONFIG_IMA=y CONFIG_IMA_DEFAULT_HASH_SHA256=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_APPRAISE=y -CONFIG_LSM="yama,loadpin,safesetid,integrity,selinux,smack,tomoyo,apparmor" CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_CRYPTO_FIPS=y CONFIG_CRYPTO_USER=m diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index bcbaa069de96e..853b2326a171b 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -62,7 +62,6 @@ CONFIG_ZFCP=y # CONFIG_INOTIFY_USER is not set # CONFIG_MISC_FILESYSTEMS is not set # CONFIG_NETWORK_FILESYSTEMS is not set -CONFIG_LSM="yama,loadpin,safesetid,integrity" # CONFIG_ZLIB_DFLTCC is not set CONFIG_XZ_DEC_MICROLZMA=y CONFIG_PRINTK_TIME=y -- GitLab From 32ae4a2992529e2c7934e422035fad1d9b0f1fb5 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Fri, 31 Jan 2025 12:02:55 +0100 Subject: [PATCH 455/989] s390/cio: Fix CHPID "configure" attribute caching In some environments, the SCLP firmware interface used to query a CHPID's configured state is not supported. On these environments, rapidly reading the corresponding sysfs attribute produces inconsistent results: $ cat /sys/devices/css0/chp0.00/configure cat: /sys/devices/css0/chp0.00/configure: Operation not supported $ cat /sys/devices/css0/chp0.00/configure 3 This occurs for example when Linux is run as a KVM guest. The inconsistency is a result of CIO using cached results for generating the value of the "configure" attribute while failing to handle the situation where no data was returned by SCLP. Fix this by not updating the cache-expiration timestamp when SCLP returns no data. With the fix applied, the system response is consistent: $ cat /sys/devices/css0/chp0.00/configure cat: /sys/devices/css0/chp0.00/configure: Operation not supported $ cat /sys/devices/css0/chp0.00/configure cat: /sys/devices/css0/chp0.00/configure: Operation not supported Reviewed-by: Vineeth Vijayan Reviewed-by: Eric Farman Tested-by: Eric Farman Signed-off-by: Peter Oberparleiter Signed-off-by: Vasily Gorbik --- drivers/s390/cio/chp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c index 4a0b3f19bd8ef..4f01b1929240e 100644 --- a/drivers/s390/cio/chp.c +++ b/drivers/s390/cio/chp.c @@ -695,7 +695,8 @@ static int info_update(void) if (time_after(jiffies, chp_info_expires)) { /* Data is too old, update. 
*/ rc = sclp_chp_read_info(&chp_info); - chp_info_expires = jiffies + CHP_INFO_UPDATE_INTERVAL ; + if (!rc) + chp_info_expires = jiffies + CHP_INFO_UPDATE_INTERVAL; } mutex_unlock(&info_lock); -- GitLab From 6166caf3bbe2429e4fac71b77e1c8254f2690383 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 10 Feb 2025 09:56:53 +0100 Subject: [PATCH 456/989] s390/bitops: Disable arch_test_bit() optimization for PROFILE_ALL_BRANCHES With PROFILE_ALL_BRANCHES enabled gcc sometimes fails to handle __builtin_constant_p() correctly: In function 'arch_test_bit', inlined from 'node_state' at include/linux/nodemask.h:423:9, inlined from 'warn_if_node_offline' at include/linux/gfp.h:252:2, inlined from '__alloc_pages_node_noprof' at include/linux/gfp.h:267:2, inlined from 'alloc_pages_node_noprof' at include/linux/gfp.h:296:9, inlined from 'vm_area_alloc_pages.constprop' at mm/vmalloc.c:3591:11: >> arch/s390/include/asm/bitops.h:60:17: warning: 'asm' operand 2 probably does not match constraints 60 | asm volatile( | ^~~ >> arch/s390/include/asm/bitops.h:60:17: error: impossible constraint in 'asm' Therefore disable the optimization for this case. This is similar to commit 63678eecec57 ("s390/preempt: disable __preempt_count_add() optimization for PROFILE_ALL_BRANCHES") Fixes: b2bc1b1a77c0 ("s390/bitops: Provide optimized arch_test_bit()") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202502091912.xL2xTCGw-lkp@intel.com/ Acked-by: Vasily Gorbik Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/bitops.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index d5125296ade25..a5ca0a9476916 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -53,7 +53,11 @@ static __always_inline bool arch_test_bit(unsigned long nr, const volatile unsig unsigned long mask; int cc; - if (__builtin_constant_p(nr)) { + /* + * With CONFIG_PROFILE_ALL_BRANCHES enabled gcc fails to + * handle __builtin_constant_p() in some cases. + */ + if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && __builtin_constant_p(nr)) { addr = (const volatile unsigned char *)ptr; addr += (nr ^ (BITS_PER_LONG - BITS_PER_BYTE)) / BITS_PER_BYTE; mask = 1UL << (nr & (BITS_PER_BYTE - 1)); -- GitLab From 05793884a1f30509e477de9da233ab73584b1c8c Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Fri, 7 Feb 2025 13:30:16 +0100 Subject: [PATCH 457/989] s390/pci: Pull search for parent PF out of zpci_iov_setup_virtfn() This creates a new zpci_iov_find_parent_pf() function which a future commit can use to find if a VF has a configured parent PF. Use zdev->rid instead of zdev->devfn such that the new function can be used before it has been decided if the RID will be exposed and zdev->devfn is set. Also handle the hypotheical case that the RID is not available but there is an otherwise matching zbus. 
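The key contract of the new helper is that a non-NULL result carries a reference the caller must drop. A minimal usage sketch (the caller name is hypothetical; zpci_iov_find_parent_pf() and pci_dev_put() are the real interfaces from the patch below):

static bool example_vf_has_configured_parent_pf(struct zpci_bus *zbus,
						struct zpci_dev *zdev)
{
	struct pci_dev *pdev_pf = zpci_iov_find_parent_pf(zbus, zdev);

	if (!pdev_pf)
		return false;	/* not a VF, no RID, or PF not on this bus */

	/* ... use pdev_pf here if needed ... */
	pci_dev_put(pdev_pf);	/* balance the reference taken by the helper */
	return true;
}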
Fixes: 25f39d3dcb48 ("s390/pci: Ignore RID for isolated VFs") Cc: stable@vger.kernel.org Reviewed-by: Halil Pasic Signed-off-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/pci/pci_iov.c | 56 ++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 14 deletions(-) diff --git a/arch/s390/pci/pci_iov.c b/arch/s390/pci/pci_iov.c index ead062bf2b41c..c7fdf5e79b3cc 100644 --- a/arch/s390/pci/pci_iov.c +++ b/arch/s390/pci/pci_iov.c @@ -60,18 +60,35 @@ static int zpci_iov_link_virtfn(struct pci_dev *pdev, struct pci_dev *virtfn, in return 0; } -int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn) +/** + * zpci_iov_find_parent_pf - Find the parent PF, if any, of the given function + * @zbus: The bus that the PCI function is on, or would be added on + * @zdev: The PCI function + * + * Finds the parent PF, if it exists and is configured, of the given PCI function + * and increments its refcount. Th PF is searched for on the provided bus so the + * caller has to ensure that this is the correct bus to search. This function may + * be used before adding the PCI function to a zbus. + * + * Return: Pointer to the struct pci_dev of the parent PF or NULL if it not + * found. If the function is not a VF or has no RequesterID information, + * NULL is returned as well. + */ +static struct pci_dev *zpci_iov_find_parent_pf(struct zpci_bus *zbus, struct zpci_dev *zdev) { - int i, cand_devfn; - struct zpci_dev *zdev; + int i, vfid, devfn, cand_devfn; struct pci_dev *pdev; - int vfid = vfn - 1; /* Linux' vfid's start at 0 vfn at 1*/ - int rc = 0; if (!zbus->multifunction) - return 0; - - /* If the parent PF for the given VF is also configured in the + return NULL; + /* Non-VFs and VFs without RID available don't have a parent */ + if (!zdev->vfn || !zdev->rid_available) + return NULL; + /* Linux vfid starts at 0 vfn at 1 */ + vfid = zdev->vfn - 1; + devfn = zdev->rid & ZPCI_RID_MASK_DEVFN; + /* + * If the parent PF for the given VF is also configured in the * instance, it must be on the same zbus. * We can then identify the parent PF by checking what * devfn the VF would have if it belonged to that PF using the PF's @@ -85,15 +102,26 @@ int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn if (!pdev) continue; cand_devfn = pci_iov_virtfn_devfn(pdev, vfid); - if (cand_devfn == virtfn->devfn) { - rc = zpci_iov_link_virtfn(pdev, virtfn, vfid); - /* balance pci_get_slot() */ - pci_dev_put(pdev); - break; - } + if (cand_devfn == devfn) + return pdev; /* balance pci_get_slot() */ pci_dev_put(pdev); } } + return NULL; +} + +int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn) +{ + struct zpci_dev *zdev = to_zpci(virtfn); + struct pci_dev *pdev_pf; + int rc = 0; + + pdev_pf = zpci_iov_find_parent_pf(zbus, zdev); + if (pdev_pf) { + /* Linux' vfids start at 0 while zdev->vfn starts at 1 */ + rc = zpci_iov_link_virtfn(pdev_pf, virtfn, zdev->vfn - 1); + pci_dev_put(pdev_pf); + } return rc; } -- GitLab From 2844ddbd540fc84d7571cca65d6c43088e4d6952 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Fri, 7 Feb 2025 13:30:17 +0100 Subject: [PATCH 458/989] s390/pci: Fix handling of isolated VFs In contrast to the commit message of the fixed commit VFs whose parent PF is not configured are not always isolated, that is put on their own PCI domain. This is because for VFs to be added to an existing PCI domain it is enough for that PCI domain to share the same topology ID or PCHID. 
Such a matching PCI domain without a parent PF may exist when a PF from the same PCI card created the domain with the VF being a child of a different, non accessible, PF. While not causing technical issues it makes the rules which VFs are isolated inconsistent. Fix this by explicitly checking that the parent PF exists on the PCI domain determined by the topology ID or PCHID before registering the VF. This works because a parent PF which is under control of this Linux instance must be enabled and configured at the point where its child VFs appear because otherwise SR-IOV could not have been enabled on the parent. Fixes: 25f39d3dcb48 ("s390/pci: Ignore RID for isolated VFs") Cc: stable@vger.kernel.org Reviewed-by: Halil Pasic Signed-off-by: Niklas Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/pci/pci_bus.c | 20 ++++++++++++++++++++ arch/s390/pci/pci_iov.c | 2 +- arch/s390/pci/pci_iov.h | 7 +++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c index 857afbc4828f0..39a481ec4a402 100644 --- a/arch/s390/pci/pci_bus.c +++ b/arch/s390/pci/pci_bus.c @@ -331,6 +331,17 @@ static int zpci_bus_add_device(struct zpci_bus *zbus, struct zpci_dev *zdev) return rc; } +static bool zpci_bus_is_isolated_vf(struct zpci_bus *zbus, struct zpci_dev *zdev) +{ + struct pci_dev *pdev; + + pdev = zpci_iov_find_parent_pf(zbus, zdev); + if (!pdev) + return true; + pci_dev_put(pdev); + return false; +} + int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops) { bool topo_is_tid = zdev->tid_avail; @@ -345,6 +356,15 @@ int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops) topo = topo_is_tid ? zdev->tid : zdev->pchid; zbus = zpci_bus_get(topo, topo_is_tid); + /* + * An isolated VF gets its own domain/bus even if there exists + * a matching domain/bus already + */ + if (zbus && zpci_bus_is_isolated_vf(zbus, zdev)) { + zpci_bus_put(zbus); + zbus = NULL; + } + if (!zbus) { zbus = zpci_bus_alloc(topo, topo_is_tid); if (!zbus) diff --git a/arch/s390/pci/pci_iov.c b/arch/s390/pci/pci_iov.c index c7fdf5e79b3cc..191e56a623f62 100644 --- a/arch/s390/pci/pci_iov.c +++ b/arch/s390/pci/pci_iov.c @@ -74,7 +74,7 @@ static int zpci_iov_link_virtfn(struct pci_dev *pdev, struct pci_dev *virtfn, in * found. If the function is not a VF or has no RequesterID information, * NULL is returned as well. 
*/ -static struct pci_dev *zpci_iov_find_parent_pf(struct zpci_bus *zbus, struct zpci_dev *zdev) +struct pci_dev *zpci_iov_find_parent_pf(struct zpci_bus *zbus, struct zpci_dev *zdev) { int i, vfid, devfn, cand_devfn; struct pci_dev *pdev; diff --git a/arch/s390/pci/pci_iov.h b/arch/s390/pci/pci_iov.h index e3fa4e77fc867..d2c2793eb0f34 100644 --- a/arch/s390/pci/pci_iov.h +++ b/arch/s390/pci/pci_iov.h @@ -19,6 +19,8 @@ void zpci_iov_map_resources(struct pci_dev *pdev); int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *virtfn, int vfn); +struct pci_dev *zpci_iov_find_parent_pf(struct zpci_bus *zbus, struct zpci_dev *zdev); + #else /* CONFIG_PCI_IOV */ static inline void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn) {} @@ -28,5 +30,10 @@ static inline int zpci_iov_setup_virtfn(struct zpci_bus *zbus, struct pci_dev *v { return 0; } + +static inline struct pci_dev *zpci_iov_find_parent_pf(struct zpci_bus *zbus, struct zpci_dev *zdev) +{ + return NULL; +} #endif /* CONFIG_PCI_IOV */ #endif /* __S390_PCI_IOV_h */ -- GitLab From c195b9c6ab9c383d7aa3f4a65879b3ca90cb378b Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sat, 8 Feb 2025 15:49:07 +0800 Subject: [PATCH 459/989] thermal/netlink: Prevent userspace segmentation fault by adjusting UAPI header The intel-lpmd tool [1], which uses the THERMAL_GENL_ATTR_CPU_CAPABILITY attribute to receive HFI events from kernel space, encounters a segmentation fault after commit 1773572863c4 ("thermal: netlink: Add the commands and the events for the thresholds"). The issue arises because the THERMAL_GENL_ATTR_CPU_CAPABILITY raw value was changed while intel_lpmd still uses the old value. Although intel_lpmd can be updated to check the THERMAL_GENL_VERSION and use the appropriate THERMAL_GENL_ATTR_CPU_CAPABILITY value, the commit itself is questionable. The commit introduced a new element in the middle of enum thermal_genl_attr, which affects many existing attributes and introduces potential risks and unnecessary maintenance burdens for userspace thermal netlink event users. Solve the issue by moving the newly introduced THERMAL_GENL_ATTR_TZ_PREV_TEMP attribute to the end of the enum thermal_genl_attr. This ensures that all existing thermal generic netlink attributes remain unaffected. Link: https://github.com/intel/intel-lpmd [1] Fixes: 1773572863c4 ("thermal: netlink: Add the commands and the events for the thresholds") Signed-off-by: Zhang Rui Reviewed-by: Daniel Lezcano Link: https://patch.msgid.link/20250208074907.5679-1-rui.zhang@intel.com [ rjw: Subject edits ] Signed-off-by: Rafael J. 
Wysocki --- include/uapi/linux/thermal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/thermal.h b/include/uapi/linux/thermal.h index 349718c271ebf..46a2633d33aaa 100644 --- a/include/uapi/linux/thermal.h +++ b/include/uapi/linux/thermal.h @@ -30,7 +30,6 @@ enum thermal_genl_attr { THERMAL_GENL_ATTR_TZ, THERMAL_GENL_ATTR_TZ_ID, THERMAL_GENL_ATTR_TZ_TEMP, - THERMAL_GENL_ATTR_TZ_PREV_TEMP, THERMAL_GENL_ATTR_TZ_TRIP, THERMAL_GENL_ATTR_TZ_TRIP_ID, THERMAL_GENL_ATTR_TZ_TRIP_TYPE, @@ -54,6 +53,7 @@ enum thermal_genl_attr { THERMAL_GENL_ATTR_THRESHOLD, THERMAL_GENL_ATTR_THRESHOLD_TEMP, THERMAL_GENL_ATTR_THRESHOLD_DIRECTION, + THERMAL_GENL_ATTR_TZ_PREV_TEMP, __THERMAL_GENL_ATTR_MAX, }; #define THERMAL_GENL_ATTR_MAX (__THERMAL_GENL_ATTR_MAX - 1) -- GitLab From a6768c4f92e152265590371975d44c071a5279c7 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 11 Feb 2025 09:47:11 +0100 Subject: [PATCH 460/989] thermal/cpufreq_cooling: Remove structure member documentation The structure member documentation refers to a member which does not exist any more. Remove it. Link: https://lore.kernel.org/all/202501220046.h3PMBCti-lkp@intel.com/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501220046.h3PMBCti-lkp@intel.com/ Signed-off-by: Daniel Lezcano Acked-by: Viresh Kumar Link: https://patch.msgid.link/20250211084712.2746705-1-daniel.lezcano@linaro.org [ rjw: Minor changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/thermal/cpufreq_cooling.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c index 280071be30b15..6b7ab1814c12d 100644 --- a/drivers/thermal/cpufreq_cooling.c +++ b/drivers/thermal/cpufreq_cooling.c @@ -57,8 +57,6 @@ struct time_in_idle { * @max_level: maximum cooling level. One less than total number of valid * cpufreq frequencies. * @em: Reference on the Energy Model of the device - * @cdev: thermal_cooling_device pointer to keep track of the - * registered cooling device. * @policy: cpufreq policy. * @cooling_ops: cpufreq callbacks to thermal cooling device ops * @idle_time: idle time stats -- GitLab From 7d1163fc08936fcb5cf5d9daf366c322c3b4e882 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 7 Feb 2025 15:39:01 +0100 Subject: [PATCH 461/989] arm64: dts: rockchip: disable IOMMU when running rk3588 in PCIe endpoint mode Commit da92d3dfc871 ("arm64: dts: rockchip: enable the mmu600_pcie IOMMU on the rk3588 SoC") enabled the mmu600_pcie IOMMU, both in the normal case (when all PCIe controllers are running in Root Complex mode) and in the case when running the pcie3x4 PCIe controller in Endpoint mode. There have been no issues detected when running the PCIe controllers in Root Complex mode. During PCI probe time, we will add a SID to the IOMMU for each PCI device enumerated on the bus, including the root port itself. However, when running the pcie3x4 PCIe controller in Endpoint mode, we will only add a single SID to the IOMMU (the SID specified in the iommus DT property). The enablement of IOMMU in endpoint mode was verified on setup with two Rock 5b:s, where the BDF of the Root Complex has BDF (00:00.0). A Root Complex sending a TLP to the Endpoint will have Requester ID set to the BDF of the initiator. On the EP side, the Requester ID will then be used as the SID. 
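For readers unfamiliar with the encoding: the Requester ID is simply the initiator's bus/devfn packed into 16 bits, so the SID seen by the endpoint is entirely determined by the root complex's BDF. A small standalone illustration (the 16:00.0 root complex is hypothetical; it is the kind of BDF that would produce the "sid: 0x1600" shown in the log further below):

#include <stdint.h>
#include <stdio.h>

/* PCI Requester ID: bus in bits 15:8, device in bits 7:3, function in bits 2:0 */
static uint16_t pci_requester_id(uint8_t bus, uint8_t dev, uint8_t fn)
{
	return ((uint16_t)bus << 8) | ((uint16_t)(dev & 0x1f) << 3) | (fn & 0x7);
}

int main(void)
{
	/* Only the first case matches iommus = <&mmu600_pcie 0x0000>. */
	printf("RC 00:00.0 -> SID 0x%04x\n", pci_requester_id(0x00, 0, 0)); /* 0x0000 */
	printf("RC 16:00.0 -> SID 0x%04x\n", pci_requester_id(0x16, 0, 0)); /* 0x1600 */
	return 0;
}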
This works fine if the Root Complex has a BDF that matches the iommus DT property, however, if the Root Complex has any other BDF, we will see something like: arm-smmu-v3 fc900000.iommu: event: C_BAD_STREAMID client: (unassigned sid) sid: 0x1600 ssid: 0x0 on the endpoint side. For PCIe controllers running in endpoint mode that always uses the incoming Requester ID as the SID, the iommus DT property simply isn't a viable solution. (Neither is iommu-map a viable solution, as there is no enumeration done on the endpoint side.) Thus, partly revert commit da92d3dfc871 ("arm64: dts: rockchip: enable the mmu600_pcie IOMMU on the rk3588 SoC") by disabling the PCI IOMMU when running the pcie3x4 PCIe controller in Endpoint mode. Since the PCI IOMMU is working as expected in the normal case, keep it enabled when running all PCIe controllers in Root Complex mode. Fixes: da92d3dfc871 ("arm64: dts: rockchip: enable the mmu600_pcie IOMMU on the rk3588 SoC") Signed-off-by: Niklas Cassel Acked-by: Robin Murphy Link: https://lore.kernel.org/r/20250207143900.2047949-2-cassel@kernel.org Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588-extra.dtsi | 1 - arch/arm64/boot/dts/rockchip/rk3588-rock-5b-pcie-ep.dtso | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3588-extra.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-extra.dtsi index 4a950907ea6f5..840b638af1c24 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-extra.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588-extra.dtsi @@ -213,7 +213,6 @@ pcie3x4_ep: pcie-ep@fe150000 { interrupt-names = "sys", "pmc", "msg", "legacy", "err", "dma0", "dma1", "dma2", "dma3"; max-link-speed = <3>; - iommus = <&mmu600_pcie 0x0000>; num-lanes = <4>; phys = <&pcie30phy>; phy-names = "pcie-phy"; diff --git a/arch/arm64/boot/dts/rockchip/rk3588-rock-5b-pcie-ep.dtso b/arch/arm64/boot/dts/rockchip/rk3588-rock-5b-pcie-ep.dtso index 672d748fcc67c..f229cb49da680 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-rock-5b-pcie-ep.dtso +++ b/arch/arm64/boot/dts/rockchip/rk3588-rock-5b-pcie-ep.dtso @@ -23,3 +23,7 @@ &pcie3x4_ep { vpcie3v3-supply = <&vcc3v3_pcie30>; status = "okay"; }; + +&mmu600_pcie { + status = "disabled"; +}; -- GitLab From 8546cfd08aa4b982acd2357403a1f15495d622ec Mon Sep 17 00:00:00 2001 From: Patrick Wildt Date: Mon, 10 Feb 2025 22:37:29 +0100 Subject: [PATCH 462/989] arm64: dts: rockchip: adjust SMMU interrupt type on rk3588 The SMMU architecture requires wired interrupts to be edge triggered, which does not align with the DT description for the RK3588. This leads to interrupt storms, as the SMMU continues to hold the pin high and only pulls it down for a short amount when issuing an IRQ. Update the DT description to be in line with the spec and perceived reality. 
Signed-off-by: Patrick Wildt Fixes: cd81d3a0695c ("arm64: dts: rockchip: add rk3588 pcie and php IOMMUs") Reviewed-by: Niklas Cassel Link: https://lore.kernel.org/r/Z6pxme2Chmf3d3uK@windev.fritz.box Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588-base.dtsi | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi index 978de506d4348..c3abdfb04f8f4 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi @@ -549,10 +549,10 @@ usb_host2_xhci: usb@fcd00000 { mmu600_pcie: iommu@fc900000 { compatible = "arm,smmu-v3"; reg = <0x0 0xfc900000 0x0 0x200000>; - interrupts = , - , - , - ; + interrupts = , + , + , + ; interrupt-names = "eventq", "gerror", "priq", "cmdq-sync"; #iommu-cells = <1>; }; @@ -560,10 +560,10 @@ mmu600_pcie: iommu@fc900000 { mmu600_php: iommu@fcb00000 { compatible = "arm,smmu-v3"; reg = <0x0 0xfcb00000 0x0 0x200000>; - interrupts = , - , - , - ; + interrupts = , + , + , + ; interrupt-names = "eventq", "gerror", "priq", "cmdq-sync"; #iommu-cells = <1>; status = "disabled"; -- GitLab From acc18e1c1d8c0d59d793cf87790ccfcafb1bf5f0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 4 Feb 2025 11:02:32 +0000 Subject: [PATCH 463/989] btrfs: fix stale page cache after race between readahead and direct IO write After commit ac325fc2aad5 ("btrfs: do not hold the extent lock for entire read") we can now trigger a race between a task doing a direct IO write and readahead. When this race is triggered it results in tasks getting stale data when they attempt do a buffered read (including the task that did the direct IO write). This race can be sporadically triggered with test case generic/418, failing like this: $ ./check generic/418 FSTYP -- btrfs PLATFORM -- Linux/x86_64 debian0 6.13.0-rc7-btrfs-next-185+ #17 SMP PREEMPT_DYNAMIC Mon Feb 3 12:28:46 WET 2025 MKFS_OPTIONS -- /dev/sdc MOUNT_OPTIONS -- /dev/sdc /home/fdmanana/btrfs-tests/scratch_1 generic/418 14s ... - output mismatch (see /home/fdmanana/git/hub/xfstests/results//generic/418.out.bad) --- tests/generic/418.out 2020-06-10 19:29:03.850519863 +0100 +++ /home/fdmanana/git/hub/xfstests/results//generic/418.out.bad 2025-02-03 15:42:36.974609476 +0000 @@ -1,2 +1,5 @@ QA output created by 418 +cmpbuf: offset 0: Expected: 0x1, got 0x0 +[6:0] FAIL - comparison failed, offset 24576 +diotest -wp -b 4096 -n 8 -i 4 failed at loop 3 Silence is golden ... (Run 'diff -u /home/fdmanana/git/hub/xfstests/tests/generic/418.out /home/fdmanana/git/hub/xfstests/results//generic/418.out.bad' to see the entire diff) Ran: generic/418 Failures: generic/418 Failed 1 of 1 tests The race happens like this: 1) A file has a prealloc extent for the range [16K, 28K); 2) Task A starts a direct IO write against file range [24K, 28K). At the start of the direct IO write it invalidates the page cache at __iomap_dio_rw() with kiocb_invalidate_pages() for the 4K page at file offset 24K; 3) Task A enters btrfs_dio_iomap_begin() and locks the extent range [24K, 28K); 4) Task B starts a readahead for file range [16K, 28K), entering btrfs_readahead(). First it attempts to read the page at offset 16K by entering btrfs_do_readpage(), where it calls get_extent_map(), locks the range [16K, 20K) and gets the extent map for the range [16K, 28K), caching it into the 'em_cached' variable declared in the local stack of btrfs_readahead(), and then unlocks the range [16K, 20K). 
Since the extent map has the prealloc flag, at btrfs_do_readpage() we zero out the page's content and don't submit any bio to read the page from the extent. Then it attempts to read the page at offset 20K entering btrfs_do_readpage() where we reuse the previously cached extent map (decided by get_extent_map()) since it spans the page's range and it's still in the inode's extent map tree. Just like for the previous page, we zero out the page's content since the extent map has the prealloc flag set. Then it attempts to read the page at offset 24K entering btrfs_do_readpage() where we reuse the previously cached extent map (decided by get_extent_map()) since it spans the page's range and it's still in the inode's extent map tree. Just like for the previous pages, we zero out the page's content since the extent map has the prealloc flag set. Note that we didn't lock the extent range [24K, 28K), so we didn't synchronize with the ongoing direct IO write being performed by task A; 5) Task A enters btrfs_create_dio_extent() and creates an ordered extent for the range [24K, 28K), with the flags BTRFS_ORDERED_DIRECT and BTRFS_ORDERED_PREALLOC set; 6) Task A unlocks the range [24K, 28K) at btrfs_dio_iomap_begin(); 7) The ordered extent enters btrfs_finish_one_ordered() and locks the range [24K, 28K); 8) Task A enters fs/iomap/direct-io.c:iomap_dio_complete() and it tries to invalidate the page at offset 24K by calling kiocb_invalidate_post_direct_write(), resulting in a call chain that ends up at btrfs_release_folio(). The btrfs_release_folio() call ends up returning false because the range for the page at file offset 24K is currently locked by the task doing the ordered extent completion in the previous step (7), so we have: btrfs_release_folio() -> __btrfs_release_folio() -> try_release_extent_mapping() -> try_release_extent_state() This last function checking that the range is locked and returning false and propagating it up to btrfs_release_folio(). So this results in a failure to invalidate the page and kiocb_invalidate_post_direct_write() triggers this message logged in dmesg: Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O! After this we leave the page cache with stale data for the file range [24K, 28K), filled with zeroes instead of the data written by direct IO write (all bytes with a 0x01 value), so any task attempting to read with buffered IO, including the task that did the direct IO write, will get all bytes in the range with a 0x00 value instead of the written data. Fix this by locking the range, with btrfs_lock_and_flush_ordered_range(), at the two callers of btrfs_do_readpage() instead of doing it at get_extent_map(), just like we did before commit ac325fc2aad5 ("btrfs: do not hold the extent lock for entire read"), and unlocking the range after all the calls to btrfs_do_readpage(). This way we never reuse a cached extent map without flushing any pending ordered extents from a concurrent direct IO write. 
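The two hunks below boil down to the following shape (a sketch of the readahead path only, not a drop-in replacement; variable setup is omitted). The point is that one lock/unlock pair now brackets every reuse of the cached extent map:

	btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state);

	while ((folio = readahead_folio(rac)) != NULL)
		btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start);

	unlock_extent(&inode->io_tree, start, end, &cached_state);

	/* btrfs_read_folio() gets the same bracket around its single call. */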
Fixes: ac325fc2aad5 ("btrfs: do not hold the extent lock for entire read") Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6f64ee16744d6..d8ded597edada 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -898,7 +898,6 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode, u64 len, struct extent_map **em_cached) { struct extent_map *em; - struct extent_state *cached_state = NULL; ASSERT(em_cached); @@ -914,14 +913,12 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode, *em_cached = NULL; } - btrfs_lock_and_flush_ordered_range(inode, start, start + len - 1, &cached_state); em = btrfs_get_extent(inode, folio, start, len); if (!IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; } - unlock_extent(&inode->io_tree, start, start + len - 1, &cached_state); return em; } @@ -1078,11 +1075,18 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, int btrfs_read_folio(struct file *file, struct folio *folio) { + struct btrfs_inode *inode = folio_to_inode(folio); + const u64 start = folio_pos(folio); + const u64 end = start + folio_size(folio) - 1; + struct extent_state *cached_state = NULL; struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; struct extent_map *em_cached = NULL; int ret; + btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); + unlock_extent(&inode->io_tree, start, end, &cached_state); + free_extent_map(em_cached); /* @@ -2379,12 +2383,20 @@ void btrfs_readahead(struct readahead_control *rac) { struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD }; struct folio *folio; + struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); + const u64 start = readahead_pos(rac); + const u64 end = start + readahead_length(rac) - 1; + struct extent_state *cached_state = NULL; struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; + btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); + while ((folio = readahead_folio(rac)) != NULL) btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); + unlock_extent(&inode->io_tree, start, end, &cached_state); + if (em_cached) free_extent_map(em_cached); submit_one_bio(&bio_ctrl); -- GitLab From da2dccd7451de62b175fb8f0808d644959e964c7 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 5 Feb 2025 17:36:48 +0000 Subject: [PATCH 464/989] btrfs: fix hole expansion when writing at an offset beyond EOF At btrfs_write_check() if our file's i_size is not sector size aligned and we have a write that starts at an offset larger than the i_size that falls within the same page of the i_size, then we end up not zeroing the file range [i_size, write_offset). The code is this: start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); if (start_pos > oldsize) { /* Expand hole size to cover write data, preventing empty gap */ loff_t end_pos = round_up(pos + count, fs_info->sectorsize); ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos); if (ret) return ret; } So if our file's i_size is 90269 bytes and a write at offset 90365 bytes comes in, we get 'start_pos' set to 90112 bytes, which is less than the i_size and therefore we don't zero out the range [90269, 90365) by calling btrfs_cont_expand(). 
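A quick standalone check of those numbers (sector size assumed to be 4096, as in the example; round_down() is redefined here only so the snippet builds in userspace):

#include <stdio.h>

#define round_down(x, y)	((x) & ~((y) - 1))

int main(void)
{
	long pos = 90365, oldsize = 90269, sectorsize = 4096;
	long start_pos = round_down(pos, sectorsize);

	printf("start_pos = %ld\n", start_pos);			/* 90112 */
	printf("start_pos > oldsize? %d\n", start_pos > oldsize);	/* 0: old check never fires */
	printf("pos > oldsize? %d\n", pos > oldsize);		/* 1: fixed check expands the hole */
	return 0;
}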
This is an old bug introduced in commit 9036c10208e1 ("Btrfs: update hole handling v2"), from 2008, and the buggy code got moved around over the years. Fix this by discarding 'start_pos' and comparing against the write offset ('pos') without any alignment. This bug was recently exposed by test case generic/363 which tests this scenario by polluting ranges beyond EOF with an mmap write and than verify that after a file increases we get zeroes for the range which is supposed to be a hole and not what we wrote with the previous mmaped write. We're only seeing this exposed now because generic/363 used to run only on xfs until last Sunday's fstests update. The test was failing like this: $ ./check generic/363 FSTYP -- btrfs PLATFORM -- Linux/x86_64 debian0 6.13.0-rc7-btrfs-next-185+ #17 SMP PREEMPT_DYNAMIC Mon Feb 3 12:28:46 WET 2025 MKFS_OPTIONS -- /dev/sdc MOUNT_OPTIONS -- /dev/sdc /home/fdmanana/btrfs-tests/scratch_1 generic/363 0s ... [failed, exit status 1]- output mismatch (see /home/fdmanana/git/hub/xfstests/results//generic/363.out.bad) --- tests/generic/363.out 2025-02-05 15:31:14.013646509 +0000 +++ /home/fdmanana/git/hub/xfstests/results//generic/363.out.bad 2025-02-05 17:25:33.112630781 +0000 @@ -1 +1,46 @@ QA output created by 363 +READ BAD DATA: offset = 0xdcad, size = 0xd921, fname = /home/fdmanana/btrfs-tests/dev/junk +OFFSET GOOD BAD RANGE +0x1609d 0x0000 0x3104 0x0 +operation# (mod 256) for the bad data may be 4 +0x1609e 0x0000 0x0472 0x1 +operation# (mod 256) for the bad data may be 4 ... (Run 'diff -u /home/fdmanana/git/hub/xfstests/tests/generic/363.out /home/fdmanana/git/hub/xfstests/results//generic/363.out.bad' to see the entire diff) Ran: generic/363 Failures: generic/363 Failed 1 of 1 tests Fixes: 9036c10208e1 ("Btrfs: update hole handling v2") CC: stable@vger.kernel.org Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 36f51c311bb12..ed3c0d6546c5d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1039,7 +1039,6 @@ int btrfs_write_check(struct kiocb *iocb, size_t count) loff_t pos = iocb->ki_pos; int ret; loff_t oldsize; - loff_t start_pos; /* * Quickly bail out on NOWAIT writes if we don't have the nodatacow or @@ -1066,9 +1065,8 @@ int btrfs_write_check(struct kiocb *iocb, size_t count) inode_inc_iversion(inode); } - start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); - if (start_pos > oldsize) { + if (pos > oldsize) { /* Expand hole size to cover write data, preventing empty gap */ loff_t end_pos = round_up(pos + count, fs_info->sectorsize); -- GitLab From 5805402dcc56241987bca674a1b4da79a249bab7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 10 Feb 2025 10:52:42 +0000 Subject: [PATCH 465/989] vxlan: check vxlan_vnigroup_init() return value vxlan_init() must check vxlan_vnigroup_init() success otherwise a crash happens later, spotted by syzbot. 
Oops: general protection fault, probably for non-canonical address 0xdffffc000000002c: 0000 [#1] PREEMPT SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000160-0x0000000000000167] CPU: 0 UID: 0 PID: 7313 Comm: syz-executor147 Not tainted 6.14.0-rc1-syzkaller-00276-g69b54314c975 #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 RIP: 0010:vxlan_vnigroup_uninit+0x89/0x500 drivers/net/vxlan/vxlan_vnifilter.c:912 Code: 00 48 8b 44 24 08 4c 8b b0 98 41 00 00 49 8d 86 60 01 00 00 48 89 c2 48 89 44 24 10 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <80> 3c 02 00 0f 85 4d 04 00 00 49 8b 86 60 01 00 00 48 ba 00 00 00 RSP: 0018:ffffc9000cc1eea8 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: 0000000000000001 RCX: ffffffff8672effb RDX: 000000000000002c RSI: ffffffff8672ecb9 RDI: ffff8880461b4f18 RBP: ffff8880461b4ef4 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000020000 R13: ffff8880461b0d80 R14: 0000000000000000 R15: dffffc0000000000 FS: 00007fecfa95d6c0(0000) GS:ffff88806a600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fecfa95cfb8 CR3: 000000004472c000 CR4: 0000000000352ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: vxlan_uninit+0x1ab/0x200 drivers/net/vxlan/vxlan_core.c:2942 unregister_netdevice_many_notify+0x12d6/0x1f30 net/core/dev.c:11824 unregister_netdevice_many net/core/dev.c:11866 [inline] unregister_netdevice_queue+0x307/0x3f0 net/core/dev.c:11736 register_netdevice+0x1829/0x1eb0 net/core/dev.c:10901 __vxlan_dev_create+0x7c6/0xa30 drivers/net/vxlan/vxlan_core.c:3981 vxlan_newlink+0xd1/0x130 drivers/net/vxlan/vxlan_core.c:4407 rtnl_newlink_create net/core/rtnetlink.c:3795 [inline] __rtnl_newlink net/core/rtnetlink.c:3906 [inline] Fixes: f9c4bb0b245c ("vxlan: vni filtering support on collect metadata device") Reported-by: syzbot+6a9624592218c2c5e7aa@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/67a9d9b4.050a0220.110943.002d.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Roopa Prabhu Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250210105242.883482-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 05c10acb2a57e..92516189e792f 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2898,8 +2898,11 @@ static int vxlan_init(struct net_device *dev) struct vxlan_dev *vxlan = netdev_priv(dev); int err; - if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) - vxlan_vnigroup_init(vxlan); + if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) { + err = vxlan_vnigroup_init(vxlan); + if (err) + return err; + } err = gro_cells_init(&vxlan->gro_cells, dev); if (err) -- GitLab From 1942b1c6f687b9d1efc93f35239f185a84900e93 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Sat, 8 Feb 2025 11:52:23 +0000 Subject: [PATCH 466/989] net: phylink: make configuring clock-stop dependent on MAC support We should not be configuring the PHYs clock-stop settings unless the MAC supports phylink managed EEE. Make this dependent on MAC support. 
This was noticed in a suspicious RCU usage report from the kernel test robot (the suspicious RCU usage due to calling phy_detach() remains unaddressed, but is triggered by the error this was generating.) Fixes: 03abf2a7c654 ("net: phylink: add EEE management") Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/E1tgjNn-003q0w-Pw@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/phylink.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 214b62fba991e..b00a315de0601 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -2265,12 +2265,15 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy, /* Allow the MAC to stop its clock if the PHY has the capability */ pl->mac_tx_clk_stop = phy_eee_tx_clock_stop_capable(phy) > 0; - /* Explicitly configure whether the PHY is allowed to stop it's - * receive clock. - */ - ret = phy_eee_rx_clock_stop(phy, pl->config->eee_rx_clk_stop_enable); - if (ret == -EOPNOTSUPP) - ret = 0; + if (pl->mac_supports_eee_ops) { + /* Explicitly configure whether the PHY is allowed to stop it's + * receive clock. + */ + ret = phy_eee_rx_clock_stop(phy, + pl->config->eee_rx_clk_stop_enable); + if (ret == -EOPNOTSUPP) + ret = 0; + } return ret; } -- GitLab From f1bf10d7e909fe898a112f5cae1e97ce34d6484d Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Tue, 11 Feb 2025 10:00:25 +0000 Subject: [PATCH 467/989] cifs: pick channels for individual subrequests The netfs library could break down a read request into multiple subrequests. When multichannel is used, there is potential to improve performance when each of these subrequests pick a different channel. Today we call cifs_pick_channel when the main read request is initialized in cifs_init_request. This change moves this to cifs_prepare_read, which is the right place to pick channel since it gets called for each subrequest. Interestingly cifs_prepare_write already does channel selection for individual subreq, but looks like it was missed for read. This is especially important when multichannel is used with increased rasize. In my test setup, with rasize set to 8MB, a sequential read of large file was taking 11.5s without this change. With the change, it completed in 9s. The difference is even more signigicant with bigger rasize. 
Cc: Cc: David Howells Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 1 - fs/smb/client/file.c | 7 ++++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index ac1f890a0d543..4bdd6a43e5215 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1508,7 +1508,6 @@ struct cifs_io_parms { struct cifs_io_request { struct netfs_io_request rreq; struct cifsFileInfo *cfile; - struct TCP_Server_Info *server; pid_t pid; }; diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 79de2f2f9c41a..8582cf61242c6 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -147,7 +147,7 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq) struct netfs_io_request *rreq = subreq->rreq; struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); - struct TCP_Server_Info *server = req->server; + struct TCP_Server_Info *server; struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); size_t size; int rc = 0; @@ -156,6 +156,8 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq) rdata->xid = get_xid(); rdata->have_xid = true; } + + server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses); rdata->server = server; if (cifs_sb->ctx->rsize == 0) @@ -198,7 +200,7 @@ static void cifs_issue_read(struct netfs_io_subrequest *subreq) struct netfs_io_request *rreq = subreq->rreq; struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); - struct TCP_Server_Info *server = req->server; + struct TCP_Server_Info *server = rdata->server; int rc = 0; cifs_dbg(FYI, "%s: op=%08x[%x] mapping=%p len=%zu/%zu\n", @@ -266,7 +268,6 @@ static int cifs_init_request(struct netfs_io_request *rreq, struct file *file) open_file = file->private_data; rreq->netfs_priv = file->private_data; req->cfile = cifsFileInfo_get(open_file); - req->server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) req->pid = req->cfile->pid; } else if (rreq->origin != NETFS_WRITEBACK) { -- GitLab From 06ea2c9c4163b8a8fde890a9e21d1059f22bb76d Mon Sep 17 00:00:00 2001 From: David Howells Date: Sun, 9 Feb 2025 20:07:55 +0000 Subject: [PATCH 468/989] rxrpc: Fix alteration of headers whilst zerocopy pending rxrpc: Fix alteration of headers whilst zerocopy pending AF_RXRPC now uses MSG_SPLICE_PAGES to do zerocopy of the DATA packets when it transmits them, but to reduce the number of descriptors required in the DMA ring, it allocates a space for the protocol header in the memory immediately before the data content so that it can include both in a single descriptor. 
This is used for either the main RX header or the smaller jumbo subpacket header as appropriate: +----+------+ | RX | | +-+--+DATA | |JH| | +--+------+ Now, when it stitches a large jumbo packet together from a number of individual DATA packets (each of which is 1412 bytes of data), it uses the full RX header from the first and then the jumbo subpacket header for the rest of the components: +---+--+------+--+------+--+------+--+------+--+------+--+------+ |UDP|RX|DATA |JH|DATA |JH|DATA |JH|DATA |JH|DATA |JH|DATA | +---+--+------+--+------+--+------+--+------+--+------+--+------+ As mentioned, the main RX header and the jumbo header overlay one another in memory and the formats don't match, so switching from one to the other means rearranging the fields and adjusting the flags. However, now that TLP has been included, it wants to retransmit the last subpacket as a new data packet on its own, which means switching between the header formats... and if the transmission is still pending, because of the MSG_SPLICE_PAGES, we end up corrupting the jumbo subheader. This has a variety of effects, with the RX service number overwriting the jumbo checksum/key number field and the RX checksum overwriting the jumbo flags - resulting in, at the very least, a confused connection-level abort from the peer. Fix this by leaving the jumbo header in the allocation with the data, but allocating the RX header from the page frag allocator and concocting it on the fly at the point of transmission as it does for ACK packets. Fixes: 7c482665931b ("rxrpc: Implement RACK/TLP to deal with transmission stalls [RFC8985]") Signed-off-by: David Howells cc: Marc Dionne cc: Chuck Lever cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/2181712.1739131675@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- net/rxrpc/ar-internal.h | 7 +++--- net/rxrpc/output.c | 50 ++++++++++++++++++++++++++++------------- net/rxrpc/rxkad.c | 13 +++++------ net/rxrpc/sendmsg.c | 4 +--- net/rxrpc/txbuf.c | 37 +++++++++--------------------- 5 files changed, 54 insertions(+), 57 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index f251845fe532c..5e740c4862034 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -327,8 +327,8 @@ struct rxrpc_local { * packet with a maximum set of jumbo subpackets or a PING ACK padded * out to 64K with zeropages for PMTUD. */ - struct kvec kvec[RXRPC_MAX_NR_JUMBO > 3 + 16 ? - RXRPC_MAX_NR_JUMBO : 3 + 16]; + struct kvec kvec[1 + RXRPC_MAX_NR_JUMBO > 3 + 16 ? 
+ 1 + RXRPC_MAX_NR_JUMBO : 3 + 16]; }; /* @@ -874,8 +874,7 @@ struct rxrpc_txbuf { #define RXRPC_TXBUF_RESENT 0x100 /* Set if has been resent */ __be16 cksum; /* Checksum to go in header */ bool jumboable; /* Can be non-terminal jumbo subpacket */ - u8 nr_kvec; /* Amount of kvec[] used */ - struct kvec kvec[1]; + void *data; /* Data with preceding jumbo header */ }; static inline bool rxrpc_sending_to_server(const struct rxrpc_txbuf *txb) diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 6f7a125d6e908..95905b85a8d71 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -428,13 +428,13 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_send_data_req *req, struct rxrpc_txbuf *txb, + struct rxrpc_wire_header *whdr, rxrpc_serial_t serial, int subpkt) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; - struct rxrpc_jumbo_header *jumbo = (void *)(whdr + 1) - sizeof(*jumbo); + struct rxrpc_jumbo_header *jumbo = txb->data - sizeof(*jumbo); enum rxrpc_req_ack_trace why; struct rxrpc_connection *conn = call->conn; - struct kvec *kv = &call->local->kvec[subpkt]; + struct kvec *kv = &call->local->kvec[1 + subpkt]; size_t len = txb->pkt_len; bool last; u8 flags; @@ -491,18 +491,15 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, } dont_set_request_ack: - /* The jumbo header overlays the wire header in the txbuf. */ + /* There's a jumbo header prepended to the data if we need it. */ if (subpkt < req->n - 1) flags |= RXRPC_JUMBO_PACKET; else flags &= ~RXRPC_JUMBO_PACKET; if (subpkt == 0) { whdr->flags = flags; - whdr->serial = htonl(txb->serial); whdr->cksum = txb->cksum; - whdr->serviceId = htons(conn->service_id); - kv->iov_base = whdr; - len += sizeof(*whdr); + kv->iov_base = txb->data; } else { jumbo->flags = flags; jumbo->pad = 0; @@ -535,7 +532,9 @@ static unsigned int rxrpc_prepare_txqueue(struct rxrpc_txqueue *tq, /* * Prepare a (jumbo) packet for transmission. */ -static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req) +static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, + struct rxrpc_send_data_req *req, + struct rxrpc_wire_header *whdr) { struct rxrpc_txqueue *tq = req->tq; rxrpc_serial_t serial; @@ -549,6 +548,18 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_se /* Each transmission of a Tx packet needs a new serial number */ serial = rxrpc_get_next_serials(call->conn, req->n); + whdr->epoch = htonl(call->conn->proto.epoch); + whdr->cid = htonl(call->cid); + whdr->callNumber = htonl(call->call_id); + whdr->seq = htonl(seq); + whdr->serial = htonl(serial); + whdr->type = RXRPC_PACKET_TYPE_DATA; + whdr->flags = 0; + whdr->userStatus = 0; + whdr->securityIndex = call->security_ix; + whdr->_rsvd = 0; + whdr->serviceId = htons(call->conn->service_id); + call->tx_last_serial = serial + req->n - 1; call->tx_last_sent = req->now; xmit_ts = rxrpc_prepare_txqueue(tq, req); @@ -576,7 +587,7 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_se if (i + 1 == req->n) /* Only sample the last subpacket in a jumbo. 
*/ __set_bit(ix, &tq->rtt_samples); - len += rxrpc_prepare_data_subpacket(call, req, txb, serial, i); + len += rxrpc_prepare_data_subpacket(call, req, txb, whdr, serial, i); serial++; seq++; i++; @@ -618,6 +629,7 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_se } rxrpc_set_keepalive(call, req->now); + page_frag_free(whdr); return len; } @@ -626,25 +638,33 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_se */ void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req) { + struct rxrpc_wire_header *whdr; struct rxrpc_connection *conn = call->conn; enum rxrpc_tx_point frag; struct rxrpc_txqueue *tq = req->tq; struct rxrpc_txbuf *txb; struct msghdr msg; rxrpc_seq_t seq = req->seq; - size_t len; + size_t len = sizeof(*whdr); bool new_call = test_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags); int ret, stat_ix; _enter("%x,%x-%x", tq->qbase, seq, seq + req->n - 1); + whdr = page_frag_alloc(&call->local->tx_alloc, sizeof(*whdr), GFP_NOFS); + if (!whdr) + return; /* Drop the packet if no memory. */ + + call->local->kvec[0].iov_base = whdr; + call->local->kvec[0].iov_len = sizeof(*whdr); + stat_ix = umin(req->n, ARRAY_SIZE(call->rxnet->stat_tx_jumbo)) - 1; atomic_inc(&call->rxnet->stat_tx_jumbo[stat_ix]); - len = rxrpc_prepare_data_packet(call, req); + len += rxrpc_prepare_data_packet(call, req, whdr); txb = tq->bufs[seq & RXRPC_TXQ_MASK]; - iov_iter_kvec(&msg.msg_iter, WRITE, call->local->kvec, req->n, len); + iov_iter_kvec(&msg.msg_iter, WRITE, call->local->kvec, 1 + req->n, len); msg.msg_name = &call->peer->srx.transport; msg.msg_namelen = call->peer->srx.transport_len; @@ -695,13 +715,13 @@ void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req if (ret == -EMSGSIZE) { rxrpc_inc_stat(call->rxnet, stat_tx_data_send_msgsize); - trace_rxrpc_tx_packet(call->debug_id, call->local->kvec[0].iov_base, frag); + trace_rxrpc_tx_packet(call->debug_id, whdr, frag); ret = 0; } else if (ret < 0) { rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail); trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret, frag); } else { - trace_rxrpc_tx_packet(call->debug_id, call->local->kvec[0].iov_base, frag); + trace_rxrpc_tx_packet(call->debug_id, whdr, frag); } rxrpc_tx_backoff(call, ret); diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 62b09d23ec08c..6cb37b0eb77f4 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -257,8 +257,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, struct rxrpc_txbuf *txb, struct skcipher_request *req) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; - struct rxkad_level1_hdr *hdr = (void *)(whdr + 1); + struct rxkad_level1_hdr *hdr = txb->data; struct rxrpc_crypt iv; struct scatterlist sg; size_t pad; @@ -274,7 +273,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, pad = RXKAD_ALIGN - pad; pad &= RXKAD_ALIGN - 1; if (pad) { - memset(txb->kvec[0].iov_base + txb->offset, 0, pad); + memset(txb->data + txb->offset, 0, pad); txb->pkt_len += pad; } @@ -300,8 +299,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, struct skcipher_request *req) { const struct rxrpc_key_token *token; - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; - struct rxkad_level2_hdr *rxkhdr = (void *)(whdr + 1); + struct rxkad_level2_hdr *rxkhdr = txb->data; struct rxrpc_crypt iv; struct scatterlist sg; size_t content, pad; @@ -319,7 +317,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, 
txb->pkt_len = round_up(content, RXKAD_ALIGN); pad = txb->pkt_len - content; if (pad) - memset(txb->kvec[0].iov_base + txb->offset, 0, pad); + memset(txb->data + txb->offset, 0, pad); /* encrypt from the session key */ token = call->conn->key->payload.data[0]; @@ -407,9 +405,8 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) /* Clear excess space in the packet */ if (txb->pkt_len < txb->alloc_size) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; size_t gap = txb->alloc_size - txb->pkt_len; - void *p = whdr + 1; + void *p = txb->data; memset(p + txb->pkt_len, 0, gap); } diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 584397aba4a07..84dc6c94f23b1 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -419,7 +419,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, size_t copy = umin(txb->space, msg_data_left(msg)); _debug("add %zu", copy); - if (!copy_from_iter_full(txb->kvec[0].iov_base + txb->offset, + if (!copy_from_iter_full(txb->data + txb->offset, copy, &msg->msg_iter)) goto efault; _debug("added"); @@ -445,8 +445,6 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, ret = call->security->secure_packet(call, txb); if (ret < 0) goto out; - - txb->kvec[0].iov_len += txb->len; rxrpc_queue_packet(rx, call, txb, notify_end_tx); txb = NULL; } diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index 131d9e55c8e97..c550991d48faa 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -19,17 +19,19 @@ atomic_t rxrpc_nr_txbuf; struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size, size_t data_align, gfp_t gfp) { - struct rxrpc_wire_header *whdr; struct rxrpc_txbuf *txb; - size_t total, hoff; + size_t total, doff, jsize = sizeof(struct rxrpc_jumbo_header); void *buf; txb = kzalloc(sizeof(*txb), gfp); if (!txb) return NULL; - hoff = round_up(sizeof(*whdr), data_align) - sizeof(*whdr); - total = hoff + sizeof(*whdr) + data_size; + /* We put a jumbo header in the buffer, but not a full wire header to + * avoid delayed-corruption problems with zerocopy. 
+ */ + doff = round_up(jsize, data_align); + total = doff + data_size; data_align = umax(data_align, L1_CACHE_BYTES); mutex_lock(&call->conn->tx_data_alloc_lock); @@ -41,30 +43,15 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_ return NULL; } - whdr = buf + hoff; - refcount_set(&txb->ref, 1); txb->call_debug_id = call->debug_id; txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids); txb->alloc_size = data_size; txb->space = data_size; - txb->offset = sizeof(*whdr); + txb->offset = 0; txb->flags = call->conn->out_clientflag; txb->seq = call->send_top + 1; - txb->nr_kvec = 1; - txb->kvec[0].iov_base = whdr; - txb->kvec[0].iov_len = sizeof(*whdr); - - whdr->epoch = htonl(call->conn->proto.epoch); - whdr->cid = htonl(call->cid); - whdr->callNumber = htonl(call->call_id); - whdr->seq = htonl(txb->seq); - whdr->type = RXRPC_PACKET_TYPE_DATA; - whdr->flags = 0; - whdr->userStatus = 0; - whdr->securityIndex = call->security_ix; - whdr->_rsvd = 0; - whdr->serviceId = htons(call->dest_srx.srx_service); + txb->data = buf + doff; trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 1, rxrpc_txbuf_alloc_data); @@ -90,14 +77,10 @@ void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) static void rxrpc_free_txbuf(struct rxrpc_txbuf *txb) { - int i; - trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 0, rxrpc_txbuf_free); - for (i = 0; i < txb->nr_kvec; i++) - if (txb->kvec[i].iov_base && - !is_zero_pfn(page_to_pfn(virt_to_page(txb->kvec[i].iov_base)))) - page_frag_free(txb->kvec[i].iov_base); + if (txb->data) + page_frag_free(txb->data); kfree(txb); atomic_dec(&rxrpc_nr_txbuf); } -- GitLab From e589adf5b70c07b1ab974d077046fdbf583b2f36 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 6 Feb 2025 09:51:08 -0800 Subject: [PATCH 469/989] iavf: Fix a locking bug in an error path If the netdev lock has been obtained, unlock it before returning. This bug has been detected by the Clang thread-safety analyzer. Fixes: afc664987ab3 ("eth: iavf: extend the netdev_lock usage") Signed-off-by: Bart Van Assche Link: https://patch.msgid.link/20250206175114.1974171-28-bvanassche@acm.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/iavf/iavf_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 2d7a18fcc3be4..852e5b62f0a5d 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2903,8 +2903,8 @@ static void iavf_watchdog_task(struct work_struct *work) } mutex_unlock(&adapter->crit_lock); - netdev_unlock(netdev); restart_watchdog: + netdev_unlock(netdev); if (adapter->state >= __IAVF_DOWN) queue_work(adapter->wq, &adapter->adminq_task); if (adapter->aq_required) -- GitLab From 8743d66979e494c5378563e6b5a32e913380abd8 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 11 Feb 2025 14:32:01 -0600 Subject: [PATCH 470/989] gpiolib: acpi: Add a quirk for Acer Nitro ANV14 Spurious immediate wake up events are reported on Acer Nitro ANV14. GPIO 11 is specified as an edge triggered input and also a wake source but this pin is supposed to be an output pin for an LED, so it's effectively floating. Block the interrupt from getting set up for this GPIO on this device. 
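As a minimal sketch (illustrative only; the code that consumes the table is outside this hunk, and the pr_debug() user below is an assumption), a quirk entry like the one added here is typically looked up with the stock DMI helper:

  const struct dmi_system_id *id = dmi_first_match(gpiolib_acpi_quirks);

  if (id) {
          const struct acpi_gpiolib_dmi_quirk *quirk = id->driver_data;

          /* "AMDI0030:00@11" = ACPI GPIO controller instance + pin 11 */
          if (quirk->ignore_interrupt)
                  pr_debug("not requesting IRQ for %s\n", quirk->ignore_interrupt);
  }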
Cc: stable@vger.kernel.org Reported-by: Delgan Tested-by: Delgan Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3954 Signed-off-by: Mario Limonciello Acked-by: Mika Westerberg Link: https://lore.kernel.org/r/20250211203222.761206-1-superm1@kernel.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-acpi.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c index 1f9fe50bba005..f7746c57ba76a 100644 --- a/drivers/gpio/gpiolib-acpi.c +++ b/drivers/gpio/gpiolib-acpi.c @@ -1689,6 +1689,20 @@ static const struct dmi_system_id gpiolib_acpi_quirks[] __initconst = { .ignore_wake = "PNP0C50:00@8", }, }, + { + /* + * Spurious wakeups from GPIO 11 + * Found in BIOS 1.04 + * https://gitlab.freedesktop.org/drm/amd/-/issues/3954 + */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_FAMILY, "Acer Nitro V 14"), + }, + .driver_data = &(struct acpi_gpiolib_dmi_quirk) { + .ignore_interrupt = "AMDI0030:00@11", + }, + }, {} /* Terminating entry */ }; -- GitLab From d262a192d38e527faa5984629aabda2e0d1c4f54 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 12 Feb 2025 07:46:28 +0100 Subject: [PATCH 471/989] powerpc/code-patching: Fix KASAN hit by not flagging text patching area as VM_ALLOC Erhard reported the following KASAN hit while booting his PowerMac G4 with a KASAN-enabled kernel 6.13-rc6: BUG: KASAN: vmalloc-out-of-bounds in copy_to_kernel_nofault+0xd8/0x1c8 Write of size 8 at addr f1000000 by task chronyd/1293 CPU: 0 UID: 123 PID: 1293 Comm: chronyd Tainted: G W 6.13.0-rc6-PMacG4 #2 Tainted: [W]=WARN Hardware name: PowerMac3,6 7455 0x80010303 PowerMac Call Trace: [c2437590] [c1631a84] dump_stack_lvl+0x70/0x8c (unreliable) [c24375b0] [c0504998] print_report+0xdc/0x504 [c2437610] [c050475c] kasan_report+0xf8/0x108 [c2437690] [c0505a3c] kasan_check_range+0x24/0x18c [c24376a0] [c03fb5e4] copy_to_kernel_nofault+0xd8/0x1c8 [c24376c0] [c004c014] patch_instructions+0x15c/0x16c [c2437710] [c00731a8] bpf_arch_text_copy+0x60/0x7c [c2437730] [c0281168] bpf_jit_binary_pack_finalize+0x50/0xac [c2437750] [c0073cf4] bpf_int_jit_compile+0xb30/0xdec [c2437880] [c0280394] bpf_prog_select_runtime+0x15c/0x478 [c24378d0] [c1263428] bpf_prepare_filter+0xbf8/0xc14 [c2437990] [c12677ec] bpf_prog_create_from_user+0x258/0x2b4 [c24379d0] [c027111c] do_seccomp+0x3dc/0x1890 [c2437ac0] [c001d8e0] system_call_exception+0x2dc/0x420 [c2437f30] [c00281ac] ret_from_syscall+0x0/0x2c --- interrupt: c00 at 0x5a1274 NIP: 005a1274 LR: 006a3b3c CTR: 005296c8 REGS: c2437f40 TRAP: 0c00 Tainted: G W (6.13.0-rc6-PMacG4) MSR: 0200f932 CR: 24004422 XER: 00000000 GPR00: 00000166 af8f3fa0 a7ee3540 00000001 00000000 013b6500 005a5858 0200f932 GPR08: 00000000 00001fe9 013d5fc8 005296c8 2822244c 00b2fcd8 00000000 af8f4b57 GPR16: 00000000 00000001 00000000 00000000 00000000 00000001 00000000 00000002 GPR24: 00afdbb0 00000000 00000000 00000000 006e0004 013ce060 006e7c1c 00000001 NIP [005a1274] 0x5a1274 LR [006a3b3c] 0x6a3b3c --- interrupt: c00 The buggy address belongs to the virtual mapping at [f1000000, f1002000) created by: text_area_cpu_up+0x20/0x190 The buggy address belongs to the physical page: page: refcount:1 mapcount:0 mapping:00000000 index:0x0 pfn:0x76e30 flags: 0x80000000(zone=2) raw: 80000000 00000000 00000122 00000000 00000000 00000000 ffffffff 00000001 raw: 00000000 page dumped because: kasan: bad access detected Memory state around the buggy address: f0ffff00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 f0ffff80: 
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >f1000000: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 ^ f1000080: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f1000100: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 ================================================================== f8 corresponds to KASAN_VMALLOC_INVALID which means the area is not initialised hence not supposed to be used yet. Powerpc text patching infrastructure allocates a virtual memory area using get_vm_area() and flags it as VM_ALLOC. But that flag is meant to be used for vmalloc() and vmalloc() allocated memory is not supposed to be used before a call to __vmalloc_node_range() which is never called for that area. That went undetected until commit e4137f08816b ("mm, kasan, kmsan: instrument copy_from/to_kernel_nofault") The area allocated by text_area_cpu_up() is not vmalloc memory, it is mapped directly on demand when needed by map_kernel_page(). There is no VM flag corresponding to such usage, so just pass no flag. That way the area will be unpoisonned and usable immediately. Reported-by: Erhard Furtner Closes: https://lore.kernel.org/all/20250112135832.57c92322@yea/ Fixes: 37bc3e5fd764 ("powerpc/lib/code-patching: Use alternate map for patch_instruction()") Signed-off-by: Christophe Leroy Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/06621423da339b374f48c0886e3a5db18e896be8.1739342693.git.christophe.leroy@csgroup.eu --- arch/powerpc/lib/code-patching.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 81c0f673eb252..f84e0337cc029 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -108,7 +108,7 @@ static int text_area_cpu_up(unsigned int cpu) unsigned long addr; int err; - area = get_vm_area(PAGE_SIZE, VM_ALLOC); + area = get_vm_area(PAGE_SIZE, 0); if (!area) { WARN_ONCE(1, "Failed to create text area for cpu %d\n", cpu); -- GitLab From cf56aa8dd26328a9af4ffe7fb0bd8fcfa9407112 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 7 Feb 2025 13:25:57 +0100 Subject: [PATCH 472/989] Revert "netfilter: flowtable: teardown flow if cached mtu is stale" This reverts commit b8baac3b9c5cc4b261454ff87d75ae8306016ffd. IPv4 packets with no DF flag set on result in frequent flow entry teardown cycles, this is visible in the network topology that is used in the nft_flowtable.sh test. nft_flowtable.sh test ocassionally fails reporting that the dscp_fwd test sees no packets going through the flowtable path. 
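For reference, a sketch of the check in question, paraphrased from nf_flow_table_ip.c (details may differ); after the revert an oversized packet simply falls out of the fast path (return 0, classic forwarding path) instead of tearing the whole flow down:

  static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
  {
          if (skb->len <= mtu)
                  return false;
          if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
                  return false;
          return true;
  }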
Fixes: b8baac3b9c5c ("netfilter: flowtable: teardown flow if cached mtu is stale") Reported-by: Jakub Kicinski Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 97c6eb8847a02..8cd4cf7ae2112 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -381,10 +381,8 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; - if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) { - flow_offload_teardown(flow); + if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) return 0; - } iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset); thoff = (iph->ihl * 4) + ctx->offset; @@ -662,10 +660,8 @@ static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx, flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; - if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) { - flow_offload_teardown(flow); + if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) return 0; - } ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset); thoff = sizeof(*ip6h) + ctx->offset; -- GitLab From b9644fbfbcab13da7f8b37bef7c51e5b8407d031 Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Wed, 12 Feb 2025 10:18:49 +0800 Subject: [PATCH 473/989] gpio: stmpe: Check return value of stmpe_reg_read in stmpe_gpio_irq_sync_unlock The stmpe_reg_read function can fail, but its return value is not checked in stmpe_gpio_irq_sync_unlock. This can lead to silent failures and incorrect behavior if the hardware access fails. This patch adds checks for the return value of stmpe_reg_read. If the function fails, an error message is logged and the function returns early to avoid further issues. 
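Condensed, the convention the fix relies on looks like this (illustrative fragment; the variable names are assumed): stmpe_reg_read() returns the register value on success or a negative errno, so the result must be checked before it is treated as data.

  int ret = stmpe_reg_read(stmpe, reg);  /* >= 0: register value, < 0: -errno */

  if (ret < 0)
          return ret;     /* propagate the error instead of using it as data */
  val = ret;              /* only now is the value safe to use               */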
Fixes: b888fb6f2a27 ("gpio: stmpe: i2c transfer are forbiden in atomic context") Cc: stable@vger.kernel.org # 4.16+ Signed-off-by: Wentao Liang Link: https://lore.kernel.org/r/20250212021849.275-1-vulab@iscas.ac.cn Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-stmpe.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/gpio/gpio-stmpe.c b/drivers/gpio/gpio-stmpe.c index 75a3633ceddbb..222279a9d82b2 100644 --- a/drivers/gpio/gpio-stmpe.c +++ b/drivers/gpio/gpio-stmpe.c @@ -191,7 +191,7 @@ static void stmpe_gpio_irq_sync_unlock(struct irq_data *d) [REG_IE][CSB] = STMPE_IDX_IEGPIOR_CSB, [REG_IE][MSB] = STMPE_IDX_IEGPIOR_MSB, }; - int i, j; + int ret, i, j; /* * STMPE1600: to be able to get IRQ from pins, @@ -199,8 +199,16 @@ static void stmpe_gpio_irq_sync_unlock(struct irq_data *d) * GPSR or GPCR registers */ if (stmpe->partnum == STMPE1600) { - stmpe_reg_read(stmpe, stmpe->regs[STMPE_IDX_GPMR_LSB]); - stmpe_reg_read(stmpe, stmpe->regs[STMPE_IDX_GPMR_CSB]); + ret = stmpe_reg_read(stmpe, stmpe->regs[STMPE_IDX_GPMR_LSB]); + if (ret < 0) { + dev_err(stmpe->dev, "Failed to read GPMR_LSB: %d\n", ret); + goto err; + } + ret = stmpe_reg_read(stmpe, stmpe->regs[STMPE_IDX_GPMR_CSB]); + if (ret < 0) { + dev_err(stmpe->dev, "Failed to read GPMR_CSB: %d\n", ret); + goto err; + } } for (i = 0; i < CACHE_NR_REGS; i++) { @@ -222,6 +230,7 @@ static void stmpe_gpio_irq_sync_unlock(struct irq_data *d) } } +err: mutex_unlock(&stmpe_gpio->irq_lock); } -- GitLab From 56d5f3eba3f5de0efdd556de4ef381e109b973a9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 11 Feb 2025 18:15:59 +0100 Subject: [PATCH 474/989] acct: perform last write from workqueue In [1] it was reported that the acct(2) system call can be used to trigger NULL deref in cases where it is set to write to a file that triggers an internal lookup. This can e.g., happen when pointing acc(2) to /sys/power/resume. At the point the where the write to this file happens the calling task has already exited and called exit_fs(). A lookup will thus trigger a NULL-deref when accessing current->fs. Reorganize the code so that the the final write happens from the workqueue but with the caller's credentials. This preserves the (strange) permission model and has almost no regression risk. This api should stop to exist though. Link: https://lore.kernel.org/r/20250127091811.3183623-1-quzicheng@huawei.com [1] Link: https://lore.kernel.org/r/20250211-work-acct-v1-1-1c16aecab8b3@kernel.org Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Zicheng Qu Cc: stable@vger.kernel.org Signed-off-by: Christian Brauner --- kernel/acct.c | 120 +++++++++++++++++++++++++++++--------------------- 1 file changed, 70 insertions(+), 50 deletions(-) diff --git a/kernel/acct.c b/kernel/acct.c index 31222e8cd534f..48283efe8a123 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -103,48 +103,50 @@ struct bsd_acct_struct { atomic_long_t count; struct rcu_head rcu; struct mutex lock; - int active; + bool active; + bool check_space; unsigned long needcheck; struct file *file; struct pid_namespace *ns; struct work_struct work; struct completion done; + acct_t ac; }; -static void do_acct_process(struct bsd_acct_struct *acct); +static void fill_ac(struct bsd_acct_struct *acct); +static void acct_write_process(struct bsd_acct_struct *acct); /* * Check the amount of free space and suspend/resume accordingly. 
*/ -static int check_free_space(struct bsd_acct_struct *acct) +static bool check_free_space(struct bsd_acct_struct *acct) { struct kstatfs sbuf; - if (time_is_after_jiffies(acct->needcheck)) - goto out; + if (!acct->check_space) + return acct->active; /* May block */ if (vfs_statfs(&acct->file->f_path, &sbuf)) - goto out; + return acct->active; if (acct->active) { u64 suspend = sbuf.f_blocks * SUSPEND; do_div(suspend, 100); if (sbuf.f_bavail <= suspend) { - acct->active = 0; + acct->active = false; pr_info("Process accounting paused\n"); } } else { u64 resume = sbuf.f_blocks * RESUME; do_div(resume, 100); if (sbuf.f_bavail >= resume) { - acct->active = 1; + acct->active = true; pr_info("Process accounting resumed\n"); } } acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; -out: return acct->active; } @@ -189,7 +191,11 @@ static void acct_pin_kill(struct fs_pin *pin) { struct bsd_acct_struct *acct = to_acct(pin); mutex_lock(&acct->lock); - do_acct_process(acct); + /* + * Fill the accounting struct with the exiting task's info + * before punting to the workqueue. + */ + fill_ac(acct); schedule_work(&acct->work); wait_for_completion(&acct->done); cmpxchg(&acct->ns->bacct, pin, NULL); @@ -202,6 +208,9 @@ static void close_work(struct work_struct *work) { struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); struct file *file = acct->file; + + /* We were fired by acct_pin_kill() which holds acct->lock. */ + acct_write_process(acct); if (file->f_op->flush) file->f_op->flush(file, NULL); __fput_sync(file); @@ -430,13 +439,27 @@ static u32 encode_float(u64 value) * do_exit() or when switching to a different output file. */ -static void fill_ac(acct_t *ac) +static void fill_ac(struct bsd_acct_struct *acct) { struct pacct_struct *pacct = ¤t->signal->pacct; + struct file *file = acct->file; + acct_t *ac = &acct->ac; u64 elapsed, run_time; time64_t btime; struct tty_struct *tty; + lockdep_assert_held(&acct->lock); + + if (time_is_after_jiffies(acct->needcheck)) { + acct->check_space = false; + + /* Don't fill in @ac if nothing will be written. */ + if (!acct->active) + return; + } else { + acct->check_space = true; + } + /* * Fill the accounting struct with the needed info as recorded * by the different kernel functions. @@ -484,64 +507,61 @@ static void fill_ac(acct_t *ac) ac->ac_majflt = encode_comp_t(pacct->ac_majflt); ac->ac_exitcode = pacct->ac_exitcode; spin_unlock_irq(¤t->sighand->siglock); -} -/* - * do_acct_process does all actual work. Caller holds the reference to file. - */ -static void do_acct_process(struct bsd_acct_struct *acct) -{ - acct_t ac; - unsigned long flim; - const struct cred *orig_cred; - struct file *file = acct->file; - - /* - * Accounting records are not subject to resource limits. - */ - flim = rlimit(RLIMIT_FSIZE); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - /* Perform file operations on behalf of whoever enabled accounting */ - orig_cred = override_creds(file->f_cred); - /* - * First check to see if there is enough free_space to continue - * the process accounting system. 
- */ - if (!check_free_space(acct)) - goto out; - - fill_ac(&ac); /* we really need to bite the bullet and change layout */ - ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); - ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); + ac->ac_uid = from_kuid_munged(file->f_cred->user_ns, current_uid()); + ac->ac_gid = from_kgid_munged(file->f_cred->user_ns, current_gid()); #if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* backward-compatible 16 bit fields */ - ac.ac_uid16 = ac.ac_uid; - ac.ac_gid16 = ac.ac_gid; + ac->ac_uid16 = ac->ac_uid; + ac->ac_gid16 = ac->ac_gid; #elif ACCT_VERSION == 3 { struct pid_namespace *ns = acct->ns; - ac.ac_pid = task_tgid_nr_ns(current, ns); + ac->ac_pid = task_tgid_nr_ns(current, ns); rcu_read_lock(); - ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), - ns); + ac->ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); rcu_read_unlock(); } #endif +} + +static void acct_write_process(struct bsd_acct_struct *acct) +{ + struct file *file = acct->file; + const struct cred *cred; + acct_t *ac = &acct->ac; + + /* Perform file operations on behalf of whoever enabled accounting */ + cred = override_creds(file->f_cred); + /* - * Get freeze protection. If the fs is frozen, just skip the write - * as we could deadlock the system otherwise. + * First check to see if there is enough free_space to continue + * the process accounting system. Then get freeze protection. If + * the fs is frozen, just skip the write as we could deadlock + * the system otherwise. */ - if (file_start_write_trylock(file)) { + if (check_free_space(acct) && file_start_write_trylock(file)) { /* it's been opened O_APPEND, so position is irrelevant */ loff_t pos = 0; - __kernel_write(file, &ac, sizeof(acct_t), &pos); + __kernel_write(file, ac, sizeof(acct_t), &pos); file_end_write(file); } -out: + + revert_creds(cred); +} + +static void do_acct_process(struct bsd_acct_struct *acct) +{ + unsigned long flim; + + /* Accounting records are not subject to resource limits. */ + flim = rlimit(RLIMIT_FSIZE); + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + fill_ac(acct); + acct_write_process(acct); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; - revert_creds(orig_cred); } /** -- GitLab From 890ed45bde808c422c3c27d3285fc45affa0f930 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 11 Feb 2025 18:16:00 +0100 Subject: [PATCH 475/989] acct: block access to kernel internal filesystems There's no point in allowing anything kernel internal nor procfs or sysfs. Link: https://lore.kernel.org/r/20250127091811.3183623-1-quzicheng@huawei.com Link: https://lore.kernel.org/r/20250211-work-acct-v1-2-1c16aecab8b3@kernel.org Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reviewed-by: Amir Goldstein Reported-by: Zicheng Qu Cc: stable@vger.kernel.org Signed-off-by: Christian Brauner --- kernel/acct.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/acct.c b/kernel/acct.c index 48283efe8a123..6520baa136693 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -243,6 +243,20 @@ static int acct_on(struct filename *pathname) return -EACCES; } + /* Exclude kernel kernel internal filesystems. */ + if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) { + kfree(acct); + filp_close(file, NULL); + return -EINVAL; + } + + /* Exclude procfs and sysfs. 
*/ + if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) { + kfree(acct); + filp_close(file, NULL); + return -EINVAL; + } + if (!(file->f_mode & FMODE_CAN_WRITE)) { kfree(acct); filp_close(file, NULL); -- GitLab From b3e127dacad60a384c92baafdc74f1508bf7dd47 Mon Sep 17 00:00:00 2001 From: Mark Pearson Date: Tue, 11 Feb 2025 12:36:11 -0500 Subject: [PATCH 476/989] platform/x86: thinkpad_acpi: Fix registration of tpacpi platform driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The recent platform profile changes prevent the tpacpi platform driver from registering. This error is seen in the kernel logs, and the various tpacpi entries are not created: [ 7550.642171] platform thinkpad_acpi: Resources present before probing This happens because devm_platform_profile_register() is called before tpacpi_pdev probes (thanks to Kurt Borja for identifying the root cause). For now revert back to the old platform_profile_register to fix the issue. This is quick fix and will be re-implemented later as more testing is needed for full solution. Tested on X1 Carbon G12. Fixes: 31658c916fa6 ("platform/x86: thinkpad_acpi: Use devm_platform_profile_register()") Signed-off-by: Mark Pearson Reviewed-by: Kurt Borja Link: https://lore.kernel.org/r/20250211173620.16522-1-mpearson-lenovo@squebb.ca Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/thinkpad_acpi.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 99cdea723d32c..72a10ed2017ce 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -10668,8 +10668,8 @@ static int tpacpi_dytc_profile_init(struct ibm_init_struct *iibm) "DYTC version %d: thermal mode available\n", dytc_version); /* Create platform_profile structure and register */ - tpacpi_pprof = devm_platform_profile_register(&tpacpi_pdev->dev, "thinkpad-acpi", - NULL, &dytc_profile_ops); + tpacpi_pprof = platform_profile_register(&tpacpi_pdev->dev, "thinkpad-acpi-profile", + NULL, &dytc_profile_ops); /* * If for some reason platform_profiles aren't enabled * don't quit terminally. @@ -10687,8 +10687,15 @@ static int tpacpi_dytc_profile_init(struct ibm_init_struct *iibm) return 0; } +static void dytc_profile_exit(void) +{ + if (!IS_ERR_OR_NULL(tpacpi_pprof)) + platform_profile_remove(tpacpi_pprof); +} + static struct ibm_struct dytc_profile_driver_data = { .name = "dytc-profile", + .exit = dytc_profile_exit, }; /************************************************************************* -- GitLab From e977499820782ab1c69f354d9f41b6d9ad1f43d9 Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Mon, 10 Feb 2025 15:36:54 +0100 Subject: [PATCH 477/989] drm/xe: Carve out wopcm portion from the stolen memory The top of stolen memory is WOPCM, which shouldn't be accessed. Remove this portion from the stolen memory region for discrete platforms. This was already done for integrated, but was missing for discrete platforms. This also moves get_wopcm_size() so detect_bar2_dgfx() and detect_bar2_integrated can use the same function. 
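A worked example of the carve-out with made-up sizes (illustration only):

  u64 tile_size   = 8ULL * SZ_1G;         /* LMEM spanned by the root tile  */
  u64 stolen_base = tile_size - SZ_256M;  /* DSM base reported by hardware  */
  u64 wopcm_size  = SZ_4M;                /* decoded from STOLEN_RESERVED   */
  u64 stolen_size;

  stolen_size  = tile_size - stolen_base; /* 256 MiB of DSM in total...     */
  stolen_size -= wopcm_size;              /* ...252 MiB left for the driver,
                                             WOPCM stays out of reach       */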
v2: Improve commit message and suitable stable version tag(Lucas) Fixes: d8b52a02cb40 ("drm/xe: Implement stolen memory.") Cc: Maarten Lankhorst Cc: Matthew Auld Cc: Lucas De Marchi Cc: stable@vger.kernel.org # v6.11+ Reviewed-by: Lucas De Marchi Link: https://patchwork.freedesktop.org/patch/msgid/20250210143654.2076747-1-nirmoy.das@intel.com Signed-off-by: Nirmoy Das (cherry picked from commit 2c7f45cc7e197a792ce5c693e56ea48f60b312da) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_ttm_stolen_mgr.c | 54 ++++++++++++++------------ 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.c b/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.c index 423856cc18d40..d414421f8c131 100644 --- a/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.c +++ b/drivers/gpu/drm/xe/xe_ttm_stolen_mgr.c @@ -57,12 +57,35 @@ bool xe_ttm_stolen_cpu_access_needs_ggtt(struct xe_device *xe) return GRAPHICS_VERx100(xe) < 1270 && !IS_DGFX(xe); } +static u32 get_wopcm_size(struct xe_device *xe) +{ + u32 wopcm_size; + u64 val; + + val = xe_mmio_read64_2x32(xe_root_tile_mmio(xe), STOLEN_RESERVED); + val = REG_FIELD_GET64(WOPCM_SIZE_MASK, val); + + switch (val) { + case 0x5 ... 0x6: + val--; + fallthrough; + case 0x0 ... 0x3: + wopcm_size = (1U << val) * SZ_1M; + break; + default: + WARN(1, "Missing case wopcm_size=%llx\n", val); + wopcm_size = 0; + } + + return wopcm_size; +} + static s64 detect_bar2_dgfx(struct xe_device *xe, struct xe_ttm_stolen_mgr *mgr) { struct xe_tile *tile = xe_device_get_root_tile(xe); struct xe_mmio *mmio = xe_root_tile_mmio(xe); struct pci_dev *pdev = to_pci_dev(xe->drm.dev); - u64 stolen_size; + u64 stolen_size, wopcm_size; u64 tile_offset; u64 tile_size; @@ -74,7 +97,13 @@ static s64 detect_bar2_dgfx(struct xe_device *xe, struct xe_ttm_stolen_mgr *mgr) if (drm_WARN_ON(&xe->drm, tile_size < mgr->stolen_base)) return 0; + /* Carve out the top of DSM as it contains the reserved WOPCM region */ + wopcm_size = get_wopcm_size(xe); + if (drm_WARN_ON(&xe->drm, !wopcm_size)) + return 0; + stolen_size = tile_size - mgr->stolen_base; + stolen_size -= wopcm_size; /* Verify usage fits in the actual resource available */ if (mgr->stolen_base + stolen_size <= pci_resource_len(pdev, LMEM_BAR)) @@ -89,29 +118,6 @@ static s64 detect_bar2_dgfx(struct xe_device *xe, struct xe_ttm_stolen_mgr *mgr) return ALIGN_DOWN(stolen_size, SZ_1M); } -static u32 get_wopcm_size(struct xe_device *xe) -{ - u32 wopcm_size; - u64 val; - - val = xe_mmio_read64_2x32(xe_root_tile_mmio(xe), STOLEN_RESERVED); - val = REG_FIELD_GET64(WOPCM_SIZE_MASK, val); - - switch (val) { - case 0x5 ... 0x6: - val--; - fallthrough; - case 0x0 ... 0x3: - wopcm_size = (1U << val) * SZ_1M; - break; - default: - WARN(1, "Missing case wopcm_size=%llx\n", val); - wopcm_size = 0; - } - - return wopcm_size; -} - static u32 detect_bar2_integrated(struct xe_device *xe, struct xe_ttm_stolen_mgr *mgr) { struct pci_dev *pdev = to_pci_dev(xe->drm.dev); -- GitLab From 06521ac0485effdcc9c792cb0b40ed8e6f2f5fb8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 12 Feb 2025 13:33:24 +0000 Subject: [PATCH 478/989] io_uring/waitid: don't abuse io_tw_state struct io_tw_state is managed by core io_uring, and opcode handling code must never try to cheat and create their own instances, it's plain incorrect. io_waitid_complete() attempts exactly that outside of the task work context, and even though the ring is locked, there would be no one to reap the requests from the defer completion list. 
It only works now because luckily it's called before io_uring_try_cancel_uring_cmd(), which flushes completions. Fixes: f31ecf671ddc4 ("io_uring: add IORING_OP_WAITID support") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/waitid.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/waitid.c b/io_uring/waitid.c index 853e97a7b0ecb..c4096d93a2870 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -118,7 +118,6 @@ static int io_waitid_finish(struct io_kiocb *req, int ret) static void io_waitid_complete(struct io_kiocb *req, int ret) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); - struct io_tw_state ts = {}; /* anyone completing better be holding a reference */ WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK)); @@ -131,7 +130,6 @@ static void io_waitid_complete(struct io_kiocb *req, int ret) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - io_req_task_complete(req, &ts); } static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) @@ -153,6 +151,7 @@ static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) list_del_init(&iwa->wo.child_wait.entry); spin_unlock_irq(&iw->head->lock); io_waitid_complete(req, -ECANCELED); + io_req_queue_tw_complete(req, -ECANCELED); return true; } @@ -258,6 +257,7 @@ static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts) } io_waitid_complete(req, ret); + io_req_task_complete(req, ts); } static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, -- GitLab From 8802766324e1f5d414a81ac43365c20142e85603 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 12 Feb 2025 13:46:46 +0000 Subject: [PATCH 479/989] io_uring/kbuf: reallocate buf lists on upgrade IORING_REGISTER_PBUF_RING can reuse an old struct io_buffer_list if it was created for legacy selected buffer and has been emptied. It violates the requirement that most of the field should stay stable after publish. Always reallocate it instead. 
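Condensed, the registration path now reads roughly as follows (paraphrase of the hunk below; io_buffer_get_list() is the lookup used earlier in the same function and is not visible in the hunk):

  bl = io_buffer_get_list(ctx, reg.bgid);
  if (bl) {
          /* mapped ring or legacy buffers still queued: refuse */
          if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list))
                  return -EEXIST;
          /* empty legacy list: unpublish and free it, never recycle it */
          io_destroy_bl(ctx, bl);
  }
  free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
  if (!bl)
          return -ENOMEM;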
Cc: stable@vger.kernel.org Reported-by: Pumpkin Chang Fixes: 2fcabce2d7d34 ("io_uring: disallow mixed provided buffer group registrations") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 04bf493eecae0..8e72de7712ac9 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -415,6 +415,13 @@ void io_destroy_buffers(struct io_ring_ctx *ctx) } } +static void io_destroy_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) +{ + scoped_guard(mutex, &ctx->mmap_lock) + WARN_ON_ONCE(xa_erase(&ctx->io_bl_xa, bl->bgid) != bl); + io_put_bl(ctx, bl); +} + int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); @@ -636,12 +643,13 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) /* if mapped buffer ring OR classic exists, don't allow */ if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list)) return -EEXIST; - } else { - free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) - return -ENOMEM; + io_destroy_bl(ctx, bl); } + free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) + return -ENOMEM; + mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT; ring_size = flex_array_size(br, bufs, reg.ring_entries); -- GitLab From a8de7f100bb5989d9c3627d3a223ee1c863f3b69 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 Jan 2025 16:34:51 -0800 Subject: [PATCH 480/989] KVM: x86: Reject Hyper-V's SEND_IPI hypercalls if local APIC isn't in-kernel Advertise support for Hyper-V's SEND_IPI and SEND_IPI_EX hypercalls if and only if the local API is emulated/virtualized by KVM, and explicitly reject said hypercalls if the local APIC is emulated in userspace, i.e. don't rely on userspace to opt-in to KVM_CAP_HYPERV_ENFORCE_CPUID. Rejecting SEND_IPI and SEND_IPI_EX fixes a NULL-pointer dereference if Hyper-V enlightenments are exposed to the guest without an in-kernel local APIC: dump_stack+0xbe/0xfd __kasan_report.cold+0x34/0x84 kasan_report+0x3a/0x50 __apic_accept_irq+0x3a/0x5c0 kvm_hv_send_ipi.isra.0+0x34e/0x820 kvm_hv_hypercall+0x8d9/0x9d0 kvm_emulate_hypercall+0x506/0x7e0 __vmx_handle_exit+0x283/0xb60 vmx_handle_exit+0x1d/0xd0 vcpu_enter_guest+0x16b0/0x24c0 vcpu_run+0xc0/0x550 kvm_arch_vcpu_ioctl_run+0x170/0x6d0 kvm_vcpu_ioctl+0x413/0xb20 __se_sys_ioctl+0x111/0x160 do_syscal1_64+0x30/0x40 entry_SYSCALL_64_after_hwframe+0x67/0xd1 Note, checking the sending vCPU is sufficient, as the per-VM irqchip_mode can't be modified after vCPUs are created, i.e. if one vCPU has an in-kernel local APIC, then all vCPUs have an in-kernel local APIC. 
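The reason for the NULL dereference, spelled out next to the guard the patch adds (sketch): with a userspace APIC, vcpu->arch.apic is NULL, so any path that reaches __apic_accept_irq() dereferences a NULL pointer. The new check bails out of kvm_hv_send_ipi() before that can happen:

  if (!lapic_in_kernel(vcpu))     /* userspace APIC: vcpu->arch.apic == NULL */
          return HV_STATUS_INVALID_HYPERCALL_INPUT;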
Reported-by: Dongjie Zou Fixes: 214ff83d4473 ("KVM: x86: hyperv: implement PV IPI send hypercalls") Fixes: 2bc39970e932 ("x86/kvm/hyper-v: Introduce KVM_GET_SUPPORTED_HV_CPUID") Cc: stable@vger.kernel.org Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20250118003454.2619573-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/hyperv.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 6a6dd5a84f228..6ebeb6cea6c0d 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2226,6 +2226,9 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) u32 vector; bool all_cpus; + if (!lapic_in_kernel(vcpu)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + if (hc->code == HVCALL_SEND_IPI) { if (!hc->fast) { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi, @@ -2852,7 +2855,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED; ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED; - ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; + if (!vcpu || lapic_in_kernel(vcpu)) + ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; if (evmcs_ver) ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; -- GitLab From 0b6db0dc43eefb4f89181546785c3609fd276524 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 Jan 2025 16:34:52 -0800 Subject: [PATCH 481/989] KVM: selftests: Mark test_hv_cpuid_e2big() static in Hyper-V CPUID test Make the Hyper-V CPUID test's local helper test_hv_cpuid_e2big() static, it's not used outside of the test (and isn't intended to be). Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20250118003454.2619573-3-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86/hyperv_cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/x86/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86/hyperv_cpuid.c index 4f5881d4ef66d..9a0fcc7133503 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86/hyperv_cpuid.c @@ -111,7 +111,7 @@ static void test_hv_cpuid(const struct kvm_cpuid2 *hv_cpuid_entries, } } -void test_hv_cpuid_e2big(struct kvm_vm *vm, struct kvm_vcpu *vcpu) +static void test_hv_cpuid_e2big(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { static struct kvm_cpuid2 cpuid = {.nent = 0}; int ret; -- GitLab From cd5a0c2f0faeb4a3fab3b78f6693a2d55ee51efa Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 Jan 2025 16:34:53 -0800 Subject: [PATCH 482/989] KVM: selftests: Manage CPUID array in Hyper-V CPUID test's core helper Allocate, get, and free the CPUID array in the Hyper-V CPUID test in the test's core helper, instead of copy+pasting code at each call site. In addition to deduplicating a small amount of code, restricting visibility of the array to a single invocation of the core test prevents "leaking" an array across test cases. Passing in @vcpu to the helper will also allow pivoting on VM-scoped information without needing to pass more booleans, e.g. to conditionally assert on features that require an in-kernel APIC. To avoid use-after-free bugs due to overzealous and careless developers, opportunstically add a comment to explain that the system-scoped helper caches the Hyper-V CPUID entries, i.e. that the caller is not responsible for freeing the memory. 
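The ownership contract the helper now hides from its callers, summarized (illustrative usage):

  /* vCPU-scoped query: fresh allocation, the caller must free it.       */
  const struct kvm_cpuid2 *entries = vcpu_get_supported_hv_cpuid(vcpu);
  free((void *)entries);

  /* System-scoped query: one-time cached allocation, must not be freed. */
  entries = kvm_get_supported_hv_cpuid();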
Cc: Vitaly Kuznetsov Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20250118003454.2619573-4-seanjc@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/x86/hyperv_cpuid.c | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/kvm/x86/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86/hyperv_cpuid.c index 9a0fcc7133503..3188749ec6e12 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86/hyperv_cpuid.c @@ -41,13 +41,18 @@ static bool smt_possible(void) return res; } -static void test_hv_cpuid(const struct kvm_cpuid2 *hv_cpuid_entries, - bool evmcs_expected) +static void test_hv_cpuid(struct kvm_vcpu *vcpu, bool evmcs_expected) { + const struct kvm_cpuid2 *hv_cpuid_entries; int i; int nent_expected = 10; u32 test_val; + if (vcpu) + hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vcpu); + else + hv_cpuid_entries = kvm_get_supported_hv_cpuid(); + TEST_ASSERT(hv_cpuid_entries->nent == nent_expected, "KVM_GET_SUPPORTED_HV_CPUID should return %d entries" " (returned %d)", @@ -109,6 +114,13 @@ static void test_hv_cpuid(const struct kvm_cpuid2 *hv_cpuid_entries, * entry->edx); */ } + + /* + * Note, the CPUID array returned by the system-scoped helper is a one- + * time allocation, i.e. must not be freed. + */ + if (vcpu) + free((void *)hv_cpuid_entries); } static void test_hv_cpuid_e2big(struct kvm_vm *vm, struct kvm_vcpu *vcpu) @@ -129,7 +141,6 @@ static void test_hv_cpuid_e2big(struct kvm_vm *vm, struct kvm_vcpu *vcpu) int main(int argc, char *argv[]) { struct kvm_vm *vm; - const struct kvm_cpuid2 *hv_cpuid_entries; struct kvm_vcpu *vcpu; TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_CPUID)); @@ -138,10 +149,7 @@ int main(int argc, char *argv[]) /* Test vCPU ioctl version */ test_hv_cpuid_e2big(vm, vcpu); - - hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vcpu); - test_hv_cpuid(hv_cpuid_entries, false); - free((void *)hv_cpuid_entries); + test_hv_cpuid(vcpu, false); if (!kvm_cpu_has(X86_FEATURE_VMX) || !kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { @@ -149,9 +157,7 @@ int main(int argc, char *argv[]) goto do_sys; } vcpu_enable_evmcs(vcpu); - hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vcpu); - test_hv_cpuid(hv_cpuid_entries, true); - free((void *)hv_cpuid_entries); + test_hv_cpuid(vcpu, true); do_sys: /* Test system ioctl version */ @@ -161,9 +167,7 @@ int main(int argc, char *argv[]) } test_hv_cpuid_e2big(vm, NULL); - - hv_cpuid_entries = kvm_get_supported_hv_cpuid(); - test_hv_cpuid(hv_cpuid_entries, kvm_cpu_has(X86_FEATURE_VMX)); + test_hv_cpuid(NULL, kvm_cpu_has(X86_FEATURE_VMX)); out: kvm_vm_free(vm); -- GitLab From e36454461c5ebe6372952560b2abad5dc9ac579d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 Jan 2025 16:34:54 -0800 Subject: [PATCH 483/989] KVM: selftests: Add CPUID tests for Hyper-V features that need in-kernel APIC Add testcases to x86's Hyper-V CPUID test to verify that KVM advertises support for features that require an in-kernel local APIC appropriately, i.e. that KVM hides support from the vCPU-scoped ioctl if the VM doesn't have an in-kernel local APIC. 
Cc: Vitaly Kuznetsov Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20250118003454.2619573-5-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86/hyperv_cpuid.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/x86/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86/hyperv_cpuid.c index 3188749ec6e12..4e920705681ae 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86/hyperv_cpuid.c @@ -43,6 +43,7 @@ static bool smt_possible(void) static void test_hv_cpuid(struct kvm_vcpu *vcpu, bool evmcs_expected) { + const bool has_irqchip = !vcpu || vcpu->vm->has_irqchip; const struct kvm_cpuid2 *hv_cpuid_entries; int i; int nent_expected = 10; @@ -85,12 +86,19 @@ static void test_hv_cpuid(struct kvm_vcpu *vcpu, bool evmcs_expected) entry->eax, evmcs_expected ); break; + case 0x40000003: + TEST_ASSERT(has_irqchip || !(entry->edx & BIT(19)), + "\"Direct\" Synthetic Timers should require in-kernel APIC"); + break; case 0x40000004: test_val = entry->eax & (1UL << 18); TEST_ASSERT(!!test_val == !smt_possible(), "NoNonArchitecturalCoreSharing bit" " doesn't reflect SMT setting"); + + TEST_ASSERT(has_irqchip || !(entry->eax & BIT(10)), + "Cluster IPI (i.e. SEND_IPI) should require in-kernel APIC"); break; case 0x4000000A: TEST_ASSERT(entry->eax & (1UL << 19), @@ -145,9 +153,14 @@ int main(int argc, char *argv[]) TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_CPUID)); - vm = vm_create_with_one_vcpu(&vcpu, guest_code); + /* Test the vCPU ioctl without an in-kernel local APIC. */ + vm = vm_create_barebones(); + vcpu = __vm_vcpu_add(vm, 0); + test_hv_cpuid(vcpu, false); + kvm_vm_free(vm); /* Test vCPU ioctl version */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code); test_hv_cpuid_e2big(vm, vcpu); test_hv_cpuid(vcpu, false); -- GitLab From 46d6c6f3ef0eaff71c2db6d77d4e2ebb7adac34f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 29 Jan 2025 17:08:25 -0800 Subject: [PATCH 484/989] KVM: nSVM: Enter guest mode before initializing nested NPT MMU When preparing vmcb02 for nested VMRUN (or state restore), "enter" guest mode prior to initializing the MMU for nested NPT so that guest_mode is set in the MMU's role. KVM's model is that all L2 MMUs are tagged with guest_mode, as the behavior of hypervisor MMUs tends to be significantly different than kernel MMUs. Practically speaking, the bug is relatively benign, as KVM only directly queries role.guest_mode in kvm_mmu_free_guest_mode_roots() and kvm_mmu_page_ad_need_write_protect(), which SVM doesn't use, and in paths that are optimizations (mmu_page_zap_pte() and shadow_mmu_try_split_huge_pages()). And while the role is incorprated into shadow page usage, because nested NPT requires KVM to be using NPT for L1, reusing shadow pages across L1 and L2 is impossible as L1 MMUs will always have direct=1, while L2 MMUs will have direct=0. Hoist the TLB processing and setting of HF_GUEST_MASK to the beginning of the flow instead of forcing guest_mode in the MMU, as nothing in nested_vmcb02_prepare_control() between the old and new locations touches TLB flush requests or HF_GUEST_MASK, i.e. there's no reason to present inconsistent vCPU state to the MMU. 
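The ordering constraint, condensed (sketch; the MMU init is reached via nested_svm_init_mmu_context() elsewhere in nested.c, call sites simplified here):

  enter_guest_mode(vcpu);            /* is_guest_mode(vcpu) becomes true    */

  /* ...later, when the nested NPT MMU is set up for vmcb02: */
  nested_svm_init_mmu_context(vcpu); /* role now carries guest_mode = 1, as
                                        asserted in kvm_init_shadow_npt_mmu() */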
Fixes: 69cb877487de ("KVM: nSVM: move MMU setup to nested_prepare_vmcb_control") Cc: stable@vger.kernel.org Reported-by: Yosry Ahmed Reviewed-by: Yosry Ahmed Link: https://lore.kernel.org/r/20250130010825.220346-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/svm/nested.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 74c20dbb92dae..d4ac4a1f8b81b 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5540,7 +5540,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, union kvm_mmu_page_role root_role; /* NPT requires CR0.PG=1. */ - WARN_ON_ONCE(cpu_role.base.direct); + WARN_ON_ONCE(cpu_role.base.direct || !cpu_role.base.guest_mode); root_role = cpu_role.base; root_role.level = kvm_mmu_get_tdp_level(vcpu); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index d77b094d9a4d6..04c375bf1ac2a 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -646,6 +646,11 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, u32 pause_count12; u32 pause_thresh12; + nested_svm_transition_tlb_flush(vcpu); + + /* Enter Guest-Mode */ + enter_guest_mode(vcpu); + /* * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. @@ -762,11 +767,6 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, } } - nested_svm_transition_tlb_flush(vcpu); - - /* Enter Guest-Mode */ - enter_guest_mode(vcpu); - /* * Merge guest and host intercepts - must be called with vcpu in * guest-mode to take effect. -- GitLab From c2fee09fc167c74a64adb08656cb993ea475197e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 24 Jan 2025 17:18:33 -0800 Subject: [PATCH 485/989] KVM: x86: Load DR6 with guest value only before entering .vcpu_run() loop Move the conditional loading of hardware DR6 with the guest's DR6 value out of the core .vcpu_run() loop to fix a bug where KVM can load hardware with a stale vcpu->arch.dr6. When the guest accesses a DR and host userspace isn't debugging the guest, KVM disables DR interception and loads the guest's values into hardware on VM-Enter and saves them on VM-Exit. This allows the guest to access DRs at will, e.g. so that a sequence of DR accesses to configure a breakpoint only generates one VM-Exit. For DR0-DR3, the logic/behavior is identical between VMX and SVM, and also identical between KVM_DEBUGREG_BP_ENABLED (userspace debugging the guest) and KVM_DEBUGREG_WONT_EXIT (guest using DRs), and so KVM handles loading DR0-DR3 in common code, _outside_ of the core kvm_x86_ops.vcpu_run() loop. But for DR6, the guest's value doesn't need to be loaded into hardware for KVM_DEBUGREG_BP_ENABLED, and SVM provides a dedicated VMCB field whereas VMX requires software to manually load the guest value, and so loading the guest's value into DR6 is handled by {svm,vmx}_vcpu_run(), i.e. is done _inside_ the core run loop. Unfortunately, saving the guest values on VM-Exit is initiated by common x86, again outside of the core run loop. If the guest modifies DR6 (in hardware, when DR interception is disabled), and then the next VM-Exit is a fastpath VM-Exit, KVM will reload hardware DR6 with vcpu->arch.dr6 and clobber the guest's actual value. 
The bug shows up primarily with nested VMX because KVM handles the VMX preemption timer in the fastpath, and the window between hardware DR6 being modified (in guest context) and DR6 being read by guest software is orders of magnitude larger in a nested setup. E.g. in non-nested, the VMX preemption timer would need to fire precisely between #DB injection and the #DB handler's read of DR6, whereas with a KVM-on-KVM setup, the window where hardware DR6 is "dirty" extends all the way from L1 writing DR6 to VMRESUME (in L1). L1's view: ========== CPU 0/KVM-7289 [023] d.... 2925.640961: kvm_entry: vcpu 0 A: L1 Writes DR6 CPU 0/KVM-7289 [023] d.... 2925.640963: : Set DRs, DR6 = 0xffff0ff1 B: CPU 0/KVM-7289 [023] d.... 2925.640967: kvm_exit: vcpu 0 reason EXTERNAL_INTERRUPT intr_info 0x800000ec D: L1 reads DR6, arch.dr6 = 0 CPU 0/KVM-7289 [023] d.... 2925.640969: : Sync DRs, DR6 = 0xffff0ff0 CPU 0/KVM-7289 [023] d.... 2925.640976: kvm_entry: vcpu 0 L2 reads DR6, L1 disables DR interception CPU 0/KVM-7289 [023] d.... 2925.640980: kvm_exit: vcpu 0 reason DR_ACCESS info1 0x0000000000000216 CPU 0/KVM-7289 [023] d.... 2925.640983: kvm_entry: vcpu 0 CPU 0/KVM-7289 [023] d.... 2925.640983: : Set DRs, DR6 = 0xffff0ff0 L2 detects failure CPU 0/KVM-7289 [023] d.... 2925.640987: kvm_exit: vcpu 0 reason HLT L1 reads DR6 (confirms failure) CPU 0/KVM-7289 [023] d.... 2925.640990: : Sync DRs, DR6 = 0xffff0ff0 L0's view: ========== L2 reads DR6, arch.dr6 = 0 CPU 23/KVM-5046 [001] d.... 3410.005610: kvm_exit: vcpu 23 reason DR_ACCESS info1 0x0000000000000216 CPU 23/KVM-5046 [001] ..... 3410.005610: kvm_nested_vmexit: vcpu 23 reason DR_ACCESS info1 0x0000000000000216 L2 => L1 nested VM-Exit CPU 23/KVM-5046 [001] ..... 3410.005610: kvm_nested_vmexit_inject: reason: DR_ACCESS ext_inf1: 0x0000000000000216 CPU 23/KVM-5046 [001] d.... 3410.005610: kvm_entry: vcpu 23 CPU 23/KVM-5046 [001] d.... 3410.005611: kvm_exit: vcpu 23 reason VMREAD CPU 23/KVM-5046 [001] d.... 3410.005611: kvm_entry: vcpu 23 CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_exit: vcpu 23 reason VMREAD CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_entry: vcpu 23 L1 writes DR7, L0 disables DR interception CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_exit: vcpu 23 reason DR_ACCESS info1 0x0000000000000007 CPU 23/KVM-5046 [001] d.... 3410.005613: kvm_entry: vcpu 23 L0 writes DR6 = 0 (arch.dr6) CPU 23/KVM-5046 [001] d.... 3410.005613: : Set DRs, DR6 = 0xffff0ff0 A: B: CPU 23/KVM-5046 [001] d.... 3410.005614: kvm_exit: vcpu 23 reason PREEMPTION_TIMER CPU 23/KVM-5046 [001] d.... 3410.005614: kvm_entry: vcpu 23 C: L0 writes DR6 = 0 (arch.dr6) CPU 23/KVM-5046 [001] d.... 3410.005614: : Set DRs, DR6 = 0xffff0ff0 L1 => L2 nested VM-Enter CPU 23/KVM-5046 [001] d.... 
3410.005616: kvm_exit: vcpu 23 reason VMRESUME L0 reads DR6, arch.dr6 = 0 Reported-by: John Stultz Closes: https://lkml.kernel.org/r/CANDhNCq5_F3HfFYABqFGCA1bPd_%2BxgNj-iDQhH4tDk%2Bwi8iZZg%40mail.gmail.com Fixes: 375e28ffc0cf ("KVM: X86: Set host DR6 only on VMX and for KVM_DEBUGREG_WONT_EXIT") Fixes: d67668e9dd76 ("KVM: x86, SVM: isolate vcpu->arch.dr6 from vmcb->save.dr6") Cc: stable@vger.kernel.org Cc: Jim Mattson Tested-by: John Stultz Link: https://lore.kernel.org/r/20250125011833.3644371-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm/svm.c | 13 ++++++------- arch/x86/kvm/vmx/main.c | 1 + arch/x86/kvm/vmx/vmx.c | 10 ++++++---- arch/x86/kvm/vmx/x86_ops.h | 1 + arch/x86/kvm/x86.c | 3 +++ 7 files changed, 19 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index c35550581da0c..823c0434bbad1 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -48,6 +48,7 @@ KVM_X86_OP(set_idt) KVM_X86_OP(get_gdt) KVM_X86_OP(set_gdt) KVM_X86_OP(sync_dirty_debug_regs) +KVM_X86_OP(set_dr6) KVM_X86_OP(set_dr7) KVM_X86_OP(cache_reg) KVM_X86_OP(get_rflags) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b15cde0a9b5ca..0b7af5902ff75 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1696,6 +1696,7 @@ struct kvm_x86_ops { void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu); + void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value); void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 7640a84e554a6..a713c803a3a37 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1991,11 +1991,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->asid = sd->next_asid++; } -static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) +static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) { - struct vmcb *vmcb = svm->vmcb; + struct vmcb *vmcb = to_svm(vcpu)->vmcb; - if (svm->vcpu.arch.guest_state_protected) + if (vcpu->arch.guest_state_protected) return; if (unlikely(value != vmcb->save.dr6)) { @@ -4247,10 +4247,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, * Run with all-zero DR6 unless needed, so that we can get the exact cause * of a #DB. 
*/ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - svm_set_dr6(svm, vcpu->arch.dr6); - else - svm_set_dr6(svm, DR6_ACTIVE_LOW); + if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) + svm_set_dr6(vcpu, DR6_ACTIVE_LOW); clgi(); kvm_load_guest_xsave_state(vcpu); @@ -5043,6 +5041,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_idt = svm_set_idt, .get_gdt = svm_get_gdt, .set_gdt = svm_set_gdt, + .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, .cache_reg = svm_cache_reg, diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 2427f918e7638..43ee9ed11291b 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -61,6 +61,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, .set_gdt = vmx_set_gdt, + .set_dr6 = vmx_set_dr6, .set_dr7 = vmx_set_dr7, .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, .cache_reg = vmx_cache_reg, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f72835e85b6d5..6c56d5235f0f3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5648,6 +5648,12 @@ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) set_debugreg(DR6_RESERVED, 6); } +void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) +{ + lockdep_assert_irqs_disabled(); + set_debugreg(vcpu->arch.dr6, 6); +} + void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) { vmcs_writel(GUEST_DR7, val); @@ -7417,10 +7423,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) vmx->loaded_vmcs->host_state.cr4 = cr4; } - /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - set_debugreg(vcpu->arch.dr6, 6); - /* When single-stepping over STI and MOV SS, we must clear the * corresponding interruptibility bits in the guest state. Otherwise * vmentry fails as it then expects bit 14 (BS) in pending debug diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index ce3295a67c048..430773a5ef8e3 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -73,6 +73,7 @@ void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); +void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val); void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val); void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu); void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8e77e61d4fbd4..02159c967d29e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10961,6 +10961,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); + /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. 
*/ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) + kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6); } else if (unlikely(hw_breakpoint_active())) { set_debugreg(0, 7); } -- GitLab From 34cae91215c6f65bed2a124fb9283da6ec0b8dd9 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 12 Feb 2025 13:45:45 -0700 Subject: [PATCH 486/989] io_uring/uring_cmd: don't assume io_uring_cmd_data layout eaf72f7b414f ("io_uring/uring_cmd: cleanup struct io_uring_cmd_data layout") removed most of the places assuming struct io_uring_cmd_data has sqes as its first field. However, the EAGAIN case in io_uring_cmd() still compares ioucmd->sqe to the struct io_uring_cmd_data pointer using a void * cast. Since fa3595523d72 ("io_uring: get rid of alloc cache init_once handling"), sqes is no longer io_uring_cmd_data's first field. As a result, the pointers will always compare unequal and memcpy() may be called with the same source and destination. Replace the incorrect void * cast with the address of the sqes field. Signed-off-by: Caleb Sander Mateos Fixes: eaf72f7b414f ("io_uring/uring_cmd: cleanup struct io_uring_cmd_data layout") Link: https://lore.kernel.org/r/20250212204546.3751645-2-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 1f6a82128b475..cfb22e1de0e75 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -252,7 +252,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) if (ret == -EAGAIN) { struct io_uring_cmd_data *cache = req->async_data; - if (ioucmd->sqe != (void *) cache) + if (ioucmd->sqe != cache->sqes) memcpy(cache->sqes, ioucmd->sqe, uring_sqe_size(req->ctx)); return -EAGAIN; } else if (ret == -EIOCBQUEUED) { -- GitLab From e663da62ba8672aaa66843f1af8b20e3bb1a0515 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 12 Feb 2025 13:45:46 -0700 Subject: [PATCH 487/989] io_uring/uring_cmd: switch sqe to async_data on EAGAIN 5eff57fa9f3a ("io_uring/uring_cmd: defer SQE copying until it's needed") moved the unconditional memcpy() of the uring_cmd SQE to async_data to 2 cases when the request goes async: - If REQ_F_FORCE_ASYNC is set to force the initial issue to go async - If ->uring_cmd() returns -EAGAIN in the initial non-blocking issue Unlike the REQ_F_FORCE_ASYNC case, in the EAGAIN case, io_uring_cmd() copies the SQE to async_data but neglects to update the io_uring_cmd's sqe field to point to async_data. As a result, sqe still points to the slot in the userspace-mapped SQ. At the end of io_submit_sqes(), the kernel advances the SQ head index, allowing userspace to reuse the slot for a new SQE. If userspace reuses the slot before the io_uring worker reissues the original SQE, the io_uring_cmd's SQE will be corrupted. Introduce a helper io_uring_cmd_cache_sqes() to copy the original SQE to the io_uring_cmd's async_data and point sqe there. Use it for both the REQ_F_FORCE_ASYNC and EAGAIN cases. This ensures the uring_cmd doesn't read from the SQ slot after it has been returned to userspace. 
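For illustration only (a simplified standalone sketch, not the io_uring code itself; the struct and function names here are invented), the hazard is the general one of a request holding a pointer into a shared ring slot that the producer may overwrite once the head index has advanced, so an async retry must copy the entry into per-request storage and point at that copy before returning:

  struct entry { unsigned long data[8]; };

  struct request {
      const struct entry *src;   /* may point into the shared ring slot */
      struct entry stable;       /* per-request copy that stays valid */
  };

  static void prepare_async_retry(struct request *req)
  {
      /* copy while the slot still holds the original entry ... */
      req->stable = *req->src;
      /* ... and repoint, so a later reissue reads the copy rather than
       * a slot that userspace may have reused in the meantime */
      req->src = &req->stable;
  }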
Signed-off-by: Caleb Sander Mateos Fixes: 5eff57fa9f3a ("io_uring/uring_cmd: defer SQE copying until it's needed") Link: https://lore.kernel.org/r/20250212204546.3751645-3-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index cfb22e1de0e75..bcfca18395c4c 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -168,6 +168,15 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2, } EXPORT_SYMBOL_GPL(io_uring_cmd_done); +static void io_uring_cmd_cache_sqes(struct io_kiocb *req) +{ + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); + struct io_uring_cmd_data *cache = req->async_data; + + memcpy(cache->sqes, ioucmd->sqe, uring_sqe_size(req->ctx)); + ioucmd->sqe = cache->sqes; +} + static int io_uring_cmd_prep_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -179,14 +188,10 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, return -ENOMEM; cache->op_data = NULL; - if (!(req->flags & REQ_F_FORCE_ASYNC)) { - /* defer memcpy until we need it */ - ioucmd->sqe = sqe; - return 0; - } - - memcpy(cache->sqes, sqe, uring_sqe_size(req->ctx)); - ioucmd->sqe = cache->sqes; + ioucmd->sqe = sqe; + /* defer memcpy until we need it */ + if (unlikely(req->flags & REQ_F_FORCE_ASYNC)) + io_uring_cmd_cache_sqes(req); return 0; } @@ -253,7 +258,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) struct io_uring_cmd_data *cache = req->async_data; if (ioucmd->sqe != cache->sqes) - memcpy(cache->sqes, ioucmd->sqe, uring_sqe_size(req->ctx)); + io_uring_cmd_cache_sqes(req); return -EAGAIN; } else if (ret == -EIOCBQUEUED) { return -EIOCBQUEUED; -- GitLab From 472ff48e2c09e49f2f90eeb6922f747306559506 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 12 Feb 2025 11:53:32 -0700 Subject: [PATCH 488/989] PCI: Fix BUILD_BUG_ON usage for old gcc As reported in the below link, it seems older versions of gcc cannot determine that the howmany variable is known for all callers. Include a test so that newer compilers can enforce this sanity check and older compilers can still work. Add __always_inline attribute to give the compiler an even better chance to know the inputs. Link: https://lore.kernel.org/r/20250212185337.293023-1-alex.williamson@redhat.com Fixes: 4453f360862e ("PCI: Batch BAR sizing operations") Reported-by: Oleg Nesterov Link: https://lore.kernel.org/all/20250209154512.GA18688@redhat.com Signed-off-by: Alex Williamson Signed-off-by: Bjorn Helgaas Tested-by: Oleg Nesterov Tested-by: Mitchell Augustin --- drivers/pci/probe.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index b6536ed599c37..246744d8d268a 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -339,13 +339,14 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, return (res->flags & IORESOURCE_MEM_64) ? 
1 : 0; } -static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom) +static __always_inline void pci_read_bases(struct pci_dev *dev, + unsigned int howmany, int rom) { u32 rombar, stdbars[PCI_STD_NUM_BARS]; unsigned int pos, reg; u16 orig_cmd; - BUILD_BUG_ON(howmany > PCI_STD_NUM_BARS); + BUILD_BUG_ON(statically_true(howmany > PCI_STD_NUM_BARS)); if (dev->non_compliant_bars) return; -- GitLab From cee6f9a9c87b6ecfb51845950c28216b231c3610 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sun, 12 Jan 2025 15:39:51 +0100 Subject: [PATCH 489/989] objtool/rust: add one more `noreturn` Rust function Starting with Rust 1.85.0 (currently in beta, to be released 2025-02-20), under some kernel configurations with `CONFIG_RUST_DEBUG_ASSERTIONS=y`, one may trigger a new `objtool` warning: rust/kernel.o: warning: objtool: _R...securityNtB2_11SecurityCtx8as_bytes() falls through to next function _R...core3ops4drop4Drop4drop() due to a call to the `noreturn` symbol: core::panicking::assert_failed:: Thus add it to the list so that `objtool` knows it is actually `noreturn`. Do so matching with `strstr` since it is a generic. See commit 56d680dd23c3 ("objtool/rust: list `noreturn` Rust functions") for more details. Cc: stable@vger.kernel.org # Needed in 6.12.y and 6.13.y only (Rust is pinned in older LTSs). Fixes: 56d680dd23c3 ("objtool/rust: list `noreturn` Rust functions") Reviewed-by: Gary Guo Link: https://lore.kernel.org/r/20250112143951.751139-1-ojeda@kernel.org [ Updated Cc: stable@ to include 6.13.y. - Miguel ] Signed-off-by: Miguel Ojeda --- tools/objtool/check.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 753dbc4f81985..a027d1c0bb2b0 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -227,6 +227,7 @@ static bool is_rust_noreturn(const struct symbol *func) str_ends_with(func->name, "_4core9panicking18panic_bounds_check") || str_ends_with(func->name, "_4core9panicking19assert_failed_inner") || str_ends_with(func->name, "_4core9panicking36panic_misaligned_pointer_dereference") || + strstr(func->name, "_4core9panicking13assert_failed") || strstr(func->name, "_4core9panicking11panic_const24panic_const_") || (strstr(func->name, "_4core5slice5index24slice_") && str_ends_with(func->name, "_fail")); -- GitLab From 2e4f982cf392af2f1282b5537a72144e064799e3 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 7 Feb 2025 00:20:22 +0100 Subject: [PATCH 490/989] rust: rbtree: fix overindented list item Starting with Rust 1.86.0 (to be released 2025-04-03), Clippy will have a new lint, `doc_overindented_list_items` [1], which catches cases of overindented list items. The lint has been added by Yutaro Ohno, based on feedback from the kernel [2] on a patch that fixed a similar case -- commit 0c5928deada1 ("rust: block: fix formatting in GenDisk doc"). Clippy reports a few cases in the kernel, apart from the one already fixed in the commit above. One is this one: error: doc list item overindented --> rust/kernel/rbtree.rs:1152:5 | 1152 | /// null, it is a pointer to the root of the [`RBTree`]. | ^^^^ help: try using ` ` (2 spaces) | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#doc_overindented_list_items = note: `-D clippy::doc-overindented-list-items` implied by `-D warnings` = help: to override `-D warnings` add `#[allow(clippy::doc_overindented_list_items)]` Thus clean it up. 
Cc: Yutaro Ohno Cc: stable@vger.kernel.org # Needed in 6.12.y and 6.13.y only (Rust is pinned in older LTSs). Fixes: a335e9591404 ("rust: rbtree: add `RBTree::entry`") Link: https://github.com/rust-lang/rust-clippy/pull/13711 [1] Link: https://github.com/rust-lang/rust-clippy/issues/13601 [2] Reviewed-by: Alice Ryhl Reviewed-by: Yutaro Ohno Link: https://lore.kernel.org/r/20250206232022.599998-1-ojeda@kernel.org [ There are a few other cases, so updated message. - Miguel ] Signed-off-by: Miguel Ojeda --- rust/kernel/rbtree.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/rbtree.rs b/rust/kernel/rbtree.rs index ee2731dad72d9..0d1e75810664e 100644 --- a/rust/kernel/rbtree.rs +++ b/rust/kernel/rbtree.rs @@ -1149,7 +1149,7 @@ pub struct VacantEntry<'a, K, V> { /// # Invariants /// - `parent` may be null if the new node becomes the root. /// - `child_field_of_parent` is a valid pointer to the left-child or right-child of `parent`. If `parent` is -/// null, it is a pointer to the root of the [`RBTree`]. +/// null, it is a pointer to the root of the [`RBTree`]. struct RawVacantEntry<'a, K, V> { rbtree: *mut RBTree, /// The node that will become the parent of the new node if we insert one. -- GitLab From 0edf1283a9d1419a2095b4fcdd95c11ac00a191c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 12 Feb 2025 14:05:11 -0700 Subject: [PATCH 491/989] io_uring/uring_cmd: remove dead req_has_async_data() check Any uring_cmd always has async data allocated now, there's no reason to check and clear a cached copy of the SQE. Fixes: d10f19dff56e ("io_uring/uring_cmd: switch to always allocating async data") Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index bcfca18395c4c..8af7780407b7e 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -54,9 +54,6 @@ bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, continue; if (cmd->flags & IORING_URING_CMD_CANCELABLE) { - /* ->sqe isn't available if no async data */ - if (!req_has_async_data(req)) - cmd->sqe = NULL; file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL | IO_URING_F_COMPLETE_DEFER); ret = true; -- GitLab From 5298b7cffa8461009a4410f4e23f1c50ade39182 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 4 Dec 2024 08:48:27 +0100 Subject: [PATCH 492/989] um: add back support for FXSAVE registers It was reported that qemu may not enable the XSTATE CPU extension, which is a requirement after commit 3f17fed21491 ("um: switch to regset API and depend on XSTATE"). Add a fallback to use FXSAVE (FP registers on x86_64 and XFP on i386) which is just a shorter version of the same data. The only difference is that the XSTATE magic should not be set in the signal frame. Note that this still drops support for the older i386 FP register layout as supporting this would require more backward compatibility to build a correct signal frame. 
Fixes: 3f17fed21491 ("um: switch to regset API and depend on XSTATE") Reported-by: SeongJae Park Closes: https://lore.kernel.org/r/20241203070218.240797-1-sj@kernel.org Tested-by: SeongJae Park Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20241204074827.1582917-1-benjamin@sipsolutions.net Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/x86/um/os-Linux/registers.c | 21 ++++++++++++++++++--- arch/x86/um/signal.c | 5 +++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/arch/x86/um/os-Linux/registers.c b/arch/x86/um/os-Linux/registers.c index 76eaeb93928cc..eb1cdadc8a61d 100644 --- a/arch/x86/um/os-Linux/registers.c +++ b/arch/x86/um/os-Linux/registers.c @@ -18,6 +18,7 @@ #include #include +static unsigned long ptrace_regset; unsigned long host_fp_size; int get_fp_registers(int pid, unsigned long *regs) @@ -27,7 +28,7 @@ int get_fp_registers(int pid, unsigned long *regs) .iov_len = host_fp_size, }; - if (ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov) < 0) + if (ptrace(PTRACE_GETREGSET, pid, ptrace_regset, &iov) < 0) return -errno; return 0; } @@ -39,7 +40,7 @@ int put_fp_registers(int pid, unsigned long *regs) .iov_len = host_fp_size, }; - if (ptrace(PTRACE_SETREGSET, pid, NT_X86_XSTATE, &iov) < 0) + if (ptrace(PTRACE_SETREGSET, pid, ptrace_regset, &iov) < 0) return -errno; return 0; } @@ -58,9 +59,23 @@ int arch_init_registers(int pid) return -ENOMEM; /* GDB has x86_xsave_length, which uses x86_cpuid_count */ - ret = ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov); + ptrace_regset = NT_X86_XSTATE; + ret = ptrace(PTRACE_GETREGSET, pid, ptrace_regset, &iov); if (ret) ret = -errno; + + if (ret == -ENODEV) { +#ifdef CONFIG_X86_32 + ptrace_regset = NT_PRXFPREG; +#else + ptrace_regset = NT_PRFPREG; +#endif + iov.iov_len = 2 * 1024 * 1024; + ret = ptrace(PTRACE_GETREGSET, pid, ptrace_regset, &iov); + if (ret) + ret = -errno; + } + munmap(iov.iov_base, 2 * 1024 * 1024); host_fp_size = iov.iov_len; diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 75087e85b6fdb..ea5b3bcc42456 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -187,7 +187,12 @@ static int copy_sc_to_user(struct sigcontext __user *to, * Put magic/size values for userspace. We do not bother to verify them * later on, however, userspace needs them should it try to read the * XSTATE data. And ptrace does not fill in these parts. + * + * Skip this if we do not have an XSTATE frame. */ + if (host_fp_size <= sizeof(to_fp64->fpstate)) + return 0; + BUILD_BUG_ON(sizeof(int) != FP_XSTATE_MAGIC2_SIZE); #ifdef CONFIG_X86_32 __put_user(offsetof(struct _fpstate_32, _fxsr_env) + -- GitLab From 8891b176d350ec5ea9a39c6ef4c99bd63d68e64c Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Tue, 17 Dec 2024 21:27:44 +0100 Subject: [PATCH 493/989] um: avoid copying FP state from init_task The init_task instance of struct task_struct is statically allocated and does not contain the dynamic area for the userspace FP registers. As such, limit the copy to the valid area of init_task and fill the rest with zero. Note that the FP state is only needed for userspace, and as such it is entirely reasonable for init_task to not contain it. 
Reported-by: Brian Norris Closes: https://lore.kernel.org/Z1ySXmjZm-xOqk90@google.com Fixes: 3f17fed21491 ("um: switch to regset API and depend on XSTATE") Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20241217202745.1402932-3-benjamin@sipsolutions.net Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/kernel/process.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index e5a2d4d897e0c..0cd6fad3d908d 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -191,7 +191,15 @@ void initial_thread_cb(void (*proc)(void *), void *arg) int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - memcpy(dst, src, arch_task_struct_size); + /* init_task is not dynamically sized (missing FPU state) */ + if (unlikely(src == &init_task)) { + memcpy(dst, src, sizeof(init_task)); + memset((void *)dst + sizeof(init_task), 0, + arch_task_struct_size - sizeof(init_task)); + } else { + memcpy(dst, src, arch_task_struct_size); + } + return 0; } -- GitLab From 3c2fc7434d90338cf4c1b37bc95994208d23bfc6 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Tue, 7 Jan 2025 14:35:09 +0100 Subject: [PATCH 494/989] um: properly align signal stack on x86_64 The stack needs to be properly aligned so 16 byte memory accesses on the stack are correct. This was broken when introducing the dynamic math register sizing as the rounding was not moved appropriately. Fixes: 3f17fed21491 ("um: switch to regset API and depend on XSTATE") Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20250107133509.265576-1-benjamin@sipsolutions.net Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/x86/um/signal.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index ea5b3bcc42456..2934e170b0fe0 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -372,11 +372,13 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig, int err = 0, sig = ksig->sig; unsigned long fp_to; - frame = (struct rt_sigframe __user *) - round_down(stack_top - sizeof(struct rt_sigframe), 16); + frame = (void __user *)stack_top - sizeof(struct rt_sigframe); /* Add required space for math frame */ - frame = (struct rt_sigframe __user *)((unsigned long)frame - math_size); + frame = (void __user *)((unsigned long)frame - math_size); + + /* ABI requires 16 byte boundary alignment */ + frame = (void __user *)round_down((unsigned long)frame, 16); /* Subtract 128 for a red zone and 8 for proper alignment */ frame = (struct rt_sigframe __user *) ((unsigned long) frame - 128 - 8); -- GitLab From f82a9e7b9fa922bb9cccb00aae684a27b79e6df7 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Mon, 13 Jan 2025 10:41:07 +0100 Subject: [PATCH 495/989] um: fix execve stub execution on old host OSs The stub execution uses the somewhat new close_range and execveat syscalls. Of these two, the execveat call is essential, but the close_range call is more about stub process hygiene rather than safety (and its result is ignored). Replace both calls with a raw syscall as older machines might not have a recent enough kernel for close_range (with CLOSE_RANGE_CLOEXEC) or a libc that does not yet expose both of the syscalls. 
Fixes: 32e8eaf263d9 ("um: use execveat to create userspace MMs") Reported-by: Glenn Washburn Closes: https://lore.kernel.org/20250108022404.05e0de1e@crass-HP-ZBook-15-G2 Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20250113094107.674738-1-benjamin@sipsolutions.net Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/os-Linux/skas/process.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index f683cfc9e51a5..e2f8f156402f5 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -181,6 +181,10 @@ extern char __syscall_stub_start[]; static int stub_exe_fd; +#ifndef CLOSE_RANGE_CLOEXEC +#define CLOSE_RANGE_CLOEXEC (1U << 2) +#endif + static int userspace_tramp(void *stack) { char *const argv[] = { "uml-userspace", NULL }; @@ -202,8 +206,12 @@ static int userspace_tramp(void *stack) init_data.stub_data_fd = phys_mapping(uml_to_phys(stack), &offset); init_data.stub_data_offset = MMAP_OFFSET(offset); - /* Set CLOEXEC on all FDs and then unset on all memory related FDs */ - close_range(0, ~0U, CLOSE_RANGE_CLOEXEC); + /* + * Avoid leaking unneeded FDs to the stub by setting CLOEXEC on all FDs + * and then unsetting it on all memory related FDs. + * This is not strictly necessary from a safety perspective. + */ + syscall(__NR_close_range, 0, ~0U, CLOSE_RANGE_CLOEXEC); fcntl(init_data.stub_data_fd, F_SETFD, 0); for (iomem = iomem_regions; iomem; iomem = iomem->next) @@ -224,7 +232,9 @@ static int userspace_tramp(void *stack) if (ret != sizeof(init_data)) exit(4); - execveat(stub_exe_fd, "", argv, NULL, AT_EMPTY_PATH); + /* Raw execveat for compatibility with older libc versions */ + syscall(__NR_execveat, stub_exe_fd, (unsigned long)"", + (unsigned long)argv, NULL, AT_EMPTY_PATH); exit(5); } -- GitLab From 5b166b782d327f4b66190cc43afd3be36f2b3b7a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 10 Jan 2025 13:54:04 +0100 Subject: [PATCH 496/989] um: virt-pci: don't use kmalloc() This code can be called deep in the IRQ handling, for example, and then cannot normally use kmalloc(). Have its own pre-allocated memory and use from there instead so this doesn't occur. Only in the (very rare) case of memcpy_toio() we'd still need to allocate memory. 
Link: https://patch.msgid.link/20250110125550.32479-6-johannes@sipsolutions.net Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/drivers/virt-pci.c | 198 +++++++++++++++++++------------------ 1 file changed, 102 insertions(+), 96 deletions(-) diff --git a/arch/um/drivers/virt-pci.c b/arch/um/drivers/virt-pci.c index 744e7f31e8ef1..dd5580f975cc0 100644 --- a/arch/um/drivers/virt-pci.c +++ b/arch/um/drivers/virt-pci.c @@ -25,8 +25,10 @@ #define MAX_IRQ_MSG_SIZE (sizeof(struct virtio_pcidev_msg) + sizeof(u32)) #define NUM_IRQ_MSGS 10 -#define HANDLE_NO_FREE(ptr) ((void *)((unsigned long)(ptr) | 1)) -#define HANDLE_IS_NO_FREE(ptr) ((unsigned long)(ptr) & 1) +struct um_pci_message_buffer { + struct virtio_pcidev_msg hdr; + u8 data[8]; +}; struct um_pci_device { struct virtio_device *vdev; @@ -36,6 +38,11 @@ struct um_pci_device { struct virtqueue *cmd_vq, *irq_vq; +#define UM_PCI_WRITE_BUFS 20 + struct um_pci_message_buffer bufs[UM_PCI_WRITE_BUFS + 1]; + void *extra_ptrs[UM_PCI_WRITE_BUFS + 1]; + DECLARE_BITMAP(used_bufs, UM_PCI_WRITE_BUFS); + #define UM_PCI_STAT_WAITING 0 unsigned long status; @@ -61,12 +68,40 @@ static unsigned long um_pci_msi_used[BITS_TO_LONGS(MAX_MSI_VECTORS)]; static unsigned int um_pci_max_delay_us = 40000; module_param_named(max_delay_us, um_pci_max_delay_us, uint, 0644); -struct um_pci_message_buffer { - struct virtio_pcidev_msg hdr; - u8 data[8]; -}; +static int um_pci_get_buf(struct um_pci_device *dev, bool *posted) +{ + int i; + + for (i = 0; i < UM_PCI_WRITE_BUFS; i++) { + if (!test_and_set_bit(i, dev->used_bufs)) + return i; + } -static struct um_pci_message_buffer __percpu *um_pci_msg_bufs; + *posted = false; + return UM_PCI_WRITE_BUFS; +} + +static void um_pci_free_buf(struct um_pci_device *dev, void *buf) +{ + int i; + + if (buf == &dev->bufs[UM_PCI_WRITE_BUFS]) { + kfree(dev->extra_ptrs[UM_PCI_WRITE_BUFS]); + dev->extra_ptrs[UM_PCI_WRITE_BUFS] = NULL; + return; + } + + for (i = 0; i < UM_PCI_WRITE_BUFS; i++) { + if (buf == &dev->bufs[i]) { + kfree(dev->extra_ptrs[i]); + dev->extra_ptrs[i] = NULL; + WARN_ON(!test_and_clear_bit(i, dev->used_bufs)); + return; + } + } + + WARN_ON(1); +} static int um_pci_send_cmd(struct um_pci_device *dev, struct virtio_pcidev_msg *cmd, @@ -82,7 +117,9 @@ static int um_pci_send_cmd(struct um_pci_device *dev, }; struct um_pci_message_buffer *buf; int delay_count = 0; + bool bounce_out; int ret, len; + int buf_idx; bool posted; if (WARN_ON(cmd_size < sizeof(*cmd) || cmd_size > sizeof(*buf))) @@ -101,26 +138,28 @@ static int um_pci_send_cmd(struct um_pci_device *dev, break; } - buf = get_cpu_var(um_pci_msg_bufs); - if (buf) - memcpy(buf, cmd, cmd_size); + bounce_out = !posted && cmd_size <= sizeof(*cmd) && + out && out_size <= sizeof(buf->data); - if (posted) { - u8 *ncmd = kmalloc(cmd_size + extra_size, GFP_ATOMIC); - - if (ncmd) { - memcpy(ncmd, cmd, cmd_size); - if (extra) - memcpy(ncmd + cmd_size, extra, extra_size); - cmd = (void *)ncmd; - cmd_size += extra_size; - extra = NULL; - extra_size = 0; - } else { - /* try without allocating memory */ - posted = false; - cmd = (void *)buf; + buf_idx = um_pci_get_buf(dev, &posted); + buf = &dev->bufs[buf_idx]; + memcpy(buf, cmd, cmd_size); + + if (posted && extra && extra_size > sizeof(buf) - cmd_size) { + dev->extra_ptrs[buf_idx] = kmemdup(extra, extra_size, + GFP_ATOMIC); + + if (!dev->extra_ptrs[buf_idx]) { + um_pci_free_buf(dev, buf); + return -ENOMEM; } + extra = dev->extra_ptrs[buf_idx]; + } else if (extra && extra_size <= sizeof(buf) - cmd_size) { + 
memcpy((u8 *)buf + cmd_size, extra, extra_size); + cmd_size += extra_size; + extra_size = 0; + extra = NULL; + cmd = (void *)buf; } else { cmd = (void *)buf; } @@ -128,39 +167,40 @@ static int um_pci_send_cmd(struct um_pci_device *dev, sg_init_one(&out_sg, cmd, cmd_size); if (extra) sg_init_one(&extra_sg, extra, extra_size); - if (out) + /* allow stack for small buffers */ + if (bounce_out) + sg_init_one(&in_sg, buf->data, out_size); + else if (out) sg_init_one(&in_sg, out, out_size); /* add to internal virtio queue */ ret = virtqueue_add_sgs(dev->cmd_vq, sgs_list, extra ? 2 : 1, out ? 1 : 0, - posted ? cmd : HANDLE_NO_FREE(cmd), - GFP_ATOMIC); + cmd, GFP_ATOMIC); if (ret) { - if (posted) - kfree(cmd); - goto out; + um_pci_free_buf(dev, buf); + return ret; } if (posted) { virtqueue_kick(dev->cmd_vq); - ret = 0; - goto out; + return 0; } /* kick and poll for getting a response on the queue */ set_bit(UM_PCI_STAT_WAITING, &dev->status); virtqueue_kick(dev->cmd_vq); + ret = 0; while (1) { void *completed = virtqueue_get_buf(dev->cmd_vq, &len); - if (completed == HANDLE_NO_FREE(cmd)) + if (completed == buf) break; - if (completed && !HANDLE_IS_NO_FREE(completed)) - kfree(completed); + if (completed) + um_pci_free_buf(dev, completed); if (WARN_ONCE(virtqueue_is_broken(dev->cmd_vq) || ++delay_count > um_pci_max_delay_us, @@ -172,8 +212,11 @@ static int um_pci_send_cmd(struct um_pci_device *dev, } clear_bit(UM_PCI_STAT_WAITING, &dev->status); -out: - put_cpu_var(um_pci_msg_bufs); + if (bounce_out) + memcpy(out, buf->data, out_size); + + um_pci_free_buf(dev, buf); + return ret; } @@ -187,20 +230,13 @@ static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset, .size = size, .addr = offset, }; - /* buf->data is maximum size - we may only use parts of it */ - struct um_pci_message_buffer *buf; - u8 *data; - unsigned long ret = ULONG_MAX; - size_t bytes = sizeof(buf->data); + /* max 8, we might not use it all */ + u8 data[8]; if (!dev) return ULONG_MAX; - buf = get_cpu_var(um_pci_msg_bufs); - data = buf->data; - - if (buf) - memset(data, 0xff, bytes); + memset(data, 0xff, sizeof(data)); switch (size) { case 1: @@ -212,34 +248,26 @@ static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset, break; default: WARN(1, "invalid config space read size %d\n", size); - goto out; + return ULONG_MAX; } - if (um_pci_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, data, bytes)) - goto out; + if (um_pci_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, data, size)) + return ULONG_MAX; switch (size) { case 1: - ret = data[0]; - break; + return data[0]; case 2: - ret = le16_to_cpup((void *)data); - break; + return le16_to_cpup((void *)data); case 4: - ret = le32_to_cpup((void *)data); - break; + return le32_to_cpup((void *)data); #ifdef CONFIG_64BIT case 8: - ret = le64_to_cpup((void *)data); - break; + return le64_to_cpup((void *)data); #endif default: - break; + return ULONG_MAX; } - -out: - put_cpu_var(um_pci_msg_bufs); - return ret; } static void um_pci_cfgspace_write(void *priv, unsigned int offset, int size, @@ -312,13 +340,8 @@ static void um_pci_bar_copy_from(void *priv, void *buffer, static unsigned long um_pci_bar_read(void *priv, unsigned int offset, int size) { - /* buf->data is maximum size - we may only use parts of it */ - struct um_pci_message_buffer *buf; - u8 *data; - unsigned long ret = ULONG_MAX; - - buf = get_cpu_var(um_pci_msg_bufs); - data = buf->data; + /* 8 is maximum size - we may only use parts of it */ + u8 data[8]; switch (size) { case 1: @@ -330,33 +353,25 @@ static 
unsigned long um_pci_bar_read(void *priv, unsigned int offset, break; default: WARN(1, "invalid config space read size %d\n", size); - goto out; + return ULONG_MAX; } um_pci_bar_copy_from(priv, data, offset, size); switch (size) { case 1: - ret = data[0]; - break; + return data[0]; case 2: - ret = le16_to_cpup((void *)data); - break; + return le16_to_cpup((void *)data); case 4: - ret = le32_to_cpup((void *)data); - break; + return le32_to_cpup((void *)data); #ifdef CONFIG_64BIT case 8: - ret = le64_to_cpup((void *)data); - break; + return le64_to_cpup((void *)data); #endif default: - break; + return ULONG_MAX; } - -out: - put_cpu_var(um_pci_msg_bufs); - return ret; } static void um_pci_bar_copy_to(void *priv, unsigned int offset, @@ -523,11 +538,8 @@ static void um_pci_cmd_vq_cb(struct virtqueue *vq) if (test_bit(UM_PCI_STAT_WAITING, &dev->status)) return; - while ((cmd = virtqueue_get_buf(vq, &len))) { - if (WARN_ON(HANDLE_IS_NO_FREE(cmd))) - continue; - kfree(cmd); - } + while ((cmd = virtqueue_get_buf(vq, &len))) + um_pci_free_buf(dev, cmd); } static void um_pci_irq_vq_cb(struct virtqueue *vq) @@ -1006,10 +1018,6 @@ static int __init um_pci_init(void) "No virtio device ID configured for PCI - no PCI support\n")) return 0; - um_pci_msg_bufs = alloc_percpu(struct um_pci_message_buffer); - if (!um_pci_msg_bufs) - return -ENOMEM; - bridge = pci_alloc_host_bridge(0); if (!bridge) { err = -ENOMEM; @@ -1070,7 +1078,6 @@ static int __init um_pci_init(void) pci_free_resource_list(&bridge->windows); pci_free_host_bridge(bridge); } - free_percpu(um_pci_msg_bufs); return err; } module_init(um_pci_init); @@ -1082,6 +1089,5 @@ static void __exit um_pci_exit(void) irq_domain_remove(um_pci_inner_domain); pci_free_resource_list(&bridge->windows); pci_free_host_bridge(bridge); - free_percpu(um_pci_msg_bufs); } module_exit(um_pci_exit); -- GitLab From daa1a05ba431540097ec925d4e01d53ef29a98f1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 10 Jan 2025 13:54:05 +0100 Subject: [PATCH 497/989] um: virtio_uml: use raw spinlock This is needed because at least in time-travel the code can be called directly from the deep architecture and IRQ handling code. 
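A side note for readers (not part of the change): the usual reason such code must use a raw spinlock is that, with PREEMPT_RT, a plain spinlock_t can sleep, which is not allowed in paths reached with interrupts disabled from low-level arch or IRQ handling code; raw_spinlock_t always busy-waits with interrupts off. The conversion below follows the standard pattern, roughly:

  static DEFINE_RAW_SPINLOCK(example_lock);

  static void example_critical_section(void)
  {
      unsigned long flags;

      raw_spin_lock_irqsave(&example_lock, flags);
      /* work that must never sleep */
      raw_spin_unlock_irqrestore(&example_lock, flags);
  }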
Link: https://patch.msgid.link/20250110125550.32479-7-johannes@sipsolutions.net Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/drivers/virtio_uml.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index 65df43fa9be58..ad8d78fb1d9aa 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -52,7 +52,7 @@ struct virtio_uml_device { struct platform_device *pdev; struct virtio_uml_platform_data *pdata; - spinlock_t sock_lock; + raw_spinlock_t sock_lock; int sock, req_fd, irq; u64 features; u64 protocol_features; @@ -246,7 +246,7 @@ static int vhost_user_send(struct virtio_uml_device *vu_dev, if (request_ack) msg->header.flags |= VHOST_USER_FLAG_NEED_REPLY; - spin_lock_irqsave(&vu_dev->sock_lock, flags); + raw_spin_lock_irqsave(&vu_dev->sock_lock, flags); rc = full_sendmsg_fds(vu_dev->sock, msg, size, fds, num_fds); if (rc < 0) goto out; @@ -266,7 +266,7 @@ static int vhost_user_send(struct virtio_uml_device *vu_dev, } out: - spin_unlock_irqrestore(&vu_dev->sock_lock, flags); + raw_spin_unlock_irqrestore(&vu_dev->sock_lock, flags); return rc; } @@ -1239,7 +1239,7 @@ static int virtio_uml_probe(struct platform_device *pdev) goto error_free; vu_dev->sock = rc; - spin_lock_init(&vu_dev->sock_lock); + raw_spin_lock_init(&vu_dev->sock_lock); rc = vhost_user_init(vu_dev); if (rc) -- GitLab From 96178631c3f53398044ed437010f7632ad764bf8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 10 Jan 2025 13:54:06 +0100 Subject: [PATCH 498/989] um: convert irq_lock to raw spinlock Since this is deep in the architecture, and the code is called nested into other deep management code, this really needs to be a raw spinlock. Convert it. 
Link: https://patch.msgid.link/20250110125550.32479-8-johannes@sipsolutions.net Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/kernel/irq.c | 79 ++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 32 deletions(-) diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 338450741aac5..a4991746f5eac 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -52,7 +52,7 @@ struct irq_entry { bool sigio_workaround; }; -static DEFINE_SPINLOCK(irq_lock); +static DEFINE_RAW_SPINLOCK(irq_lock); static LIST_HEAD(active_fds); static DECLARE_BITMAP(irqs_allocated, UM_LAST_SIGNAL_IRQ); static bool irqs_suspended; @@ -257,7 +257,7 @@ static struct irq_entry *get_irq_entry_by_fd(int fd) return NULL; } -static void free_irq_entry(struct irq_entry *to_free, bool remove) +static void remove_irq_entry(struct irq_entry *to_free, bool remove) { if (!to_free) return; @@ -265,7 +265,6 @@ static void free_irq_entry(struct irq_entry *to_free, bool remove) if (remove) os_del_epoll_fd(to_free->fd); list_del(&to_free->list); - kfree(to_free); } static bool update_irq_entry(struct irq_entry *entry) @@ -286,17 +285,19 @@ static bool update_irq_entry(struct irq_entry *entry) return false; } -static void update_or_free_irq_entry(struct irq_entry *entry) +static struct irq_entry *update_or_remove_irq_entry(struct irq_entry *entry) { - if (!update_irq_entry(entry)) - free_irq_entry(entry, false); + if (update_irq_entry(entry)) + return NULL; + remove_irq_entry(entry, false); + return entry; } static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, void (*timetravel_handler)(int, int, void *, struct time_travel_event *)) { - struct irq_entry *irq_entry; + struct irq_entry *irq_entry, *to_free = NULL; int err, events = os_event_mask(type); unsigned long flags; @@ -304,9 +305,10 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, if (err < 0) goto out; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); irq_entry = get_irq_entry_by_fd(fd); if (irq_entry) { +already: /* cannot register the same FD twice with the same type */ if (WARN_ON(irq_entry->reg[type].events)) { err = -EALREADY; @@ -316,11 +318,22 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, /* temporarily disable to avoid IRQ-side locking */ os_del_epoll_fd(fd); } else { - irq_entry = kzalloc(sizeof(*irq_entry), GFP_ATOMIC); - if (!irq_entry) { - err = -ENOMEM; - goto out_unlock; + struct irq_entry *new; + + /* don't restore interrupts */ + raw_spin_unlock(&irq_lock); + new = kzalloc(sizeof(*irq_entry), GFP_ATOMIC); + if (!new) { + local_irq_restore(flags); + return -ENOMEM; } + raw_spin_lock(&irq_lock); + irq_entry = get_irq_entry_by_fd(fd); + if (irq_entry) { + to_free = new; + goto already; + } + irq_entry = new; irq_entry->fd = fd; list_add_tail(&irq_entry->list, &active_fds); maybe_sigio_broken(fd); @@ -339,12 +352,11 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, #endif WARN_ON(!update_irq_entry(irq_entry)); - spin_unlock_irqrestore(&irq_lock, flags); - - return 0; + err = 0; out_unlock: - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); out: + kfree(to_free); return err; } @@ -358,19 +370,20 @@ void free_irq_by_fd(int fd) struct irq_entry *to_free; unsigned long flags; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); to_free = get_irq_entry_by_fd(fd); - free_irq_entry(to_free, 
true); - spin_unlock_irqrestore(&irq_lock, flags); + remove_irq_entry(to_free, true); + raw_spin_unlock_irqrestore(&irq_lock, flags); + kfree(to_free); } EXPORT_SYMBOL(free_irq_by_fd); static void free_irq_by_irq_and_dev(unsigned int irq, void *dev) { - struct irq_entry *entry; + struct irq_entry *entry, *to_free = NULL; unsigned long flags; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); list_for_each_entry(entry, &active_fds, list) { enum um_irq_type i; @@ -386,12 +399,13 @@ static void free_irq_by_irq_and_dev(unsigned int irq, void *dev) os_del_epoll_fd(entry->fd); reg->events = 0; - update_or_free_irq_entry(entry); + to_free = update_or_remove_irq_entry(entry); goto out; } } out: - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); + kfree(to_free); } void deactivate_fd(int fd, int irqnum) @@ -402,7 +416,7 @@ void deactivate_fd(int fd, int irqnum) os_del_epoll_fd(fd); - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); entry = get_irq_entry_by_fd(fd); if (!entry) goto out; @@ -414,9 +428,10 @@ void deactivate_fd(int fd, int irqnum) entry->reg[i].events = 0; } - update_or_free_irq_entry(entry); + entry = update_or_remove_irq_entry(entry); out: - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); + kfree(entry); ignore_sigio_fd(fd); } @@ -546,7 +561,7 @@ void um_irqs_suspend(void) irqs_suspended = true; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); list_for_each_entry(entry, &active_fds, list) { enum um_irq_type t; bool clear = true; @@ -579,7 +594,7 @@ void um_irqs_suspend(void) !__ignore_sigio_fd(entry->fd); } } - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); } void um_irqs_resume(void) @@ -588,7 +603,7 @@ void um_irqs_resume(void) unsigned long flags; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); list_for_each_entry(entry, &active_fds, list) { if (entry->suspended) { int err = os_set_fd_async(entry->fd); @@ -602,7 +617,7 @@ void um_irqs_resume(void) } } } - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); irqs_suspended = false; send_sigio_to_self(); @@ -613,7 +628,7 @@ static int normal_irq_set_wake(struct irq_data *d, unsigned int on) struct irq_entry *entry; unsigned long flags; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); list_for_each_entry(entry, &active_fds, list) { enum um_irq_type t; @@ -628,7 +643,7 @@ static int normal_irq_set_wake(struct irq_data *d, unsigned int on) } } unlock: - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); return 0; } #else -- GitLab From 2b4fc4cd43f28e9e39179c8702e6ee821258584d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 12 Feb 2025 15:52:54 -0700 Subject: [PATCH 499/989] io_uring/waitid: setup async data in the prep handler This is the idiomatic way that opcodes should setup their async data, so that it's always valid inside ->issue() without issue needing to do that. 
Fixes: f31ecf671ddc4 ("io_uring: add IORING_OP_WAITID support") Signed-off-by: Jens Axboe --- io_uring/waitid.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/io_uring/waitid.c b/io_uring/waitid.c index c4096d93a2870..15a7daf3ff4f3 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -285,10 +285,16 @@ static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + struct io_waitid_async *iwa; if (sqe->addr || sqe->buf_index || sqe->addr3 || sqe->waitid_flags) return -EINVAL; + iwa = io_uring_alloc_async_data(NULL, req); + if (!unlikely(iwa)) + return -ENOMEM; + iwa->req = req; + iw->which = READ_ONCE(sqe->len); iw->upid = READ_ONCE(sqe->fd); iw->options = READ_ONCE(sqe->file_index); @@ -299,16 +305,10 @@ int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) int io_waitid(struct io_kiocb *req, unsigned int issue_flags) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + struct io_waitid_async *iwa = req->async_data; struct io_ring_ctx *ctx = req->ctx; - struct io_waitid_async *iwa; int ret; - iwa = io_uring_alloc_async_data(NULL, req); - if (!iwa) - return -ENOMEM; - - iwa->req = req; - ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info, iw->options, NULL); if (ret) -- GitLab From 960a62877466067adc89bd37fe36d3b6edddb965 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 10 Feb 2025 18:18:29 -0500 Subject: [PATCH 500/989] drm/amdgpu/pm: fix UVD handing in amdgpu_dpm_set_powergating_by_smu() UVD and VCN were split into separate dpm helpers in commit ff69bba05f08 ("drm/amd/pm: add inst to dpm_set_powergating_by_smu") as such, there is no need to include UVD in the is_vcn variable since UVD and VCN are handled by separate dpm helpers now. Fix the check. Fixes: ff69bba05f08 ("drm/amd/pm: add inst to dpm_set_powergating_by_smu") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3959 Link: https://lists.freedesktop.org/archives/amd-gfx/2025-February/119827.html Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher Cc: Boyuan Zhang --- drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c index 6a9e26905edfc..7a22aef6e59c3 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c @@ -78,7 +78,7 @@ int amdgpu_dpm_set_powergating_by_smu(struct amdgpu_device *adev, int ret = 0; const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; enum ip_power_state pwr_state = gate ? POWER_STATE_OFF : POWER_STATE_ON; - bool is_vcn = (block_type == AMD_IP_BLOCK_TYPE_UVD || block_type == AMD_IP_BLOCK_TYPE_VCN); + bool is_vcn = block_type == AMD_IP_BLOCK_TYPE_VCN; if (atomic_read(&adev->pm.pwr_state[block_type]) == pwr_state && (!is_vcn || adev->vcn.num_vcn_inst == 1)) { -- GitLab From 9cf6b84b71adb97f3c19476ebb5a42228fad89b5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2024 22:12:31 -0400 Subject: [PATCH 501/989] bcachefs: CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS Incorrectly handled transaction restarts can be a source of heisenbugs; add a mode where we randomly inject them to shake them out. 
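As a reader's note (not part of the patch), the injection condition added below, !(ktime_get_ns() & ~(~0ULL << min(63, 10 + restart_count_this_trans))), is true only when the low (10 + n) bits of the nanosecond clock are all zero, i.e. with probability of roughly 1 / 2^(10 + n), so a transaction that has already had n restarts injected becomes exponentially less likely to be hit again. A standalone sketch of the same check (function and parameter names invented here):

  #include <stdint.h>

  /* inject only when the low k bits of the timestamp are all zero,
   * i.e. with probability of about 1/2^k, where k grows with the
   * number of restarts already injected into this transaction */
  static int should_inject(uint64_t ns, unsigned restarts_so_far)
  {
      unsigned k = 10 + restarts_so_far;

      if (k > 63)
          k = 63;
      return !(ns & ~(~0ULL << k));
  }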
Signed-off-by: Kent Overstreet --- fs/bcachefs/Kconfig | 7 +++++++ fs/bcachefs/btree_iter.c | 33 +++++++++++++++++++++++++++++++- fs/bcachefs/btree_iter.h | 12 ++++++++++++ fs/bcachefs/btree_trans_commit.c | 4 ++++ fs/bcachefs/btree_types.h | 3 +++ 5 files changed, 58 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 85eea7a4dea30..fc7efd0a7525e 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -61,6 +61,13 @@ config BCACHEFS_DEBUG The resulting code will be significantly slower than normal; you probably shouldn't select this option unless you're a developer. +config BCACHEFS_INJECT_TRANSACTION_RESTARTS + bool "Randomly inject transaction restarts" + depends on BCACHEFS_DEBUG + help + Randomly inject transaction restarts in a few core paths - may have a + significant performance penalty + config BCACHEFS_TESTS bool "bcachefs unit and performance tests" depends on BCACHEFS_FS diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 5988219c69084..e32fce4fd2583 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2357,6 +2357,12 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en bch2_btree_iter_verify_entry_exit(iter); EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX)); + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } + if (iter->update_path) { bch2_path_put_nokeep(trans, iter->update_path, iter->flags & BTREE_ITER_intent); @@ -2622,6 +2628,12 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp bch2_btree_iter_verify_entry_exit(iter); EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN)); + int ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } + while (1) { k = __bch2_btree_iter_peek_prev(iter, search_key); if (unlikely(!k.k)) @@ -2749,6 +2761,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) bch2_btree_iter_verify_entry_exit(iter); EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache)); + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } + /* extents can't span inode numbers: */ if ((iter->flags & BTREE_ITER_is_extents) && unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { @@ -3106,6 +3124,10 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (ret) + return ERR_PTR(ret); + struct btree_transaction_stats *s = btree_trans_stats(trans); s->max_mem = max(s->max_mem, new_bytes); @@ -3163,7 +3185,8 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) if (old_bytes) { trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); - return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); + return ERR_PTR(btree_trans_restart_ip(trans, + BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); } out_change_top: p = trans->mem + trans->mem_top; @@ -3271,6 +3294,14 @@ u32 bch2_trans_begin(struct btree_trans *trans) trans->last_begin_ip = _RET_IP_; +#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS + if (trans->restarted) { + trans->restart_count_this_trans++; + } else { + trans->restart_count_this_trans = 0; + } +#endif + trans_set_locked(trans, 
false); if (trans->restarted) { diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index b9538e6e6d65e..8c16d9a3ec1d8 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -355,6 +355,18 @@ static int btree_trans_restart(struct btree_trans *trans, int err) return btree_trans_restart_ip(trans, err, _THIS_IP_); } +static inline int trans_maybe_inject_restart(struct btree_trans *trans, unsigned long ip) +{ +#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS + if (!(ktime_get_ns() & ~(~0ULL << min(63, (10 + trans->restart_count_this_trans))))) { + trace_and_count(trans->c, trans_restart_injected, trans, ip); + return btree_trans_restart_ip(trans, + BCH_ERR_transaction_restart_fault_inject, ip); + } +#endif + return 0; +} + bool bch2_btree_node_upgrade(struct btree_trans *, struct btree_path *, unsigned); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 2760dd9569ed9..c4f524b2ca9a0 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -999,6 +999,10 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) bch2_trans_verify_not_unlocked_or_in_restart(trans); + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) + goto out_reset; + if (!trans->nr_updates && !trans->journal_entries_u64s) goto out_reset; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index a6f251eb41646..a09cbe9cd94f1 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -509,6 +509,9 @@ struct btree_trans { bool notrace_relock_fail:1; enum bch_errcode restarted:16; u32 restart_count; +#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS + u32 restart_count_this_trans; +#endif u64 last_begin_time; unsigned long last_begin_ip; -- GitLab From 531323a2efc3fbe20b540e3f41ecc94d68e74b76 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Thu, 13 Feb 2025 02:11:01 +0800 Subject: [PATCH 502/989] bcachefs: Pass _orig_restart_count to trans_was_restarted _orig_restart_count is unused now, according to the logic, trans_was_restarted should be using _orig_restart_count. Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 8c16d9a3ec1d8..b96157f3dc9c7 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -751,7 +751,7 @@ transaction_restart: \ if (!_ret2) \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ \ - _ret2 ?: trans_was_restarted(_trans, _restart_count); \ + _ret2 ?: trans_was_restarted(_trans, _orig_restart_count); \ }) #define for_each_btree_key_max_continue(_trans, _iter, \ -- GitLab From 406e445b3c6be65ab0ee961145e74bfd7ef6c9e1 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Wed, 12 Feb 2025 17:27:51 +0800 Subject: [PATCH 503/989] bcachefs: Reuse transaction bch2_nocow_write_convert_unwritten is already in transaction context: 00191 ========= TEST generic/648 00242 kernel BUG at fs/bcachefs/btree_iter.c:3332! 
00242 Internal error: Oops - BUG: 00000000f2000800 [#1] PREEMPT SMP 00242 Modules linked in: 00242 CPU: 4 UID: 0 PID: 2593 Comm: fsstress Not tainted 6.13.0-rc3-ktest-g345af8f855b7 #14403 00242 Hardware name: linux,dummy-virt (DT) 00242 pstate: 60001005 (nZCv daif -PAN -UAO -TCO -DIT +SSBS BTYPE=--) 00242 pc : __bch2_trans_get+0x120/0x410 00242 lr : __bch2_trans_get+0xcc/0x410 00242 sp : ffffff80d89af600 00242 x29: ffffff80d89af600 x28: ffffff80ddb23000 x27: 00000000fffff705 00242 x26: ffffff80ddb23028 x25: ffffff80d8903fe0 x24: ffffff80ebb30168 00242 x23: ffffff80c8aeb500 x22: 000000000000005d x21: ffffff80d8904078 00242 x20: ffffff80d8900000 x19: ffffff80da9e8000 x18: 0000000000000000 00242 x17: 64747568735f6c61 x16: 6e72756f6a20726f x15: 0000000000000028 00242 x14: 0000000000000004 x13: 000000000000f787 x12: ffffffc081bbcdc8 00242 x11: 0000000000000000 x10: 0000000000000003 x9 : ffffffc08094efbc 00242 x8 : 000000001092c111 x7 : 000000000000000c x6 : ffffffc083c31fc4 00242 x5 : ffffffc083c31f28 x4 : ffffff80c8aeb500 x3 : ffffff80ebb30000 00242 x2 : 0000000000000001 x1 : 0000000000000a21 x0 : 000000000000028e 00242 Call trace: 00242 __bch2_trans_get+0x120/0x410 (P) 00242 bch2_inum_offset_err_msg+0x48/0xb0 00242 bch2_nocow_write_convert_unwritten+0x3d0/0x530 00242 bch2_nocow_write+0xeb0/0x1000 00242 __bch2_write+0x330/0x4e8 00242 bch2_write+0x1f0/0x530 00242 bch2_direct_write+0x530/0xc00 00242 bch2_write_iter+0x160/0xbe0 00242 vfs_write+0x1cc/0x360 00242 ksys_write+0x5c/0xf0 00242 __arm64_sys_write+0x20/0x30 00242 invoke_syscall.constprop.0+0x54/0xe8 00242 do_el0_svc+0x44/0xc0 00242 el0_svc+0x34/0xa0 00242 el0t_64_sync_handler+0x104/0x130 00242 el0t_64_sync+0x154/0x158 00242 Code: 6b01001f 54ffff01 79408460 3617fec0 (d4210000) 00242 ---[ end trace 0000000000000000 ]--- 00242 Kernel panic - not syncing: Oops - BUG: Fatal exception Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/io_write.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index dd508d93e9fc8..03892388832b6 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -411,6 +411,16 @@ void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) __bch2_write_op_error(out, op, op->pos.offset); } +static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, + struct bch_write_op *op, u64 offset) +{ + bch2_inum_offset_err_msg_trans(trans, out, + (subvol_inum) { op->subvol, op->pos.inode, }, + offset << 9); + prt_printf(out, "write error%s: ", + op->flags & BCH_WRITE_MOVE ? 
"(internal move)" : ""); +} + void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, enum bch_data_type type, const struct bkey_i *k, @@ -1193,7 +1203,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); struct printbuf buf = PRINTBUF; - __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k)); + bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k)); prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); -- GitLab From b35eb9128ebeec534eed1cefd6b9b1b7282cf5ba Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 28 Jan 2025 11:55:22 -0500 Subject: [PATCH 504/989] drm/amdgpu/gfx9: manually control gfxoff for CS on RV MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When mesa started using compute queues more often we started seeing additional hangs with compute queues. Disabling gfxoff seems to mitigate that. Manually control gfxoff and gfx pg with command submissions to avoid any issues related to gfxoff. KFD already does the same thing for these chips. v2: limit to compute v3: limit to APUs v4: limit to Raven/PCO v5: only update the compute ring_funcs v6: Disable GFX PG v7: adjust order Reviewed-by: Lijo Lazar Suggested-by: Błażej Szczygieł Suggested-by: Sergey Kovalenko Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3861 Link: https://lists.freedesktop.org/archives/amd-gfx/2025-January/119116.html Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.12.x --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 36 +++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index fa572b40989e3..0dce4421418c5 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -7437,6 +7437,38 @@ static void gfx_v9_0_ring_emit_cleaner_shader(struct amdgpu_ring *ring) amdgpu_ring_write(ring, 0); /* RESERVED field, programmed to zero */ } +static void gfx_v9_0_ring_begin_use_compute(struct amdgpu_ring *ring) +{ + struct amdgpu_device *adev = ring->adev; + struct amdgpu_ip_block *gfx_block = + amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); + + amdgpu_gfx_enforce_isolation_ring_begin_use(ring); + + /* Raven and PCO APUs seem to have stability issues + * with compute and gfxoff and gfx pg. Disable gfx pg during + * submission and allow again afterwards. + */ + if (gfx_block && amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 1, 0)) + gfx_v9_0_set_powergating_state(gfx_block, AMD_PG_STATE_UNGATE); +} + +static void gfx_v9_0_ring_end_use_compute(struct amdgpu_ring *ring) +{ + struct amdgpu_device *adev = ring->adev; + struct amdgpu_ip_block *gfx_block = + amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); + + /* Raven and PCO APUs seem to have stability issues + * with compute and gfxoff and gfx pg. Disable gfx pg during + * submission and allow again afterwards. 
+ */ + if (gfx_block && amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 1, 0)) + gfx_v9_0_set_powergating_state(gfx_block, AMD_PG_STATE_GATE); + + amdgpu_gfx_enforce_isolation_ring_end_use(ring); +} + static const struct amd_ip_funcs gfx_v9_0_ip_funcs = { .name = "gfx_v9_0", .early_init = gfx_v9_0_early_init, @@ -7613,8 +7645,8 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = { .emit_wave_limit = gfx_v9_0_emit_wave_limit, .reset = gfx_v9_0_reset_kcq, .emit_cleaner_shader = gfx_v9_0_ring_emit_cleaner_shader, - .begin_use = amdgpu_gfx_enforce_isolation_ring_begin_use, - .end_use = amdgpu_gfx_enforce_isolation_ring_end_use, + .begin_use = gfx_v9_0_ring_begin_use_compute, + .end_use = gfx_v9_0_ring_end_use_compute, }; static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = { -- GitLab From 55ed2b1b50d029dd7e49a35f6628ca64db6d75d8 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 31 Jan 2025 13:53:40 -0500 Subject: [PATCH 505/989] drm/amdgpu: bump version for RV/PCO compute fix Bump the driver version for RV/PCO compute stability fix so mesa can use this check to enable compute queues on RV/PCO. Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.12.x --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index dce9323fb410c..95a05b03f799d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -120,9 +120,10 @@ * - 3.58.0 - Add GFX12 DCC support * - 3.59.0 - Cleared VRAM * - 3.60.0 - Add AMDGPU_TILING_GFX12_DCC_WRITE_COMPRESS_DISABLE (Vulkan requirement) + * - 3.61.0 - Contains fix for RV/PCO compute queues */ #define KMS_DRIVER_MAJOR 3 -#define KMS_DRIVER_MINOR 60 +#define KMS_DRIVER_MINOR 61 #define KMS_DRIVER_PATCHLEVEL 0 /* -- GitLab From a33f7f9660705fb2ecf3467b2c48965564f392ce Mon Sep 17 00:00:00 2001 From: Zhu Lingshan Date: Sun, 26 Jan 2025 17:21:10 +0800 Subject: [PATCH 506/989] amdkfd: properly free gang_ctx_bo when failed to init user queue The destructor of a gtt bo is declared as void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void **mem_obj); Which takes void** as the second parameter. GCC allows passing void* to the function because void* can be implicitly casted to any other types, so it can pass compiling. However, passing this void* parameter into the function's execution process(which expects void** and dereferencing void**) will result in errors. 
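For illustration, a minimal userspace sketch of the same void * / void ** pitfall; the names below are made up for the sketch and are not the amdgpu helpers:

#include <stdio.h>
#include <stdlib.h>

/* Illustrative only, not the amdgpu API: the callee wants the address of
 * the caller's pointer so it can free the object and clear that pointer. */
static void free_and_clear(void **mem_obj)
{
        free(*mem_obj);
        *mem_obj = NULL;
}

int main(void)
{
        void *obj = malloc(16);

        /*
         * "free_and_clear(obj);" would compile silently, because void *
         * converts implicitly to void **, but the callee would then treat
         * the heap pointer itself as the address of a pointer and
         * dereference garbage -- the same class of bug fixed above.
         */
        free_and_clear(&obj);   /* correct: pass the address of the pointer */
        printf("obj after free: %p\n", obj);
        return 0;
}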
Signed-off-by: Zhu Lingshan Reviewed-by: Felix Kuehling Fixes: fb91065851cd ("drm/amdkfd: Refactor queue wptr_bo GART mapping") Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index bcddd989c7f39..bd36a75309e12 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -300,7 +300,7 @@ static int init_user_queue(struct process_queue_manager *pqm, return 0; free_gang_ctx_bo: - amdgpu_amdkfd_free_gtt_mem(dev->adev, (*q)->gang_ctx_bo); + amdgpu_amdkfd_free_gtt_mem(dev->adev, &(*q)->gang_ctx_bo); cleanup: uninit_queue(*q); *q = NULL; -- GitLab From a0a455b4bc7483ad60e8b8a50330c1e05bb7bfcf Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 7 Feb 2025 14:28:51 +0800 Subject: [PATCH 507/989] drm/amdgpu: bail out when failed to load fw in psp_init_cap_microcode() In function psp_init_cap_microcode(), it should bail out when failed to load firmware, otherwise it may cause invalid memory access. Fixes: 07dbfc6b102e ("drm/amd: Use `amdgpu_ucode_*` helpers for PSP") Reviewed-by: Lijo Lazar Signed-off-by: Jiang Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index babe94ade2478..e5fc80ed06eae 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -3815,9 +3815,10 @@ int psp_init_cap_microcode(struct psp_context *psp, const char *chip_name) if (err == -ENODEV) { dev_warn(adev->dev, "cap microcode does not exist, skip\n"); err = 0; - goto out; + } else { + dev_err(adev->dev, "fail to initialize cap microcode\n"); } - dev_err(adev->dev, "fail to initialize cap microcode\n"); + goto out; } info = &adev->firmware.ucode[AMDGPU_UCODE_ID_CAP]; -- GitLab From d584198a6fe4c51f4aa88ad72f258f8961a0f11c Mon Sep 17 00:00:00 2001 From: Lancelot SIX Date: Tue, 28 Jan 2025 19:16:49 +0000 Subject: [PATCH 508/989] drm/amdkfd: Ensure consistent barrier state saved in gfx12 trap handler It is possible for some waves in a workgroup to finish their save sequence before the group leader has had time to capture the workgroup barrier state. When this happens, having those waves exit do impact the barrier state. As a consequence, the state captured by the group leader is invalid, and is eventually incorrectly restored. This patch proposes to have all waves in a workgroup wait for each other at the end of their save sequence (just before calling s_endpgm_saved). 
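As a loose host-side analogy only (pthreads, not the GPU trap handler), the rendezvous-before-exit idea looks like the sketch below; every name in it is invented for illustration:

#include <pthread.h>
#include <stdio.h>

#define NR_WAVES 4

static pthread_barrier_t save_done;
static int workgroup_state;

static void *wave_save_sketch(void *arg)
{
        long id = (long)arg;

        if (id == 0)
                workgroup_state = NR_WAVES;     /* "leader" captures shared state */

        /*
         * Rendezvous before returning, mirroring the added
         * s_barrier_signal/s_barrier_wait: no wave "exits" before every
         * wave, including the leader's capture above, has reached this
         * point, so the captured state cannot be perturbed by early exits.
         */
        pthread_barrier_wait(&save_done);
        return NULL;
}

int main(void)
{
        pthread_t waves[NR_WAVES];
        long i;

        pthread_barrier_init(&save_done, NULL, NR_WAVES);
        for (i = 0; i < NR_WAVES; i++)
                pthread_create(&waves[i], NULL, wave_save_sketch, (void *)i);
        for (i = 0; i < NR_WAVES; i++)
                pthread_join(waves[i], NULL);
        printf("captured state: %d\n", workgroup_state);
        pthread_barrier_destroy(&save_done);
        return 0;
}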
Signed-off-by: Lancelot SIX Reviewed-by: Jay Cornwall Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.12.x --- drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h | 3 ++- drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h index 984f0e7050788..651660958e5b1 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h @@ -4121,7 +4121,8 @@ static const uint32_t cwsr_trap_gfx12_hex[] = { 0x0000ffff, 0x8bfe7e7e, 0x8bea6a6a, 0xb97af804, 0xbe804ec2, 0xbf94fffe, - 0xbe804a6c, 0xbfb10000, + 0xbe804a6c, 0xbe804ec2, + 0xbf94fffe, 0xbfb10000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, 0x00000000, diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm index 1740e98c6719d..7b9d36e5fa437 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx12.asm @@ -1049,6 +1049,10 @@ L_SKIP_BARRIER_RESTORE: s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution L_END_PGM: + // Make sure that no wave of the workgroup can exit the trap handler + // before the workgroup barrier state is saved. + s_barrier_signal -2 + s_barrier_wait -2 s_endpgm_saved end -- GitLab From 1abb2648698bf10783d2236a6b4a7ca5e8021699 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 7 Feb 2025 14:44:14 +0800 Subject: [PATCH 509/989] drm/amdgpu: avoid buffer overflow attach in smu_sys_set_pp_table() It malicious user provides a small pptable through sysfs and then a bigger pptable, it may cause buffer overflow attack in function smu_sys_set_pp_table(). Reviewed-by: Lijo Lazar Signed-off-by: Jiang Liu Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index 8ca793c222ff2..ed9dac00ebfb1 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -612,7 +612,8 @@ static int smu_sys_set_pp_table(void *handle, return -EIO; } - if (!smu_table->hardcode_pptable) { + if (!smu_table->hardcode_pptable || smu_table->power_play_table_size < size) { + kfree(smu_table->hardcode_pptable); smu_table->hardcode_pptable = kzalloc(size, GFP_KERNEL); if (!smu_table->hardcode_pptable) return -ENOMEM; -- GitLab From 15d6f74f03f84c5b8d032bb1be6b90af82e5b679 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 10 Feb 2025 10:24:55 -0300 Subject: [PATCH 510/989] MAINTAINERS: Add sctp headers to the general netdev entry All SCTP patches are picked up by netdev maintainers. Two headers were missing to be listed there. 
Reported-by: Thorsten Blum Signed-off-by: Marcelo Ricardo Leitner Reviewed-by: Simon Horman Link: https://patch.msgid.link/b3c2dc3a102eb89bd155abca2503ebd015f50ee0.1739193671.git.marcelo.leitner@gmail.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 5f9535f4f1964..b182021f4906e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16508,6 +16508,7 @@ F: include/linux/netdev* F: include/linux/netlink.h F: include/linux/netpoll.h F: include/linux/rtnetlink.h +F: include/linux/sctp.h F: include/linux/seq_file_net.h F: include/linux/skbuff* F: include/net/ @@ -16524,6 +16525,7 @@ F: include/uapi/linux/netdev* F: include/uapi/linux/netlink.h F: include/uapi/linux/netlink_diag.h F: include/uapi/linux/rtnetlink.h +F: include/uapi/linux/sctp.h F: lib/net_utils.c F: lib/random32.c F: net/ -- GitLab From 78dafe1cf3afa02ed71084b350713b07e72a18fb Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Mon, 10 Feb 2025 13:15:00 +0100 Subject: [PATCH 511/989] vsock: Orphan socket after transport release During socket release, sock_orphan() is called without considering that it sets sk->sk_wq to NULL. Later, if SO_LINGER is enabled, this leads to a null pointer dereferenced in virtio_transport_wait_close(). Orphan the socket only after transport release. Partially reverts the 'Fixes:' commit. KASAN: null-ptr-deref in range [0x0000000000000018-0x000000000000001f] lock_acquire+0x19e/0x500 _raw_spin_lock_irqsave+0x47/0x70 add_wait_queue+0x46/0x230 virtio_transport_release+0x4e7/0x7f0 __vsock_release+0xfd/0x490 vsock_release+0x90/0x120 __sock_release+0xa3/0x250 sock_close+0x14/0x20 __fput+0x35e/0xa90 __x64_sys_close+0x78/0xd0 do_syscall_64+0x93/0x1b0 entry_SYSCALL_64_after_hwframe+0x76/0x7e Reported-by: syzbot+9d55b199192a4be7d02c@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=9d55b199192a4be7d02c Fixes: fcdd2242c023 ("vsock: Keep the binding until socket destruction") Tested-by: Luigi Leonardi Reviewed-by: Luigi Leonardi Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20250210-vsock-linger-nullderef-v3-1-ef6244d02b54@rbox.co Signed-off-by: Jakub Kicinski --- net/vmw_vsock/af_vsock.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 075695173648d..53a081d49d28a 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -824,13 +824,19 @@ static void __vsock_release(struct sock *sk, int level) */ lock_sock_nested(sk, level); - sock_orphan(sk); + /* Indicate to vsock_remove_sock() that the socket is being released and + * can be removed from the bound_table. Unlike transport reassignment + * case, where the socket must remain bound despite vsock_remove_sock() + * being called from the transport release() callback. + */ + sock_set_flag(sk, SOCK_DEAD); if (vsk->transport) vsk->transport->release(vsk); else if (sock_type_connectible(sk->sk_type)) vsock_remove_sock(vsk); + sock_orphan(sk); sk->sk_shutdown = SHUTDOWN_MASK; skb_queue_purge(&sk->sk_receive_queue); -- GitLab From 440c9d488705366b00372ea7213af69827a6c7af Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Mon, 10 Feb 2025 13:15:01 +0100 Subject: [PATCH 512/989] vsock/test: Add test for SO_LINGER null ptr deref Explicitly close() a TCP_ESTABLISHED (connectible) socket with SO_LINGER enabled. As for now, test does not verify if close() actually lingers. On an unpatched machine, may trigger a null pointer dereference. 
Tested-by: Luigi Leonardi Reviewed-by: Luigi Leonardi Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20250210-vsock-linger-nullderef-v3-2-ef6244d02b54@rbox.co Signed-off-by: Jakub Kicinski --- tools/testing/vsock/vsock_test.c | 41 ++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index dfff8b288265f..d0f6d253ac72d 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -1788,6 +1788,42 @@ static void test_stream_connect_retry_server(const struct test_opts *opts) close(fd); } +static void test_stream_linger_client(const struct test_opts *opts) +{ + struct linger optval = { + .l_onoff = 1, + .l_linger = 1 + }; + int fd; + + fd = vsock_stream_connect(opts->peer_cid, opts->peer_port); + if (fd < 0) { + perror("connect"); + exit(EXIT_FAILURE); + } + + if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &optval, sizeof(optval))) { + perror("setsockopt(SO_LINGER)"); + exit(EXIT_FAILURE); + } + + close(fd); +} + +static void test_stream_linger_server(const struct test_opts *opts) +{ + int fd; + + fd = vsock_stream_accept(VMADDR_CID_ANY, opts->peer_port, NULL); + if (fd < 0) { + perror("accept"); + exit(EXIT_FAILURE); + } + + vsock_wait_remote_close(fd); + close(fd); +} + static struct test_case test_cases[] = { { .name = "SOCK_STREAM connection reset", @@ -1943,6 +1979,11 @@ static struct test_case test_cases[] = { .run_client = test_stream_connect_retry_client, .run_server = test_stream_connect_retry_server, }, + { + .name = "SOCK_STREAM SO_LINGER null-ptr-deref", + .run_client = test_stream_linger_client, + .run_server = test_stream_linger_server, + }, {}, }; -- GitLab From edb1942542bc538707cea221e9c7923a6270465f Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Thu, 13 Feb 2025 12:02:35 +0800 Subject: [PATCH 513/989] LoongArch: Fix idle VS timer enqueue LoongArch re-enables interrupts on its idle routine and performs a TIF_NEED_RESCHED check afterwards before putting the CPU to sleep. The IRQs firing between the check and the idle instruction may set the TIF_NEED_RESCHED flag. In order to deal with such a race, IRQs interrupting __arch_cpu_idle() rollback their return address to the beginning of __arch_cpu_idle() so that TIF_NEED_RESCHED is checked again before going back to sleep. However idle IRQs can also queue timers that may require a tick reprogramming through a new generic idle loop iteration but those timers would go unnoticed here because __arch_cpu_idle() only checks TIF_NEED_RESCHED. It doesn't check for pending timers. Fix this with fast-forwarding idle IRQs return address to the end of the idle routine instead of the beginning, so that the generic idle loop can handle both TIF_NEED_RESCHED and pending timers. 
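As a rough, generic C sketch of the window being closed (all helpers below are no-op placeholder stubs so the fragment stands alone; they are not the LoongArch implementation):

static int need_resched_flag;

static void irq_enable_stub(void) { }
static void irq_disable_stub(void) { }
static void wait_for_interrupt_stub(void) { }   /* stands in for "idle 0" */

static void cpu_idle_sketch(void)
{
        irq_enable_stub();
        /*
         * An interrupt landing here can set need_resched_flag *or* queue a
         * timer that needs the tick reprogrammed.  Rolling the return
         * address back to a need_resched re-check (the old scheme) handles
         * only the first case; fast-forwarding past the idle instruction
         * returns to the generic idle loop, which re-evaluates both.
         */
        if (!need_resched_flag)
                wait_for_interrupt_stub();
        irq_disable_stub();
}

int main(void)
{
        cpu_idle_sketch();
        return 0;
}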
Fixes: 0603839b18f4 ("LoongArch: Add exception/interrupt handling") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Frederic Weisbecker Signed-off-by: Marco Crivellari Signed-off-by: Huacai Chen --- arch/loongarch/kernel/genex.S | 28 +++++++++++++++------------- arch/loongarch/kernel/idle.c | 3 +-- arch/loongarch/kernel/reset.c | 6 +++--- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/arch/loongarch/kernel/genex.S b/arch/loongarch/kernel/genex.S index 86d5d90ebefe5..4f09121417818 100644 --- a/arch/loongarch/kernel/genex.S +++ b/arch/loongarch/kernel/genex.S @@ -18,16 +18,19 @@ .align 5 SYM_FUNC_START(__arch_cpu_idle) - /* start of rollback region */ - LONG_L t0, tp, TI_FLAGS - nop - andi t0, t0, _TIF_NEED_RESCHED - bnez t0, 1f - nop - nop - nop + /* start of idle interrupt region */ + ori t0, zero, CSR_CRMD_IE + /* idle instruction needs irq enabled */ + csrxchg t0, t0, LOONGARCH_CSR_CRMD + /* + * If an interrupt lands here; between enabling interrupts above and + * going idle on the next instruction, we must *NOT* go idle since the + * interrupt could have set TIF_NEED_RESCHED or caused an timer to need + * reprogramming. Fall through -- see handle_vint() below -- and have + * the idle loop take care of things. + */ idle 0 - /* end of rollback region */ + /* end of idle interrupt region */ 1: jr ra SYM_FUNC_END(__arch_cpu_idle) @@ -35,11 +38,10 @@ SYM_CODE_START(handle_vint) UNWIND_HINT_UNDEFINED BACKUP_T0T1 SAVE_ALL - la_abs t1, __arch_cpu_idle + la_abs t1, 1b LONG_L t0, sp, PT_ERA - /* 32 byte rollback region */ - ori t0, t0, 0x1f - xori t0, t0, 0x1f + /* 3 instructions idle interrupt region */ + ori t0, t0, 0b1100 bne t0, t1, 1f LONG_S t0, sp, PT_ERA 1: move a0, sp diff --git a/arch/loongarch/kernel/idle.c b/arch/loongarch/kernel/idle.c index 0b5dd2faeb90b..54b247d8cdb69 100644 --- a/arch/loongarch/kernel/idle.c +++ b/arch/loongarch/kernel/idle.c @@ -11,7 +11,6 @@ void __cpuidle arch_cpu_idle(void) { - raw_local_irq_enable(); - __arch_cpu_idle(); /* idle instruction needs irq enabled */ + __arch_cpu_idle(); raw_local_irq_disable(); } diff --git a/arch/loongarch/kernel/reset.c b/arch/loongarch/kernel/reset.c index 1ef8c63835351..de8fa5a8a825c 100644 --- a/arch/loongarch/kernel/reset.c +++ b/arch/loongarch/kernel/reset.c @@ -33,7 +33,7 @@ void machine_halt(void) console_flush_on_panic(CONSOLE_FLUSH_PENDING); while (true) { - __arch_cpu_idle(); + __asm__ __volatile__("idle 0" : : : "memory"); } } @@ -53,7 +53,7 @@ void machine_power_off(void) #endif while (true) { - __arch_cpu_idle(); + __asm__ __volatile__("idle 0" : : : "memory"); } } @@ -74,6 +74,6 @@ void machine_restart(char *command) acpi_reboot(); while (true) { - __arch_cpu_idle(); + __asm__ __volatile__("idle 0" : : : "memory"); } } -- GitLab From 619b52777a4972bdb6ddf86ac54c6f68a47b51c4 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Thu, 13 Feb 2025 12:02:35 +0800 Subject: [PATCH 514/989] LoongArch: Fix kernel_page_present() for KPRANGE/XKPRANGE Now kernel_page_present() always return true for KPRANGE/XKPRANGE addresses, this isn't correct because hibernation (ACPI S4) use it to distinguish whether a page is saveable. If all KPRANGE/XKPRANGE addresses are considered as saveable, then reserved memory such as EFI_RUNTIME_SERVICES_CODE / EFI_RUNTIME_SERVICES_DATA will also be saved and restored. Fix this by returning true only if the KPRANGE/XKPRANGE address is in memblock.memory. 
Signed-off-by: Huacai Chen --- arch/loongarch/mm/pageattr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/loongarch/mm/pageattr.c b/arch/loongarch/mm/pageattr.c index bf86782484440..99165903908a4 100644 --- a/arch/loongarch/mm/pageattr.c +++ b/arch/loongarch/mm/pageattr.c @@ -3,6 +3,7 @@ * Copyright (C) 2024 Loongson Technology Corporation Limited */ +#include #include #include #include @@ -167,7 +168,7 @@ bool kernel_page_present(struct page *page) unsigned long addr = (unsigned long)page_address(page); if (addr < vm_map_base) - return true; + return memblock_is_memory(__pa(addr)); pgd = pgd_offset_k(addr); if (pgd_none(pgdp_get(pgd))) -- GitLab From 03a99d16e64fad41c8d39700bef9b0ac9c4e148b Mon Sep 17 00:00:00 2001 From: Yuli Wang Date: Thu, 13 Feb 2025 12:02:35 +0800 Subject: [PATCH 515/989] LoongArch: Use str_yes_no() helper function for /proc/cpuinfo Remove hard-coded strings by using the str_yes_no() helper function. Similar to commit c4a0a4a45a45 ("MIPS: kernel: proc: Use str_yes_no() helper function"). Co-developed-by: Wentao Guan Signed-off-by: Wentao Guan Signed-off-by: Yuli Wang Signed-off-by: Huacai Chen --- arch/loongarch/kernel/proc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/kernel/proc.c b/arch/loongarch/kernel/proc.c index 6ce46d92f1f19..56a230238c9cf 100644 --- a/arch/loongarch/kernel/proc.c +++ b/arch/loongarch/kernel/proc.c @@ -91,10 +91,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) if (cpu_has_lbt_mips) seq_printf(m, " lbt_mips"); seq_printf(m, "\n"); - seq_printf(m, "Hardware Watchpoint\t: %s", - cpu_has_watch ? "yes, " : "no\n"); + seq_printf(m, "Hardware Watchpoint\t: %s", str_yes_no(cpu_has_watch)); if (cpu_has_watch) { - seq_printf(m, "iwatch count: %d, dwatch count: %d\n", + seq_printf(m, ", iwatch count: %d, dwatch count: %d", cpu_data[n].watch_ireg_count, cpu_data[n].watch_dreg_count); } @@ -104,7 +103,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) raw_notifier_call_chain(&proc_cpuinfo_chain, 0, &proc_cpuinfo_notifier_args); - seq_printf(m, "\n"); + seq_printf(m, "\n\n"); return 0; } -- GitLab From 6b72cd9ef062702390fc96c469beea1729a5dffe Mon Sep 17 00:00:00 2001 From: Yuli Wang Date: Thu, 13 Feb 2025 12:02:40 +0800 Subject: [PATCH 516/989] LoongArch: Remove the deprecated notifier hook mechanism The notifier hook mechanism in proc and cpuinfo is actually unnecessary for LoongArch because it's not used anywhere. It was originally added to the MIPS code in commit d6d3c9afaab4 ("MIPS: MT: proc: Add support for printing VPE and TC ids"), and LoongArch then inherited it. But as the kernel code stands now, this notifier hook mechanism doesn't really make sense for either LoongArch or MIPS. In addition, the seq_file forward declaration needs to be moved to its proper place, as only the show_ipi_list() function in smp.c requires it. 
Co-developed-by: Wentao Guan Signed-off-by: Wentao Guan Signed-off-by: Yuli Wang Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/cpu-info.h | 21 --------------------- arch/loongarch/include/asm/smp.h | 2 ++ arch/loongarch/kernel/proc.c | 22 ---------------------- 3 files changed, 2 insertions(+), 43 deletions(-) diff --git a/arch/loongarch/include/asm/cpu-info.h b/arch/loongarch/include/asm/cpu-info.h index 35e0a230a4849..7f5bc0ad9d505 100644 --- a/arch/loongarch/include/asm/cpu-info.h +++ b/arch/loongarch/include/asm/cpu-info.h @@ -76,27 +76,6 @@ extern const char *__cpu_full_name[]; #define cpu_family_string() __cpu_family[raw_smp_processor_id()] #define cpu_full_name_string() __cpu_full_name[raw_smp_processor_id()] -struct seq_file; -struct notifier_block; - -extern int register_proc_cpuinfo_notifier(struct notifier_block *nb); -extern int proc_cpuinfo_notifier_call_chain(unsigned long val, void *v); - -#define proc_cpuinfo_notifier(fn, pri) \ -({ \ - static struct notifier_block fn##_nb = { \ - .notifier_call = fn, \ - .priority = pri \ - }; \ - \ - register_proc_cpuinfo_notifier(&fn##_nb); \ -}) - -struct proc_cpuinfo_notifier_args { - struct seq_file *m; - unsigned long n; -}; - static inline bool cpus_are_siblings(int cpua, int cpub) { struct cpuinfo_loongarch *infoa = &cpu_data[cpua]; diff --git a/arch/loongarch/include/asm/smp.h b/arch/loongarch/include/asm/smp.h index 3383c9d24e942..b87d1d5e58905 100644 --- a/arch/loongarch/include/asm/smp.h +++ b/arch/loongarch/include/asm/smp.h @@ -77,6 +77,8 @@ extern int __cpu_logical_map[NR_CPUS]; #define SMP_IRQ_WORK BIT(ACTION_IRQ_WORK) #define SMP_CLEAR_VECTOR BIT(ACTION_CLEAR_VECTOR) +struct seq_file; + struct secondary_data { unsigned long stack; unsigned long thread_info; diff --git a/arch/loongarch/kernel/proc.c b/arch/loongarch/kernel/proc.c index 56a230238c9cf..cea30768ae92a 100644 --- a/arch/loongarch/kernel/proc.c +++ b/arch/loongarch/kernel/proc.c @@ -13,28 +13,12 @@ #include #include -/* - * No lock; only written during early bootup by CPU 0. - */ -static RAW_NOTIFIER_HEAD(proc_cpuinfo_chain); - -int __ref register_proc_cpuinfo_notifier(struct notifier_block *nb) -{ - return raw_notifier_chain_register(&proc_cpuinfo_chain, nb); -} - -int proc_cpuinfo_notifier_call_chain(unsigned long val, void *v) -{ - return raw_notifier_call_chain(&proc_cpuinfo_chain, val, v); -} - static int show_cpuinfo(struct seq_file *m, void *v) { unsigned long n = (unsigned long) v - 1; unsigned int isa = cpu_data[n].isa_level; unsigned int version = cpu_data[n].processor_id & 0xff; unsigned int fp_version = cpu_data[n].fpu_vers; - struct proc_cpuinfo_notifier_args proc_cpuinfo_notifier_args; #ifdef CONFIG_SMP if (!cpu_online(n)) @@ -97,12 +81,6 @@ static int show_cpuinfo(struct seq_file *m, void *v) cpu_data[n].watch_ireg_count, cpu_data[n].watch_dreg_count); } - proc_cpuinfo_notifier_args.m = m; - proc_cpuinfo_notifier_args.n = n; - - raw_notifier_call_chain(&proc_cpuinfo_chain, 0, - &proc_cpuinfo_notifier_args); - seq_printf(m, "\n\n"); return 0; -- GitLab From 6287f1a8c16138c2ec750953e35039634018c84a Mon Sep 17 00:00:00 2001 From: Yuli Wang Date: Thu, 13 Feb 2025 12:02:40 +0800 Subject: [PATCH 517/989] LoongArch: csum: Fix OoB access in IP checksum code for negative lengths Commit 69e3a6aa6be2 ("LoongArch: Add checksum optimization for 64-bit system") would cause an undefined shift and an out-of-bounds read. Commit 8bd795fedb84 ("arm64: csum: Fix OoB access in IP checksum code for negative lengths") fixes the same issue on ARM64. 
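The hazard is easy to reproduce in a standalone sketch (illustrative only, not the kernel's do_csum()); the point is that an equality check against zero is not enough once the signed length feeds unsigned arithmetic:

#include <stdio.h>

static unsigned int checksum_sketch(const unsigned char *buff, int len)
{
        unsigned int sum = 0;
        size_t i, n;

        if (len <= 0)   /* the fix: reject negative lengths, not just zero */
                return 0;

        n = (size_t)len;        /* a negative len would become a huge count here */
        for (i = 0; i < n; i++)
                sum += buff[i];
        return sum;
}

int main(void)
{
        unsigned char data[4] = { 1, 2, 3, 4 };

        printf("%u\n", checksum_sketch(data, 4));       /* 10 */
        printf("%u\n", checksum_sketch(data, -1));      /* 0: rejected, no huge read */
        return 0;
}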
Fixes: 69e3a6aa6be2 ("LoongArch: Add checksum optimization for 64-bit system") Co-developed-by: Wentao Guan Signed-off-by: Wentao Guan Signed-off-by: Yuli Wang Signed-off-by: Huacai Chen --- arch/loongarch/lib/csum.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/lib/csum.c b/arch/loongarch/lib/csum.c index a5e84b403c3b3..df309ae4045de 100644 --- a/arch/loongarch/lib/csum.c +++ b/arch/loongarch/lib/csum.c @@ -25,7 +25,7 @@ unsigned int __no_sanitize_address do_csum(const unsigned char *buff, int len) const u64 *ptr; u64 data, sum64 = 0; - if (unlikely(len == 0)) + if (unlikely(len <= 0)) return 0; offset = (unsigned long)buff & 7; -- GitLab From bdb13252e5d1518823b81f458d9975c85d5240c2 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Thu, 13 Feb 2025 12:02:56 +0800 Subject: [PATCH 518/989] LoongArch: KVM: Fix typo issue about GCFG feature detection This is typo issue and misusage about GCFG feature macro. The code is wrong, only that it does not cause obvious problem since GCFG is set again on vCPU context switch. Fixes: 0d0df3c99d4f ("LoongArch: KVM: Implement kvm hardware enable, disable interface") Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c index bf9268bf26d5b..f6d3242b9234a 100644 --- a/arch/loongarch/kvm/main.c +++ b/arch/loongarch/kvm/main.c @@ -303,9 +303,9 @@ int kvm_arch_enable_virtualization_cpu(void) * TOE=0: Trap on Exception. * TIT=0: Trap on Timer. */ - if (env & CSR_GCFG_GCIP_ALL) + if (env & CSR_GCFG_GCIP_SECURE) gcfg |= CSR_GCFG_GCI_SECURE; - if (env & CSR_GCFG_MATC_ROOT) + if (env & CSR_GCFG_MATP_ROOT) gcfg |= CSR_GCFG_MATC_ROOT; write_csr_gcfg(gcfg); -- GitLab From d8cc4fee3f8ad21f83326ec8a6d200e04c8f0a00 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Thu, 13 Feb 2025 12:02:56 +0800 Subject: [PATCH 519/989] LoongArch: KVM: Remove duplicated cache attribute setting Cache attribute comes from GPA->HPA secondary mmu page table and is configured when kvm is enabled. It is the same for all VMs, so remove duplicated cache attribute setting on vCPU context switch. Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/vcpu.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index fb72095c8077e..20f941af3e9ea 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -1548,9 +1548,6 @@ static int _kvm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) /* Restore timer state regardless */ kvm_restore_timer(vcpu); - - /* Control guest page CCA attribute */ - change_csr_gcfg(CSR_GCFG_MATC_MASK, CSR_GCFG_MATC_ROOT); kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); /* Restore hardware PMU CSRs */ -- GitLab From 3011b29ec5a33ec16502e687c4264d57416a8b1f Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Thu, 13 Feb 2025 12:02:56 +0800 Subject: [PATCH 520/989] LoongArch: KVM: Set host with kernel mode when switch to VM mode PRMD register is only meaningful on the beginning stage of exception entry, and it is overwritten with nested irq or exception. When CPU runs in VM mode, interrupt need be enabled on host. And the mode for host had better be kernel mode rather than random or user mode. When VM is running, the running mode with top command comes from CRMD register, and running mode should be kernel mode since kernel function is executing with perf command. It needs be consistent with both top and perf command. 
Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/switch.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/kvm/switch.S b/arch/loongarch/kvm/switch.S index 0c292f8184927..1be185e948072 100644 --- a/arch/loongarch/kvm/switch.S +++ b/arch/loongarch/kvm/switch.S @@ -85,7 +85,7 @@ * Guest CRMD comes from separate GCSR_CRMD register */ ori t0, zero, CSR_PRMD_PIE - csrxchg t0, t0, LOONGARCH_CSR_PRMD + csrwr t0, LOONGARCH_CSR_PRMD /* Set PVM bit to setup ertn to guest context */ ori t0, zero, CSR_GSTAT_PVM -- GitLab From 5db843258de1e4e6b1ef1cbd1797923c9e3de548 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Mon, 10 Feb 2025 16:52:15 +0200 Subject: [PATCH 521/989] net: ethernet: ti: am65-cpsw: fix memleak in certain XDP cases If the XDP program doesn't result in XDP_PASS then we leak the memory allocated by am65_cpsw_build_skb(). It is pointless to allocate SKB memory before running the XDP program as we would be wasting CPU cycles for cases other than XDP_PASS. Move the SKB allocation after evaluating the XDP program result. This fixes the memleak. A performance boost is seen for XDP_DROP test. XDP_DROP test: Before: 460256 rx/s 0 err/s After: 784130 rx/s 0 err/s Fixes: 8acacc40f733 ("net: ethernet: ti: am65-cpsw: Add minimal XDP support") Signed-off-by: Roger Quadros Link: https://patch.msgid.link/20250210-am65-cpsw-xdp-fixes-v1-1-ec6b1f7f1aca@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 26 ++++++++++++------------ 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index b663271e79f72..021c5a4df8fd4 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -842,7 +842,8 @@ static void am65_cpsw_nuss_tx_cleanup(void *data, dma_addr_t desc_dma) static struct sk_buff *am65_cpsw_build_skb(void *page_addr, struct net_device *ndev, - unsigned int len) + unsigned int len, + unsigned int headroom) { struct sk_buff *skb; @@ -852,7 +853,7 @@ static struct sk_buff *am65_cpsw_build_skb(void *page_addr, if (unlikely(!skb)) return NULL; - skb_reserve(skb, AM65_CPSW_HEADROOM); + skb_reserve(skb, headroom); skb->dev = ndev; return skb; @@ -1315,16 +1316,8 @@ static int am65_cpsw_nuss_rx_packets(struct am65_cpsw_rx_flow *flow, dev_dbg(dev, "%s rx csum_info:%#x\n", __func__, csum_info); dma_unmap_single(rx_chn->dma_dev, buf_dma, buf_dma_len, DMA_FROM_DEVICE); - k3_cppi_desc_pool_free(rx_chn->desc_pool, desc_rx); - skb = am65_cpsw_build_skb(page_addr, ndev, - AM65_CPSW_MAX_PACKET_SIZE); - if (unlikely(!skb)) { - new_page = page; - goto requeue; - } - if (port->xdp_prog) { xdp_init_buff(&xdp, PAGE_SIZE, &port->xdp_rxq[flow->id]); xdp_prepare_buff(&xdp, page_addr, AM65_CPSW_HEADROOM, @@ -1334,9 +1327,16 @@ static int am65_cpsw_nuss_rx_packets(struct am65_cpsw_rx_flow *flow, if (*xdp_state != AM65_CPSW_XDP_PASS) goto allocate; - /* Compute additional headroom to be reserved */ - headroom = (xdp.data - xdp.data_hard_start) - skb_headroom(skb); - skb_reserve(skb, headroom); + headroom = xdp.data - xdp.data_hard_start; + } else { + headroom = AM65_CPSW_HEADROOM; + } + + skb = am65_cpsw_build_skb(page_addr, ndev, + AM65_CPSW_MAX_PACKET_SIZE, headroom); + if (unlikely(!skb)) { + new_page = page; + goto requeue; } ndev_priv = netdev_priv(ndev); -- GitLab From 8a9f82ff15da03a6804cdd6557fb36ff71c0924f Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Mon, 10 Feb 2025 16:52:16 +0200 
Subject: [PATCH 522/989] net: ethernet: ti: am65-cpsw: fix RX & TX statistics for XDP_TX case For successful XDP_TX and XDP_REDIRECT cases, the packet was received successfully so update RX statistics. Use original received packet length for that. TX packets statistics are incremented on TX completion so don't update it while TX queueing. If xdp_convert_buff_to_frame() fails, increment tx_dropped. Signed-off-by: Roger Quadros Fixes: 8acacc40f733 ("net: ethernet: ti: am65-cpsw: Add minimal XDP support") Link: https://patch.msgid.link/20250210-am65-cpsw-xdp-fixes-v1-2-ec6b1f7f1aca@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 021c5a4df8fd4..38f3be310928d 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -1170,9 +1170,11 @@ static int am65_cpsw_run_xdp(struct am65_cpsw_rx_flow *flow, struct xdp_frame *xdpf; struct bpf_prog *prog; struct page *page; + int pkt_len; u32 act; int err; + pkt_len = *len; prog = READ_ONCE(port->xdp_prog); if (!prog) return AM65_CPSW_XDP_PASS; @@ -1190,8 +1192,10 @@ static int am65_cpsw_run_xdp(struct am65_cpsw_rx_flow *flow, netif_txq = netdev_get_tx_queue(ndev, tx_chn->id); xdpf = xdp_convert_buff_to_frame(xdp); - if (unlikely(!xdpf)) + if (unlikely(!xdpf)) { + ndev->stats.tx_dropped++; goto drop; + } __netif_tx_lock(netif_txq, cpu); err = am65_cpsw_xdp_tx_frame(ndev, tx_chn, xdpf, @@ -1200,14 +1204,14 @@ static int am65_cpsw_run_xdp(struct am65_cpsw_rx_flow *flow, if (err) goto drop; - dev_sw_netstats_tx_add(ndev, 1, *len); + dev_sw_netstats_rx_add(ndev, pkt_len); ret = AM65_CPSW_XDP_CONSUMED; goto out; case XDP_REDIRECT: if (unlikely(xdp_do_redirect(ndev, xdp, prog))) goto drop; - dev_sw_netstats_rx_add(ndev, *len); + dev_sw_netstats_rx_add(ndev, pkt_len); ret = AM65_CPSW_XDP_REDIRECT; goto out; default: -- GitLab From 4542536f664f752db5feba2c5998b165933c34f2 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Mon, 10 Feb 2025 16:52:17 +0200 Subject: [PATCH 523/989] net: ethernet: ti: am65_cpsw: fix tx_cleanup for XDP case For XDP transmit case, swdata doesn't contain SKB but the XDP Frame. Infer the correct swdata based on buffer type and return the XDP Frame for XDP transmit case. 
Signed-off-by: Roger Quadros Fixes: 8acacc40f733 ("net: ethernet: ti: am65-cpsw: Add minimal XDP support") Link: https://patch.msgid.link/20250210-am65-cpsw-xdp-fixes-v1-3-ec6b1f7f1aca@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 38f3be310928d..2806238629f82 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -828,16 +828,24 @@ static void am65_cpsw_nuss_xmit_free(struct am65_cpsw_tx_chn *tx_chn, static void am65_cpsw_nuss_tx_cleanup(void *data, dma_addr_t desc_dma) { struct am65_cpsw_tx_chn *tx_chn = data; + enum am65_cpsw_tx_buf_type buf_type; struct cppi5_host_desc_t *desc_tx; + struct xdp_frame *xdpf; struct sk_buff *skb; void **swdata; desc_tx = k3_cppi_desc_pool_dma2virt(tx_chn->desc_pool, desc_dma); swdata = cppi5_hdesc_get_swdata(desc_tx); - skb = *(swdata); - am65_cpsw_nuss_xmit_free(tx_chn, desc_tx); + buf_type = am65_cpsw_nuss_buf_type(tx_chn, desc_dma); + if (buf_type == AM65_CPSW_TX_BUF_TYPE_SKB) { + skb = *(swdata); + dev_kfree_skb_any(skb); + } else { + xdpf = *(swdata); + xdp_return_frame(xdpf); + } - dev_kfree_skb_any(skb); + am65_cpsw_nuss_xmit_free(tx_chn, desc_tx); } static struct sk_buff *am65_cpsw_build_skb(void *page_addr, -- GitLab From e00a2e5d485faf53c7a24b9d1b575a642227947f Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Wed, 12 Feb 2025 18:18:51 +0200 Subject: [PATCH 524/989] drm: Fix DSC BPP increment decoding Starting with DPCD version 2.0 bits 6:3 of the DP_DSC_BITS_PER_PIXEL_INC DPCD register contains the NativeYCbCr422_MAX_bpp_DELTA field, which can be non-zero as opposed to earlier DPCD versions, hence decoding the bit_per_pixel increment value at bits 2:0 in the same register requires applying a mask, do so. Cc: Ankit Nautiyal Fixes: 0c2287c96521 ("drm/display/dp: Add helper function to get DSC bpp precision") Reviewed-by: Jani Nikula Signed-off-by: Imre Deak Link: https://patchwork.freedesktop.org/patch/msgid/20250212161851.4007005-1-imre.deak@intel.com --- drivers/gpu/drm/display/drm_dp_helper.c | 2 +- include/drm/display/drm_dp.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/display/drm_dp_helper.c b/drivers/gpu/drm/display/drm_dp_helper.c index da3c8521a7fa7..61c7c2c588c6e 100644 --- a/drivers/gpu/drm/display/drm_dp_helper.c +++ b/drivers/gpu/drm/display/drm_dp_helper.c @@ -2544,7 +2544,7 @@ u8 drm_dp_dsc_sink_bpp_incr(const u8 dsc_dpcd[DP_DSC_RECEIVER_CAP_SIZE]) { u8 bpp_increment_dpcd = dsc_dpcd[DP_DSC_BITS_PER_PIXEL_INC - DP_DSC_SUPPORT]; - switch (bpp_increment_dpcd) { + switch (bpp_increment_dpcd & DP_DSC_BITS_PER_PIXEL_MASK) { case DP_DSC_BITS_PER_PIXEL_1_16: return 16; case DP_DSC_BITS_PER_PIXEL_1_8: diff --git a/include/drm/display/drm_dp.h b/include/drm/display/drm_dp.h index a6f8b098c56f1..3bd9f482f0c3e 100644 --- a/include/drm/display/drm_dp.h +++ b/include/drm/display/drm_dp.h @@ -359,6 +359,7 @@ # define DP_DSC_BITS_PER_PIXEL_1_4 0x2 # define DP_DSC_BITS_PER_PIXEL_1_2 0x3 # define DP_DSC_BITS_PER_PIXEL_1_1 0x4 +# define DP_DSC_BITS_PER_PIXEL_MASK 0x7 #define DP_PSR_SUPPORT 0x070 /* XXX 1.2? 
*/ # define DP_PSR_IS_SUPPORTED 1 -- GitLab From d923782b041218ef3804b2fed87619b5b1a497f3 Mon Sep 17 00:00:00 2001 From: Beata Michalska Date: Fri, 31 Jan 2025 15:58:42 +0000 Subject: [PATCH 525/989] arm64: amu: Delay allocating cpumask for AMU FIE support For the time being, the amu_fie_cpus cpumask is being exclusively used by the AMU-related internals of FIE support and is guaranteed to be valid on every access currently made. Still the mask is not being invalidated on one of the error handling code paths, which leaves a soft spot with theoretical risk of UAF for CPUMASK_OFFSTACK cases. To make things sound, delay allocating said cpumask (for CPUMASK_OFFSTACK) avoiding otherwise nasty sanitising case failing to register the cpufreq policy notifications. Signed-off-by: Beata Michalska Reviewed-by: Prasanna Kumar T S M Reviewed-by: Sumit Gupta Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20250131155842.3839098-1-beata.michalska@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/topology.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 1a2c72f3e7f80..cb180684d10d5 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -194,12 +194,19 @@ static void amu_fie_setup(const struct cpumask *cpus) int cpu; /* We are already set since the last insmod of cpufreq driver */ - if (unlikely(cpumask_subset(cpus, amu_fie_cpus))) + if (cpumask_available(amu_fie_cpus) && + unlikely(cpumask_subset(cpus, amu_fie_cpus))) return; - for_each_cpu(cpu, cpus) { + for_each_cpu(cpu, cpus) if (!freq_counters_valid(cpu)) return; + + if (!cpumask_available(amu_fie_cpus) && + !zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) { + WARN_ONCE(1, "Failed to allocate FIE cpumask for CPUs[%*pbl]\n", + cpumask_pr_args(cpus)); + return; } cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus); @@ -237,17 +244,8 @@ static struct notifier_block init_amu_fie_notifier = { static int __init init_amu_fie(void) { - int ret; - - if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) - return -ENOMEM; - - ret = cpufreq_register_notifier(&init_amu_fie_notifier, + return cpufreq_register_notifier(&init_amu_fie_notifier, CPUFREQ_POLICY_NOTIFIER); - if (ret) - free_cpumask_var(amu_fie_cpus); - - return ret; } core_initcall(init_amu_fie); -- GitLab From f818227a2f3d1d4f26469347e428323d61cc83f0 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 28 Jan 2025 00:17:49 +0000 Subject: [PATCH 526/989] ACPI: GTDT: Relax sanity checking on Platform Timers array count Perhaps unsurprisingly there are some platforms where the GTDT isn't quite right and the Platforms Timer array overflows the length of the overall table. While the recently-added sanity checking isn't wrong, it makes it impossible to boot the kernel on offending platforms. Try to hobble along and limit the Platform Timer count to the bounds of the table. 
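A small standalone sketch of the clamping pattern (generic names, not the GTDT parser itself): trust the number of entries that actually fit in the table, warn when the firmware-advertised count disagrees, and keep booting.

#include <stdio.h>

static unsigned int platform_timer_count_sketch(unsigned int parsed,
                                                unsigned int claimed)
{
        unsigned int cnt = claimed;

        if (parsed != claimed) {
                cnt = parsed < claimed ? parsed : claimed;
                fprintf(stderr, "FW_BUG limiting Platform Timer count to %u\n", cnt);
        }
        return cnt;
}

int main(void)
{
        /* Firmware claims 8 timers but only 2 structures fit in the table. */
        printf("using %u platform timers\n", platform_timer_count_sketch(2, 8));
        return 0;
}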
Cc: Marc Zyngier Cc: Lorenzo Pieralisi Cc: Zheng Zengkai Cc: stable@vger.kernel.org Fixes: 263e22d6bd1f ("ACPI: GTDT: Tighten the check for the array of platform timer structures") Signed-off-by: Oliver Upton Acked-by: Marc Zyngier Reviewed-by: Lorenzo Pieralisi Link: https://lore.kernel.org/r/20250128001749.3132656-1-oliver.upton@linux.dev Signed-off-by: Will Deacon --- drivers/acpi/arm64/gtdt.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c index 3561553eff8b5..70f8290b659de 100644 --- a/drivers/acpi/arm64/gtdt.c +++ b/drivers/acpi/arm64/gtdt.c @@ -163,7 +163,7 @@ int __init acpi_gtdt_init(struct acpi_table_header *table, { void *platform_timer; struct acpi_table_gtdt *gtdt; - int cnt = 0; + u32 cnt = 0; gtdt = container_of(table, struct acpi_table_gtdt, header); acpi_gtdt_desc.gtdt = gtdt; @@ -188,13 +188,17 @@ int __init acpi_gtdt_init(struct acpi_table_header *table, cnt++; if (cnt != gtdt->platform_timer_count) { + cnt = min(cnt, gtdt->platform_timer_count); + pr_err(FW_BUG "limiting Platform Timer count to %d\n", cnt); + } + + if (!cnt) { acpi_gtdt_desc.platform_timer = NULL; - pr_err(FW_BUG "invalid timer data.\n"); - return -EINVAL; + return 0; } if (platform_timer_count) - *platform_timer_count = gtdt->platform_timer_count; + *platform_timer_count = cnt; return 0; } -- GitLab From a4cc8494f1d853a0945d2a655b4891935d717355 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 12 Feb 2025 00:30:42 +0000 Subject: [PATCH 527/989] arm64: Add missing registrations of hwcaps Commit 819935464cb2 ("arm64/hwcap: Describe 2024 dpISA extensions to userspace") added definitions for HWCAP_FPRCVT, HWCAP_F8MM8 and HWCAP_F8MM4 but did not include the crucial registration in arm64_elf_hwcaps. Add it. 
Fixes: 819935464cb2 ("arm64/hwcap: Describe 2024 dpISA extensions to userspace") Reported-by: Mark Rutland Signed-off-by: Mark Brown Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20250212-arm64-fix-2024-dpisa-v2-1-67a1c11d6001@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/cpufeature.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index f0910f20fbf8c..d561cf3b8ac7b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -3091,6 +3091,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ISAR0_EL1, TS, FLAGM, CAP_HWCAP, KERNEL_HWCAP_FLAGM), HWCAP_CAP(ID_AA64ISAR0_EL1, TS, FLAGM2, CAP_HWCAP, KERNEL_HWCAP_FLAGM2), HWCAP_CAP(ID_AA64ISAR0_EL1, RNDR, IMP, CAP_HWCAP, KERNEL_HWCAP_RNG), + HWCAP_CAP(ID_AA64ISAR3_EL1, FPRCVT, IMP, CAP_HWCAP, KERNEL_HWCAP_FPRCVT), HWCAP_CAP(ID_AA64PFR0_EL1, FP, IMP, CAP_HWCAP, KERNEL_HWCAP_FP), HWCAP_CAP(ID_AA64PFR0_EL1, FP, FP16, CAP_HWCAP, KERNEL_HWCAP_FPHP), HWCAP_CAP(ID_AA64PFR0_EL1, AdvSIMD, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMD), @@ -3190,6 +3191,8 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64FPFR0_EL1, F8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_F8FMA), HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP4), HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP2), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8MM8, IMP, CAP_HWCAP, KERNEL_HWCAP_F8MM8), + HWCAP_CAP(ID_AA64FPFR0_EL1, F8MM4, IMP, CAP_HWCAP, KERNEL_HWCAP_F8MM4), HWCAP_CAP(ID_AA64FPFR0_EL1, F8E4M3, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E4M3), HWCAP_CAP(ID_AA64FPFR0_EL1, F8E5M2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E5M2), #ifdef CONFIG_ARM64_POE -- GitLab From ed975485a13d1f6080218aa71c29425ba2dfb332 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Tue, 11 Feb 2025 18:22:30 +0000 Subject: [PATCH 528/989] MIPS: Export syscall stack arguments properly for remote use We have several places across the kernel where we want to access another task's syscall arguments, such as ptrace(2), seccomp(2), etc., by making a call to syscall_get_arguments(). This works for register arguments right away by accessing the task's `regs' member of `struct pt_regs', however for stack arguments seen with 32-bit/o32 kernels things are more complicated. Technically they ought to be obtained from the user stack with calls to an access_remote_vm(), but we have an easier way available already. So as to be able to access syscall stack arguments as regular function arguments following the MIPS calling convention we copy them over from the user stack to the kernel stack in arch/mips/kernel/scall32-o32.S, in handle_sys(), to the current stack frame's outgoing argument space at the top of the stack, which is where the handler called expects to see its incoming arguments. This area is also pointed at by the `pt_regs' pointer obtained by task_pt_regs(). Make the o32 stack argument space a proper member of `struct pt_regs' then, by renaming the existing member from `pad0' to `args' and using generated offsets to access the space. No functional change though. With the change in place the o32 kernel stack frame layout at the entry to a syscall handler invoked by handle_sys() is therefore as follows: $sp + 68 -> | ... 
| <- pt_regs.regs[9] +---------------------+ $sp + 64 -> | $t0 | <- pt_regs.regs[8] +---------------------+ $sp + 60 -> | $a3/argument #4 | <- pt_regs.regs[7] +---------------------+ $sp + 56 -> | $a2/argument #3 | <- pt_regs.regs[6] +---------------------+ $sp + 52 -> | $a1/argument #2 | <- pt_regs.regs[5] +---------------------+ $sp + 48 -> | $a0/argument #1 | <- pt_regs.regs[4] +---------------------+ $sp + 44 -> | $v1 | <- pt_regs.regs[3] +---------------------+ $sp + 40 -> | $v0 | <- pt_regs.regs[2] +---------------------+ $sp + 36 -> | $at | <- pt_regs.regs[1] +---------------------+ $sp + 32 -> | $zero | <- pt_regs.regs[0] +---------------------+ $sp + 28 -> | stack argument #8 | <- pt_regs.args[7] +---------------------+ $sp + 24 -> | stack argument #7 | <- pt_regs.args[6] +---------------------+ $sp + 20 -> | stack argument #6 | <- pt_regs.args[5] +---------------------+ $sp + 16 -> | stack argument #5 | <- pt_regs.args[4] +---------------------+ $sp + 12 -> | psABI space for $a3 | <- pt_regs.args[3] +---------------------+ $sp + 8 -> | psABI space for $a2 | <- pt_regs.args[2] +---------------------+ $sp + 4 -> | psABI space for $a1 | <- pt_regs.args[1] +---------------------+ $sp + 0 -> | psABI space for $a0 | <- pt_regs.args[0] +---------------------+ holding user data received and with the first 4 frame slots reserved by the psABI for the compiler to spill the incoming arguments from $a0-$a3 registers (which it sometimes does according to its needs) and the next 4 frame slots designated by the psABI for any stack function arguments that follow. This data is also available for other tasks to peek/poke at as reqired and where permitted. Signed-off-by: Maciej W. Rozycki Signed-off-by: Thomas Bogendoerfer --- arch/mips/include/asm/ptrace.h | 4 ++-- arch/mips/kernel/asm-offsets.c | 6 ++++++ arch/mips/kernel/scall32-o32.S | 8 ++++---- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h index 4a2b40ce39e09..85fa9962266a2 100644 --- a/arch/mips/include/asm/ptrace.h +++ b/arch/mips/include/asm/ptrace.h @@ -27,8 +27,8 @@ */ struct pt_regs { #ifdef CONFIG_32BIT - /* Pad bytes for argument save space on the stack. */ - unsigned long pad0[8]; + /* Saved syscall stack arguments; entries 0-3 unused. */ + unsigned long args[8]; #endif /* Saved main processor registers. 
*/ diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index cb1045ebab062..b910ec54a3a17 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -27,6 +27,12 @@ void output_ptreg_defines(void); void output_ptreg_defines(void) { COMMENT("MIPS pt_regs offsets."); +#ifdef CONFIG_32BIT + OFFSET(PT_ARG4, pt_regs, args[4]); + OFFSET(PT_ARG5, pt_regs, args[5]); + OFFSET(PT_ARG6, pt_regs, args[6]); + OFFSET(PT_ARG7, pt_regs, args[7]); +#endif OFFSET(PT_R0, pt_regs, regs[0]); OFFSET(PT_R1, pt_regs, regs[1]); OFFSET(PT_R2, pt_regs, regs[2]); diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index 2c604717e6308..4947a4f39e371 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -64,10 +64,10 @@ load_a6: user_lw(t7, 24(t0)) # argument #7 from usp load_a7: user_lw(t8, 28(t0)) # argument #8 from usp loads_done: - sw t5, 16(sp) # argument #5 to ksp - sw t6, 20(sp) # argument #6 to ksp - sw t7, 24(sp) # argument #7 to ksp - sw t8, 28(sp) # argument #8 to ksp + sw t5, PT_ARG4(sp) # argument #5 to ksp + sw t6, PT_ARG5(sp) # argument #6 to ksp + sw t7, PT_ARG6(sp) # argument #7 to ksp + sw t8, PT_ARG7(sp) # argument #8 to ksp .set pop .section __ex_table,"a" -- GitLab From 733a90561ad0a4a74035d2d627098da85d43b592 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Wed, 12 Feb 2025 01:02:09 +0200 Subject: [PATCH 529/989] MIPS: fix mips_get_syscall_arg() for o32 This makes ptrace/get_syscall_info selftest pass on mips o32 and mips64 o32 by fixing the following two test assertions: 1. get_syscall_info test assertion on mips o32: # get_syscall_info.c:218:get_syscall_info:Expected exp_args[5] (3134521044) == info.entry.args[4] (4911432) # get_syscall_info.c:219:get_syscall_info:wait #1: entry stop mismatch 2. get_syscall_info test assertion on mips64 o32: # get_syscall_info.c:209:get_syscall_info:Expected exp_args[2] (3134324433) == info.entry.args[1] (18446744072548908753) # get_syscall_info.c:210:get_syscall_info:wait #1: entry stop mismatch The first assertion happens due to mips_get_syscall_arg() trying to access another task's context but failing to do it properly because get_user() it calls just peeks at the current task's context. It usually does not crash because the default user stack always gets assigned the same VMA, but it is pure luck which mips_get_syscall_arg() wouldn't have if e.g. the stack was switched (via setcontext(3) or however) or a non-default process's thread peeked at, and in any case irrelevant data is obtained just as observed with the test case. mips_get_syscall_arg() ought to be using access_remote_vm() instead to retrieve the other task's stack contents, but given that the data has been already obtained and saved in `struct pt_regs' it would be an overkill. The first assertion is fixed for mips o32 by using struct pt_regs.args instead of get_user() to obtain syscall arguments. This approach works due to this piece in arch/mips/kernel/scall32-o32.S: /* * Ok, copy the args from the luser stack to the kernel stack. 
*/ .set push .set noreorder .set nomacro load_a4: user_lw(t5, 16(t0)) # argument #5 from usp load_a5: user_lw(t6, 20(t0)) # argument #6 from usp load_a6: user_lw(t7, 24(t0)) # argument #7 from usp load_a7: user_lw(t8, 28(t0)) # argument #8 from usp loads_done: sw t5, PT_ARG4(sp) # argument #5 to ksp sw t6, PT_ARG5(sp) # argument #6 to ksp sw t7, PT_ARG6(sp) # argument #7 to ksp sw t8, PT_ARG7(sp) # argument #8 to ksp .set pop .section __ex_table,"a" PTR_WD load_a4, bad_stack_a4 PTR_WD load_a5, bad_stack_a5 PTR_WD load_a6, bad_stack_a6 PTR_WD load_a7, bad_stack_a7 .previous arch/mips/kernel/scall64-o32.S has analogous code for mips64 o32 that allows fixing the issue by obtaining syscall arguments from struct pt_regs.regs[4..11] instead of the erroneous use of get_user(). The second assertion is fixed by truncating 64-bit values to 32-bit syscall arguments. Fixes: c0ff3c53d4f9 ("MIPS: Enable HAVE_ARCH_TRACEHOOK.") Signed-off-by: Dmitry V. Levin Signed-off-by: Thomas Bogendoerfer --- arch/mips/include/asm/syscall.h | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/arch/mips/include/asm/syscall.h b/arch/mips/include/asm/syscall.h index ebdf4d910af2f..056aa1b713e23 100644 --- a/arch/mips/include/asm/syscall.h +++ b/arch/mips/include/asm/syscall.h @@ -57,37 +57,21 @@ static inline void mips_syscall_update_nr(struct task_struct *task, static inline void mips_get_syscall_arg(unsigned long *arg, struct task_struct *task, struct pt_regs *regs, unsigned int n) { - unsigned long usp __maybe_unused = regs->regs[29]; - +#ifdef CONFIG_32BIT switch (n) { case 0: case 1: case 2: case 3: *arg = regs->regs[4 + n]; - - return; - -#ifdef CONFIG_32BIT - case 4: case 5: case 6: case 7: - get_user(*arg, (int *)usp + n); return; -#endif - -#ifdef CONFIG_64BIT case 4: case 5: case 6: case 7: -#ifdef CONFIG_MIPS32_O32 - if (test_tsk_thread_flag(task, TIF_32BIT_REGS)) - get_user(*arg, (int *)usp + n); - else -#endif - *arg = regs->regs[4 + n]; - + *arg = regs->args[n]; return; -#endif - - default: - BUG(); } - - unreachable(); +#else + *arg = regs->regs[4 + n]; + if ((IS_ENABLED(CONFIG_MIPS32_O32) && + test_tsk_thread_flag(task, TIF_32BIT_REGS))) + *arg = (unsigned int)*arg; +#endif } static inline long syscall_get_error(struct task_struct *task, -- GitLab From 446a8351f160d65a1c5df7097f31c74102ed2bb1 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Mon, 10 Feb 2025 17:37:32 +0100 Subject: [PATCH 530/989] arm64: rust: clean Rust 1.85.0 warning using softfloat target Starting with Rust 1.85.0 (to be released 2025-02-20), `rustc` warns [1] about disabling neon in the aarch64 hardfloat target: warning: target feature `neon` cannot be toggled with `-Ctarget-feature`: unsound on hard-float targets because it changes float ABI | = note: this was previously accepted by the compiler but is being phased out; it will become a hard error in a future release! = note: for more information, see issue #116344 Thus, instead, use the softfloat target instead. While trying it out, I found that the kernel sanitizers were not enabled for that built-in target [2]. Upstream Rust agreed to backport the enablement for the current beta so that it is ready for the 1.85.0 release [3] -- thanks! However, that still means that before Rust 1.85.0, we cannot switch since sanitizers could be in use. Thus conditionally do so. Cc: stable@vger.kernel.org # Needed in 6.12.y and 6.13.y only (Rust is pinned in older LTSs). 
Cc: Catalin Marinas Cc: Will Deacon Cc: Matthew Maurer Cc: Alice Ryhl Cc: Ralf Jung Cc: Jubilee Young Link: https://github.com/rust-lang/rust/pull/133417 [1] Link: https://rust-lang.zulipchat.com/#narrow/channel/131828-t-compiler/topic/arm64.20neon.20.60-Ctarget-feature.60.20warning/near/495358442 [2] Link: https://github.com/rust-lang/rust/pull/135905 [3] Link: https://github.com/rust-lang/rust/issues/116344 Signed-off-by: Miguel Ojeda Reviewed-by: Trevor Gross Tested-by: Matthew Maurer Reviewed-by: Ralf Jung Reviewed-by: Alice Ryhl Link: https://lore.kernel.org/r/20250210163732.281786-1-ojeda@kernel.org Signed-off-by: Will Deacon --- arch/arm64/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 358c68565bfd0..2b25d671365f2 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -48,7 +48,11 @@ KBUILD_CFLAGS += $(CC_FLAGS_NO_FPU) \ KBUILD_CFLAGS += $(call cc-disable-warning, psabi) KBUILD_AFLAGS += $(compat_vdso) +ifeq ($(call test-ge, $(CONFIG_RUSTC_VERSION), 108500),y) +KBUILD_RUSTFLAGS += --target=aarch64-unknown-none-softfloat +else KBUILD_RUSTFLAGS += --target=aarch64-unknown-none -Ctarget-feature="-neon" +endif KBUILD_CFLAGS += $(call cc-option,-mabi=lp64) KBUILD_AFLAGS += $(call cc-option,-mabi=lp64) -- GitLab From 85fcb57c983f423180ba6ec5d0034242da05cc54 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 10 Feb 2025 08:43:39 +0100 Subject: [PATCH 531/989] xen/swiotlb: relax alignment requirements When mapping a buffer for DMA via .map_page or .map_sg DMA operations, there is no need to check the machine frames to be aligned according to the mapped areas size. All what is needed in these cases is that the buffer is contiguous at machine level. So carve out the alignment check from range_straddles_page_boundary() and move it to a helper called by xen_swiotlb_alloc_coherent() and xen_swiotlb_free_coherent() directly. Fixes: 9f40ec84a797 ("xen/swiotlb: add alignment check for dma buffers") Reported-by: Jan Vejvalka Tested-by: Jan Vejvalka Signed-off-by: Juergen Gross Reviewed-by: Stefano Stabellini Signed-off-by: Juergen Gross --- drivers/xen/swiotlb-xen.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index a337edcf8faf7..26c62e0d34e98 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -74,19 +74,21 @@ static inline phys_addr_t xen_dma_to_phys(struct device *dev, return xen_bus_to_phys(dev, dma_to_phys(dev, dma_addr)); } +static inline bool range_requires_alignment(phys_addr_t p, size_t size) +{ + phys_addr_t algn = 1ULL << (get_order(size) + PAGE_SHIFT); + phys_addr_t bus_addr = pfn_to_bfn(XEN_PFN_DOWN(p)) << XEN_PAGE_SHIFT; + + return IS_ALIGNED(p, algn) && !IS_ALIGNED(bus_addr, algn); +} + static inline int range_straddles_page_boundary(phys_addr_t p, size_t size) { unsigned long next_bfn, xen_pfn = XEN_PFN_DOWN(p); unsigned int i, nr_pages = XEN_PFN_UP(xen_offset_in_page(p) + size); - phys_addr_t algn = 1ULL << (get_order(size) + PAGE_SHIFT); next_bfn = pfn_to_bfn(xen_pfn); - /* If buffer is physically aligned, ensure DMA alignment. 
*/ - if (IS_ALIGNED(p, algn) && - !IS_ALIGNED((phys_addr_t)next_bfn << XEN_PAGE_SHIFT, algn)) - return 1; - for (i = 1; i < nr_pages; i++) if (pfn_to_bfn(++xen_pfn) != ++next_bfn) return 1; @@ -156,7 +158,8 @@ xen_swiotlb_alloc_coherent(struct device *dev, size_t size, *dma_handle = xen_phys_to_dma(dev, phys); if (*dma_handle + size - 1 > dma_mask || - range_straddles_page_boundary(phys, size)) { + range_straddles_page_boundary(phys, size) || + range_requires_alignment(phys, size)) { if (xen_create_contiguous_region(phys, order, fls64(dma_mask), dma_handle) != 0) goto out_free_pages; @@ -182,7 +185,8 @@ xen_swiotlb_free_coherent(struct device *dev, size_t size, void *vaddr, size = ALIGN(size, XEN_PAGE_SIZE); if (WARN_ON_ONCE(dma_handle + size - 1 > dev->coherent_dma_mask) || - WARN_ON_ONCE(range_straddles_page_boundary(phys, size))) + WARN_ON_ONCE(range_straddles_page_boundary(phys, size) || + range_requires_alignment(phys, size))) return; if (TestClearPageXenRemapped(virt_to_page(vaddr))) -- GitLab From e93ec87286bd1fd30b7389e7a387cfb259f297e3 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 11 Feb 2025 11:16:28 +0100 Subject: [PATCH 532/989] x86/xen: allow larger contiguous memory regions in PV guests Today a PV guest (including dom0) can create 2MB contiguous memory regions for DMA buffers at max. This has led to problems at least with the megaraid_sas driver, which wants to allocate a 2.3MB DMA buffer. The limiting factor is the frame array used to do the hypercall for making the memory contiguous, which has 512 entries and is just a static array in mmu_pv.c. In order to not waste memory for non-PV guests, put the initial frame array into .init.data section and dynamically allocate an array from the .init_after_bootmem hook of PV guests. In case a contiguous memory area larger than the initially supported 2MB is requested, allocate a larger buffer for the frame list. Note that such an allocation is tried only after memory management has been initialized properly, which is tested via a flag being set in the .init_after_bootmem hook. Fixes: 9f40ec84a797 ("xen/swiotlb: add alignment check for dma buffers") Signed-off-by: Juergen Gross Tested-by: Alan Robinson Reviewed-by: Jan Beulich Signed-off-by: Juergen Gross --- arch/x86/xen/mmu_pv.c | 71 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 62 insertions(+), 9 deletions(-) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 2c70cd35e72c5..d078de2c952b3 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -111,6 +111,51 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; */ static DEFINE_SPINLOCK(xen_reservation_lock); +/* Protected by xen_reservation_lock. 
*/ +#define MIN_CONTIG_ORDER 9 /* 2MB */ +static unsigned int discontig_frames_order = MIN_CONTIG_ORDER; +static unsigned long discontig_frames_early[1UL << MIN_CONTIG_ORDER] __initdata; +static unsigned long *discontig_frames __refdata = discontig_frames_early; +static bool discontig_frames_dyn; + +static int alloc_discontig_frames(unsigned int order) +{ + unsigned long *new_array, *old_array; + unsigned int old_order; + unsigned long flags; + + BUG_ON(order < MIN_CONTIG_ORDER); + BUILD_BUG_ON(sizeof(discontig_frames_early) != PAGE_SIZE); + + new_array = (unsigned long *)__get_free_pages(GFP_KERNEL, + order - MIN_CONTIG_ORDER); + if (!new_array) + return -ENOMEM; + + spin_lock_irqsave(&xen_reservation_lock, flags); + + old_order = discontig_frames_order; + + if (order > discontig_frames_order || !discontig_frames_dyn) { + if (!discontig_frames_dyn) + old_array = NULL; + else + old_array = discontig_frames; + + discontig_frames = new_array; + discontig_frames_order = order; + discontig_frames_dyn = true; + } else { + old_array = new_array; + } + + spin_unlock_irqrestore(&xen_reservation_lock, flags); + + free_pages((unsigned long)old_array, old_order - MIN_CONTIG_ORDER); + + return 0; +} + /* * Note about cr3 (pagetable base) values: * @@ -814,6 +859,9 @@ static void __init xen_after_bootmem(void) SetPagePinned(virt_to_page(level3_user_vsyscall)); #endif xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); + + if (alloc_discontig_frames(MIN_CONTIG_ORDER)) + BUG(); } static void xen_unpin_page(struct mm_struct *mm, struct page *page, @@ -2203,10 +2251,6 @@ void __init xen_init_mmu_ops(void) memset(dummy_mapping, 0xff, PAGE_SIZE); } -/* Protected by xen_reservation_lock. */ -#define MAX_CONTIG_ORDER 9 /* 2MB */ -static unsigned long discontig_frames[1< MAX_CONTIG_ORDER)) - return -ENOMEM; + if (unlikely(order > discontig_frames_order)) { + if (!discontig_frames_dyn) + return -ENOMEM; + + if (alloc_discontig_frames(order)) + return -ENOMEM; + } memset((void *) vstart, 0, PAGE_SIZE << order); spin_lock_irqsave(&xen_reservation_lock, flags); + in_frames = discontig_frames; + /* 1. Zap current PTEs, remembering MFNs. */ xen_zap_pfn_range(vstart, order, in_frames, NULL); @@ -2358,12 +2409,12 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) { - unsigned long *out_frames = discontig_frames, in_frame; + unsigned long *out_frames, in_frame; unsigned long flags; int success; unsigned long vstart; - if (unlikely(order > MAX_CONTIG_ORDER)) + if (unlikely(order > discontig_frames_order)) return; vstart = (unsigned long)phys_to_virt(pstart); @@ -2371,6 +2422,8 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) spin_lock_irqsave(&xen_reservation_lock, flags); + out_frames = discontig_frames; + /* 1. Find start MFN of contiguous extent. */ in_frame = virt_to_mfn((void *)vstart); -- GitLab From 75ad02318af2e4ae669e26a79f001bd5e1f97472 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 12 Feb 2025 16:14:38 +0100 Subject: [PATCH 533/989] Xen/swiotlb: mark xen_swiotlb_fixup() __init It's sole user (pci_xen_swiotlb_init()) is __init, too. 
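For context, __init places the function in the .init.text section, which the kernel frees once boot completes; that is only safe because every caller also runs at boot time. A minimal illustrative sketch of the pattern (the example_* names are made up and are not the in-tree code):

    /* Callee and its only caller both live in .init.text, so the
     * memory for both is released after boot. */
    static int __init example_fixup(void *buf, unsigned long nslabs)
    {
            return 0;
    }

    static void __init example_boot_setup(void)
    {
            example_fixup(NULL, 32);        /* only reachable during boot */
    }
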
Signed-off-by: Jan Beulich Reviewed-by: Stefano Stabellini Message-ID: Signed-off-by: Juergen Gross --- drivers/xen/swiotlb-xen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 26c62e0d34e98..1f65795cf5d7a 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -113,7 +113,7 @@ static struct io_tlb_pool *xen_swiotlb_find_pool(struct device *dev, } #ifdef CONFIG_X86 -int xen_swiotlb_fixup(void *buf, unsigned long nslabs) +int __init xen_swiotlb_fixup(void *buf, unsigned long nslabs) { int rc; unsigned int order = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT); -- GitLab From 4cf7d58620bfc2ebe934e3dfa97208f13f14ab8b Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Sun, 9 Feb 2025 09:46:50 +0530 Subject: [PATCH 534/989] genirq: Remove unused CONFIG_GENERIC_PENDING_IRQ_CHIPFLAGS CONFIG_GENERIC_PENDING_IRQ_CHIPFLAGS is not used anymore, hence remove it. Fixes: f94a18249b7f ("genirq: Remove IRQ_MOVE_PCNTXT and related code") Signed-off-by: Anup Patel Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250209041655.331470-7-apatel@ventanamicro.com --- kernel/irq/Kconfig | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 5432418c0feaf..875f25ed6f710 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -31,10 +31,6 @@ config GENERIC_IRQ_EFFECTIVE_AFF_MASK config GENERIC_PENDING_IRQ bool -# Deduce delayed migration from top-level interrupt chip flags -config GENERIC_PENDING_IRQ_CHIPFLAGS - bool - # Support for generic irq migrating off cpu before the cpu is offline. config GENERIC_IRQ_MIGRATION bool -- GitLab From ab027c488fc4a1fff0a5b712d4bdb2d2d324e8f8 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Thu, 23 Jan 2025 14:34:41 +0800 Subject: [PATCH 535/989] firmware: arm_scmi: imx: Correct tx size of scmi_imx_misc_ctrl_set 'struct scmi_imx_misc_ctrl_set_in' has a zero length array in the end, The sizeof will not count 'value[]', and hence Tx size will be smaller than actual size for Tx,and SCMI firmware will flag this as protocol error. Fix this by enlarge the Tx size with 'num * sizeof(__le32)' to count in the size of data. 
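The root cause is the usual flexible-array-member pitfall: sizeof() on a structure that ends in a zero-length/flexible array only covers the fixed header, so the trailing elements have to be added explicitly. A hedged sketch of the arithmetic (field names are illustrative, not the exact SCMI structure layout):

    struct ctrl_set_in {
            __le32 id;
            __le32 num;
            __le32 value[];                 /* flexible array member */
    };

    /* sizeof(struct ctrl_set_in) is 8 here; value[] contributes nothing,
     * so the real transmit size must be computed by hand: */
    size_t tx_size = sizeof(struct ctrl_set_in) + num * sizeof(__le32);
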
Fixes: 61c9f03e22fc ("firmware: arm_scmi: Add initial support for i.MX MISC protocol") Reviewed-by: Jacky Bai Tested-by: Shengjiu Wang Acked-by: Jason Liu Signed-off-by: Peng Fan Message-Id: <20250123063441.392555-1-peng.fan@oss.nxp.com> (sudeep.holla: Commit rewording and replace hardcoded sizeof(__le32) value) Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/vendors/imx/imx-sm-misc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/arm_scmi/vendors/imx/imx-sm-misc.c b/drivers/firmware/arm_scmi/vendors/imx/imx-sm-misc.c index 83b69fc4fba5b..a8915d3b4df51 100644 --- a/drivers/firmware/arm_scmi/vendors/imx/imx-sm-misc.c +++ b/drivers/firmware/arm_scmi/vendors/imx/imx-sm-misc.c @@ -254,8 +254,8 @@ static int scmi_imx_misc_ctrl_set(const struct scmi_protocol_handle *ph, if (num > max_num) return -EINVAL; - ret = ph->xops->xfer_get_init(ph, SCMI_IMX_MISC_CTRL_SET, sizeof(*in), - 0, &t); + ret = ph->xops->xfer_get_init(ph, SCMI_IMX_MISC_CTRL_SET, + sizeof(*in) + num * sizeof(__le32), 0, &t); if (ret) return ret; -- GitLab From 81f64e925c29fe6e99f04b131fac1935ac931e81 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 12 Feb 2025 13:35:16 -0600 Subject: [PATCH 536/989] PCI: Avoid FLR for Mediatek MT7922 WiFi MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Mediatek MT7922 WiFi device advertises FLR support, but it apparently does not work, and all subsequent config reads return ~0: pci 0000:01:00.0: [14c3:0616] type 00 class 0x028000 PCIe Endpoint pciback 0000:01:00.0: not ready 65535ms after FLR; giving up After an FLR, pci_dev_wait() waits for the device to become ready. Prior to d591f6804e7e ("PCI: Wait for device readiness with Configuration RRS"), it polls PCI_COMMAND until it is something other that PCI_POSSIBLE_ERROR (~0). If it times out, pci_dev_wait() returns -ENOTTY and __pci_reset_function_locked() tries the next available reset method. Typically this is Secondary Bus Reset, which does work, so the MT7922 is eventually usable. After d591f6804e7e, if Configuration Request Retry Status Software Visibility (RRS SV) is enabled, pci_dev_wait() polls PCI_VENDOR_ID until it is something other than the special 0x0001 Vendor ID that indicates a completion with RRS status. When RRS SV is enabled, reads of PCI_VENDOR_ID should return either 0x0001, i.e., the config read was completed with RRS, or a valid Vendor ID. On the MT7922, it seems that all config reads after FLR return ~0 indefinitely. When pci_dev_wait() reads PCI_VENDOR_ID and gets 0xffff, it assumes that's a valid Vendor ID and the device is now ready, so it returns with success. After pci_dev_wait() returns success, we restore config space and continue. Since the MT7922 is not actually ready after the FLR, the restore fails and the device is unusable. We considered changing pci_dev_wait() to continue polling if a PCI_VENDOR_ID read returns either 0x0001 or 0xffff. This "works" as it did before d591f6804e7e, although we have to wait for the timeout and then fall back to SBR. But it doesn't work for SR-IOV VFs, which *always* return 0xffff as the Vendor ID. Mark Mediatek MT7922 WiFi devices to avoid the use of FLR completely. This will cause fallback to another reset method, such as SBR. 
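For readers unfamiliar with the wait logic described above, a simplified sketch of the RRS-SV polling (illustrative only; 'dev' and 'delay' stand in for the device pointer and back-off interval, and the real implementation is pci_dev_wait() in drivers/pci/pci.c, which also handles timeouts and the non-RRS fallback):

    u16 vendor;

    for (;;) {
            pci_read_config_word(dev, PCI_VENDOR_ID, &vendor);
            if (vendor != 0x0001)           /* RRS completion marker */
                    break;                  /* 0xffff wrongly counts as "ready" */
            msleep(delay);
    }

On the MT7922 every post-FLR config read returns ~0, so a loop like the one above exits immediately on 0xffff and the device is treated as ready even though it is not.
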
Link: https://lore.kernel.org/r/20250212193516.88741-1-helgaas@kernel.org Fixes: d591f6804e7e ("PCI: Wait for device readiness with Configuration RRS") Link: https://github.com/QubesOS/qubes-issues/issues/9689#issuecomment-2582927149 Link: https://lore.kernel.org/r/Z4pHll_6GX7OUBzQ@mail-itl Signed-off-by: Bjorn Helgaas Tested-by: Marek Marczykowski-Górecki Cc: stable@vger.kernel.org --- drivers/pci/quirks.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index b84ff7bade822..82b21e34c545e 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -5522,7 +5522,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x443, quirk_intel_qat_vf_cap); * AMD Matisse USB 3.0 Host Controller 0x149c * Intel 82579LM Gigabit Ethernet Controller 0x1502 * Intel 82579V Gigabit Ethernet Controller 0x1503 - * + * Mediatek MT7922 802.11ax PCI Express Wireless Network Adapter */ static void quirk_no_flr(struct pci_dev *dev) { @@ -5534,6 +5534,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x149c, quirk_no_flr); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x7901, quirk_no_flr); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x1502, quirk_no_flr); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x1503, quirk_no_flr); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_MEDIATEK, 0x0616, quirk_no_flr); /* FLR may cause the SolidRun SNET DPU (rev 0x1) to hang */ static void quirk_no_flr_snet(struct pci_dev *dev) -- GitLab From 1d0013962d220b166d9f7c9fe2746f1542e459a3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Feb 2025 22:23:59 +0000 Subject: [PATCH 537/989] netfs: Fix a number of read-retry hangs Fix a number of hangs in the netfslib read-retry code, including: (1) netfs_reissue_read() doubles up the getting of references on subrequests, thereby leaking the subrequest and causing inode eviction to wait indefinitely. This can lead to the kernel reporting a hang in the filesystem's evict_inode(). Fix this by removing the get from netfs_reissue_read() and adding one to netfs_retry_read_subrequests() to deal with the one place that didn't double up. (2) The loop in netfs_retry_read_subrequests() that retries a sequence of failed subrequests doesn't record whether or not it retried the one that the "subreq" pointer points to when it leaves the loop. It may not if renegotiation/repreparation of the subrequests means that fewer subrequests are needed to span the cumulative range of the sequence. Because it doesn't record this, the piece of code that discards now-superfluous subrequests doesn't know whether it should discard the one "subreq" points to - and so it doesn't. Fix this by noting whether the last subreq it examines is superfluous and if it is, then getting rid of it and all subsequent subrequests. If that one one wasn't superfluous, then we would have tried to go round the previous loop again and so there can be no further unretried subrequests in the sequence. (3) netfs_retry_read_subrequests() gets yet an extra ref on any additional subrequests it has to get because it ran out of ones it could reuse to to renegotiation/repreparation shrinking the subrequests. Fix this by removing that extra ref. (4) In netfs_retry_reads(), it was using wait_on_bit() to wait for NETFS_SREQ_IN_PROGRESS to be cleared on all subrequests in the sequence - but netfs_read_subreq_terminated() is now using a wait queue on the request instead and so this wait will never finish. Fix this by waiting on the wait queue instead. 
To make this work, a new flag, NETFS_RREQ_RETRYING, is now set around the wait loop to tell the wake-up code to wake up the wait queue rather than requeuing the request's work item. Note that this flag replaces the NETFS_RREQ_NEED_RETRY flag which is no longer used. (5) Whilst not strictly anything to do with the hang, netfs_retry_read_subrequests() was also doubly incrementing the subreq_counter and re-setting the debug index, leaving a gap in the trace. This is also fixed. One of these hangs was observed with 9p and with cifs. Others were forced by manual code injection into fs/afs/file.c. Firstly, afs_prepare_read() was created to provide an changing pattern of maximum subrequest sizes: static int afs_prepare_read(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; if (!S_ISREG(subreq->rreq->inode->i_mode)) return 0; if (subreq->retry_count < 20) rreq->io_streams[0].sreq_max_len = umax(200, 2222 - subreq->retry_count * 40); else rreq->io_streams[0].sreq_max_len = 3333; return 0; } and pointed to by afs_req_ops. Then the following: struct netfs_io_subrequest *subreq = op->fetch.subreq; if (subreq->error == 0 && S_ISREG(subreq->rreq->inode->i_mode) && subreq->retry_count < 20) { subreq->transferred = subreq->already_done; __clear_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); afs_fetch_data_notify(op); return; } was inserted into afs_fetch_data_success() at the beginning and struct netfs_io_subrequest given an extra field, "already_done" that was set to the value in "subreq->transferred" by netfs_reissue_read(). When reading a 4K file, the subrequests would get gradually smaller, a new subrequest would be allocated around the 3rd retry and then eventually be rendered superfluous when the 20th retry was hit and the limit on the first subrequest was eased. Fixes: e2d46f2ec332 ("netfs: Change the read result collector to only use one work item") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20250212222402.3618494-2-dhowells@redhat.com Tested-by: Marc Dionne Tested-by: Steve French cc: Ihor Solodrai cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Dominique Martinet cc: Christian Schoenebeck cc: Paulo Alcantara cc: Jeff Layton cc: v9fs@lists.linux.dev cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/read_collect.c | 6 ++++-- fs/netfs/read_retry.c | 40 +++++++++++++++++++++++++++--------- include/linux/netfs.h | 2 +- include/trace/events/netfs.h | 4 +++- 4 files changed, 38 insertions(+), 14 deletions(-) diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index f65affa5a9e4a..636cc5a98ef57 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -470,7 +470,8 @@ void netfs_read_collection_worker(struct work_struct *work) */ void netfs_wake_read_collector(struct netfs_io_request *rreq) { - if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) { + if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) && + !test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) { if (!work_pending(&rreq->work)) { netfs_get_request(rreq, netfs_rreq_trace_get_work); if (!queue_work(system_unbound_wq, &rreq->work)) @@ -586,7 +587,8 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq) smp_mb__after_atomic(); /* Clear IN_PROGRESS before task state */ /* If we are at the head of the queue, wake up the collector. 
*/ - if (list_is_first(&subreq->rreq_link, &stream->subrequests)) + if (list_is_first(&subreq->rreq_link, &stream->subrequests) || + test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) netfs_wake_read_collector(rreq); netfs_put_subrequest(subreq, true, netfs_sreq_trace_put_terminated); diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 2290af0d51acc..8316c4533a51d 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -14,7 +14,6 @@ static void netfs_reissue_read(struct netfs_io_request *rreq, { __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); - netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); subreq->rreq->netfs_ops->issue_read(subreq); } @@ -48,6 +47,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); subreq->retry_count++; netfs_reset_iter(subreq); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); netfs_reissue_read(rreq, subreq); } } @@ -75,7 +75,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) struct iov_iter source; unsigned long long start, len; size_t part; - bool boundary = false; + bool boundary = false, subreq_superfluous = false; /* Go through the subreqs and find the next span of contiguous * buffer that we then rejig (cifs, for example, needs the @@ -116,8 +116,10 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) /* Work through the sublist. */ subreq = from; list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) { - if (!len) + if (!len) { + subreq_superfluous = true; break; + } subreq->source = NETFS_DOWNLOAD_FROM_SERVER; subreq->start = start - subreq->transferred; subreq->len = len + subreq->transferred; @@ -154,19 +156,21 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); netfs_reissue_read(rreq, subreq); - if (subreq == to) + if (subreq == to) { + subreq_superfluous = false; break; + } } /* If we managed to use fewer subreqs, we can discard the * excess; if we used the same number, then we're done. */ if (!len) { - if (subreq == to) + if (!subreq_superfluous) continue; list_for_each_entry_safe_from(subreq, tmp, &stream->subrequests, rreq_link) { - trace_netfs_sreq(subreq, netfs_sreq_trace_discard); + trace_netfs_sreq(subreq, netfs_sreq_trace_superfluous); list_del(&subreq->rreq_link); netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); if (subreq == to) @@ -187,14 +191,12 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) subreq->source = NETFS_DOWNLOAD_FROM_SERVER; subreq->start = start; subreq->len = len; - subreq->debug_index = atomic_inc_return(&rreq->subreq_counter); subreq->stream_nr = stream->stream_nr; subreq->retry_count = 1; trace_netfs_sreq_ref(rreq->debug_id, subreq->debug_index, refcount_read(&subreq->ref), netfs_sreq_trace_new); - netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); list_add(&subreq->rreq_link, &to->rreq_link); to = list_next_entry(to, rreq_link); @@ -256,14 +258,32 @@ void netfs_retry_reads(struct netfs_io_request *rreq) { struct netfs_io_subrequest *subreq; struct netfs_io_stream *stream = &rreq->io_streams[0]; + DEFINE_WAIT(myself); + + set_bit(NETFS_RREQ_RETRYING, &rreq->flags); /* Wait for all outstanding I/O to quiesce before performing retries as * we may need to renegotiate the I/O sizes. 
*/ list_for_each_entry(subreq, &stream->subrequests, rreq_link) { - wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS, - TASK_UNINTERRUPTIBLE); + if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) + continue; + + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); + for (;;) { + prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); + + if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) + break; + + trace_netfs_sreq(subreq, netfs_sreq_trace_wait_for); + schedule(); + trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); + } + + finish_wait(&rreq->waitq, &myself); } + clear_bit(NETFS_RREQ_RETRYING, &rreq->flags); trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); netfs_retry_read_subrequests(rreq); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 071d05d81d388..c86a11cfc4a36 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -278,7 +278,7 @@ struct netfs_io_request { #define NETFS_RREQ_PAUSE 11 /* Pause subrequest generation */ #define NETFS_RREQ_USE_IO_ITER 12 /* Use ->io_iter rather than ->i_pages */ #define NETFS_RREQ_ALL_QUEUED 13 /* All subreqs are now queued */ -#define NETFS_RREQ_NEED_RETRY 14 /* Need to try retrying */ +#define NETFS_RREQ_RETRYING 14 /* Set if we're in the retry path */ #define NETFS_RREQ_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark * write to cache on read */ const struct netfs_request_ops *netfs_ops; diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 6e699cadcb294..f880835f7695e 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -99,7 +99,7 @@ EM(netfs_sreq_trace_limited, "LIMIT") \ EM(netfs_sreq_trace_need_clear, "N-CLR") \ EM(netfs_sreq_trace_partial_read, "PARTR") \ - EM(netfs_sreq_trace_need_retry, "NRTRY") \ + EM(netfs_sreq_trace_need_retry, "ND-RT") \ EM(netfs_sreq_trace_prepare, "PREP ") \ EM(netfs_sreq_trace_prep_failed, "PRPFL") \ EM(netfs_sreq_trace_progress, "PRGRS") \ @@ -108,7 +108,9 @@ EM(netfs_sreq_trace_short, "SHORT") \ EM(netfs_sreq_trace_split, "SPLIT") \ EM(netfs_sreq_trace_submit, "SUBMT") \ + EM(netfs_sreq_trace_superfluous, "SPRFL") \ EM(netfs_sreq_trace_terminated, "TERM ") \ + EM(netfs_sreq_trace_wait_for, "_WAIT") \ EM(netfs_sreq_trace_write, "WRITE") \ EM(netfs_sreq_trace_write_skip, "SKIP ") \ E_(netfs_sreq_trace_write_term, "WTERM") -- GitLab From d01c495f432ce34df8bfd092e71720a2cf169a90 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Feb 2025 22:24:00 +0000 Subject: [PATCH 538/989] netfs: Add retry stat counters Add stat counters to count the number of request and subrequest retries and display them in /proc/fs/netfs/stats. 
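For reference, the new line in /proc/fs/netfs/stats follows the seq_printf() format added below; the numbers shown here are placeholders, not measured values:

    Retries: rq=3 rs=17 wq=1 ws=4

(rq/rs count read request/subrequest retries, wq/ws are the write-side equivalents.)
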
Signed-off-by: David Howells Link: https://lore.kernel.org/r/20250212222402.3618494-3-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/internal.h | 4 ++++ fs/netfs/read_retry.c | 3 +++ fs/netfs/stats.c | 9 +++++++++ fs/netfs/write_issue.c | 1 + fs/netfs/write_retry.c | 2 ++ 5 files changed, 19 insertions(+) diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index eb76f98c894bb..1c4f953c3d683 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -135,6 +135,8 @@ extern atomic_t netfs_n_rh_write_begin; extern atomic_t netfs_n_rh_write_done; extern atomic_t netfs_n_rh_write_failed; extern atomic_t netfs_n_rh_write_zskip; +extern atomic_t netfs_n_rh_retry_read_req; +extern atomic_t netfs_n_rh_retry_read_subreq; extern atomic_t netfs_n_wh_buffered_write; extern atomic_t netfs_n_wh_writethrough; extern atomic_t netfs_n_wh_dio_write; @@ -147,6 +149,8 @@ extern atomic_t netfs_n_wh_upload_failed; extern atomic_t netfs_n_wh_write; extern atomic_t netfs_n_wh_write_done; extern atomic_t netfs_n_wh_write_failed; +extern atomic_t netfs_n_wh_retry_write_req; +extern atomic_t netfs_n_wh_retry_write_subreq; extern atomic_t netfs_n_wb_lock_skip; extern atomic_t netfs_n_wb_lock_wait; extern atomic_t netfs_n_folioq; diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 8316c4533a51d..0f294b26e08c9 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -14,6 +14,7 @@ static void netfs_reissue_read(struct netfs_io_request *rreq, { __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + netfs_stat(&netfs_n_rh_retry_read_subreq); subreq->rreq->netfs_ops->issue_read(subreq); } @@ -260,6 +261,8 @@ void netfs_retry_reads(struct netfs_io_request *rreq) struct netfs_io_stream *stream = &rreq->io_streams[0]; DEFINE_WAIT(myself); + netfs_stat(&netfs_n_rh_retry_read_req); + set_bit(NETFS_RREQ_RETRYING, &rreq->flags); /* Wait for all outstanding I/O to quiesce before performing retries as diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c index f1af344266cc6..ab6b916addc44 100644 --- a/fs/netfs/stats.c +++ b/fs/netfs/stats.c @@ -29,6 +29,8 @@ atomic_t netfs_n_rh_write_begin; atomic_t netfs_n_rh_write_done; atomic_t netfs_n_rh_write_failed; atomic_t netfs_n_rh_write_zskip; +atomic_t netfs_n_rh_retry_read_req; +atomic_t netfs_n_rh_retry_read_subreq; atomic_t netfs_n_wh_buffered_write; atomic_t netfs_n_wh_writethrough; atomic_t netfs_n_wh_dio_write; @@ -41,6 +43,8 @@ atomic_t netfs_n_wh_upload_failed; atomic_t netfs_n_wh_write; atomic_t netfs_n_wh_write_done; atomic_t netfs_n_wh_write_failed; +atomic_t netfs_n_wh_retry_write_req; +atomic_t netfs_n_wh_retry_write_subreq; atomic_t netfs_n_wb_lock_skip; atomic_t netfs_n_wb_lock_wait; atomic_t netfs_n_folioq; @@ -81,6 +85,11 @@ int netfs_stats_show(struct seq_file *m, void *v) atomic_read(&netfs_n_wh_write), atomic_read(&netfs_n_wh_write_done), atomic_read(&netfs_n_wh_write_failed)); + seq_printf(m, "Retries: rq=%u rs=%u wq=%u ws=%u\n", + atomic_read(&netfs_n_rh_retry_read_req), + atomic_read(&netfs_n_rh_retry_read_subreq), + atomic_read(&netfs_n_wh_retry_write_req), + atomic_read(&netfs_n_wh_retry_write_subreq)); seq_printf(m, "Objs : rr=%u sr=%u foq=%u wsc=%u\n", atomic_read(&netfs_n_rh_rreq), atomic_read(&netfs_n_rh_sreq), diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 69727411683ef..77279fc5b5a7c 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -253,6 +253,7 @@ 
void netfs_reissue_write(struct netfs_io_stream *stream, subreq->retry_count++; __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + netfs_stat(&netfs_n_wh_retry_write_subreq); netfs_do_issue_write(stream, subreq); } diff --git a/fs/netfs/write_retry.c b/fs/netfs/write_retry.c index c841a851dd73b..545d33079a77d 100644 --- a/fs/netfs/write_retry.c +++ b/fs/netfs/write_retry.c @@ -203,6 +203,8 @@ void netfs_retry_writes(struct netfs_io_request *wreq) struct netfs_io_stream *stream; int s; + netfs_stat(&netfs_n_wh_retry_write_req); + /* Wait for all outstanding I/O to quiesce before performing retries as * we may need to renegotiate the I/O sizes. */ -- GitLab From 5de0219a9bb9dacc4ce6e8f2745540dcce786983 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Feb 2025 22:24:01 +0000 Subject: [PATCH 539/989] netfs: Fix setting NETFS_RREQ_ALL_QUEUED to be after all subreqs queued Due to the code that queues a subreq on the active subrequest list getting moved to netfs_issue_read(), the NETFS_RREQ_ALL_QUEUED flag may now get set before the list-add actually happens. This is not a problem if the collection worker happens after the list-add, but it's a race - and, for 9P, where the read from the server is synchronous and done in the submitting thread, this is a lot more likely. The result is that, if the timing is wrong, a ref gets leaked because the collector thinks that all the subreqs have completed (because it can't see the last one yet) and clears NETFS_RREQ_IN_PROGRESS - at which point, the collection worker no longer goes into the collector. This can be provoked with AFS by injecting an msleep() right before the final subreq is queued. Fix this by splitting the queuing part out of netfs_issue_read() into a new function, netfs_queue_read(), and calling it separately. The setting of NETFS_RREQ_ALL_QUEUED is then done by netfs_queue_read() whilst it is holding the spinlock (that's probably unnecessary, but shouldn't hurt). It might be better to set a flag on the final subreq, but this could be a problem if an error occurs and we can't queue it. 
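Schematically, the race is an ordering problem between the submitting thread and the collector; a simplified interleaving follows (flag and lock names are the real ones, the control flow is abridged):

    submitter                                collector
    ---------                                ---------
    set_bit(NETFS_RREQ_ALL_QUEUED)
                                             sees ALL_QUEUED and a list with
                                             no incomplete subrequests
                                             clears NETFS_RREQ_IN_PROGRESS
    spin_lock(&rreq->lock)
    list_add_tail(final subreq)              never runs again -> ref leaked
    spin_unlock(&rreq->lock)

With netfs_queue_read() below, ALL_QUEUED is only set once the final subrequest is already on the list, while rreq->lock is still held.
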
Fixes: e2d46f2ec332 ("netfs: Change the read result collector to only use one work item") Reported-by: Ihor Solodrai Closes: https://lore.kernel.org/r/a7x33d4dnMdGTtRivptq6S1i8btK70SNBP2XyX_xwDAhLvgQoPox6FVBOkifq4eBinfFfbZlIkMZBe3QarlWTxoEtHZwJCZbNKtaqrR7PvI=@pm.me/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/20250212222402.3618494-4-dhowells@redhat.com Tested-by: Ihor Solodrai cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Dominique Martinet cc: Christian Schoenebeck cc: Marc Dionne cc: Steve French cc: Paulo Alcantara cc: Jeff Layton cc: v9fs@lists.linux.dev cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/buffered_read.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index f761d44b34362..0d1b6d35ff3b8 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -155,8 +155,9 @@ static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq, netfs_cache_read_terminated, subreq); } -static void netfs_issue_read(struct netfs_io_request *rreq, - struct netfs_io_subrequest *subreq) +static void netfs_queue_read(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq, + bool last_subreq) { struct netfs_io_stream *stream = &rreq->io_streams[0]; @@ -177,8 +178,17 @@ static void netfs_issue_read(struct netfs_io_request *rreq, } } + if (last_subreq) { + smp_wmb(); /* Write lists before ALL_QUEUED. */ + set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); + } + spin_unlock(&rreq->lock); +} +static void netfs_issue_read(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ switch (subreq->source) { case NETFS_DOWNLOAD_FROM_SERVER: rreq->netfs_ops->issue_read(subreq); @@ -293,11 +303,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) } size -= slice; start += slice; - if (size <= 0) { - smp_wmb(); /* Write lists before ALL_QUEUED. */ - set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); - } + netfs_queue_read(rreq, subreq, size <= 0); netfs_issue_read(rreq, subreq); cond_resched(); } while (size > 0); -- GitLab From 1f47ed294a2bd577d5ae43e6e28e1c9a3be4a833 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 13 Feb 2025 08:18:46 -0700 Subject: [PATCH 540/989] block: cleanup and fix batch completion adding conditions The conditions for whether or not a request is allowed adding to a completion batch are a bit hard to read, and they also have a few issues. One is that ioerror may indeed be a random value on passthrough, and it's being checked unconditionally of whether or not the given request is a passthrough request or not. Rewrite the conditions to be separate for easier reading, and only check ioerror for non-passthrough requests. This fixes an issue with bio unmapping on passthrough, where it fails getting added to a batch. This both leads to suboptimal performance, and may trigger a potential schedule-under-atomic condition for polled passthrough IO. 
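For context, the batch interface this patch touches is normally driven from a driver's completion path roughly as follows. This is a hedged sketch of the common pattern, not lifted from any particular driver: complete_fn stands for the driver's batch-completion callback, complete_one() for its normal per-request completion, and error for the per-request status.

    DEFINE_IO_COMP_BATCH(iob);

    /* per completed request: try to queue it onto the batch ... */
    if (!blk_mq_add_to_batch(req, &iob, error, complete_fn))
            complete_one(req);              /* fall back to one-by-one */

    /* ... then finish everything that was batched in a single call */
    if (!rq_list_empty(&iob.req_list))
            blk_mq_end_request_batch(&iob);

With the old combined condition, a passthrough request carrying a stale ioerror value never made it into the batch and always took the slower per-request path above.
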
Fixes: f794f3351f26 ("block: add support for blk_mq_end_request_batch()") Link: https://lore.kernel.org/r/20575f0a-656e-4bb3-9d82-dec6c7e3a35c@kernel.dk Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 9ebb53f031cdb..fa2a76cc2f73d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -861,12 +861,22 @@ static inline bool blk_mq_add_to_batch(struct request *req, void (*complete)(struct io_comp_batch *)) { /* - * blk_mq_end_request_batch() can't end request allocated from - * sched tags + * Check various conditions that exclude batch processing: + * 1) No batch container + * 2) Has scheduler data attached + * 3) Not a passthrough request and end_io set + * 4) Not a passthrough request and an ioerror */ - if (!iob || (req->rq_flags & RQF_SCHED_TAGS) || ioerror || - (req->end_io && !blk_rq_is_passthrough(req))) + if (!iob) return false; + if (req->rq_flags & RQF_SCHED_TAGS) + return false; + if (!blk_rq_is_passthrough(req)) { + if (req->end_io) + return false; + if (ioerror < 0) + return false; + } if (!iob->complete) iob->complete = complete; -- GitLab From 6fe9116dd6bebee570406ec3f00a50388a62ccb3 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 4 Feb 2025 13:52:36 +0200 Subject: [PATCH 541/989] MAINTAINERS: Use my kernel.org address for I2C ACPI work Switch to use my kernel.org address for I2C ACPI work. Signed-off-by: Mika Westerberg Signed-off-by: Wolfram Sang --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 25c86f47353de..1d200adbdcea4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10822,7 +10822,7 @@ S: Odd Fixes F: drivers/tty/hvc/ I2C ACPI SUPPORT -M: Mika Westerberg +M: Mika Westerberg L: linux-i2c@vger.kernel.org L: linux-acpi@vger.kernel.org S: Maintained -- GitLab From 35fa2d88ca9481e5caf533d58b99ca259c63b2fe Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 10 Feb 2025 13:30:25 +0100 Subject: [PATCH 542/989] driver core: add a faux bus for use when a simple device/bus is needed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Many drivers abuse the platform driver/bus system as it provides a simple way to create and bind a device to a driver-specific set of probe/release functions. Instead of doing that, and wasting all of the memory associated with a platform device, here is a "faux" bus that can be used instead. Reviewed-by: Jonathan Cameron Reviewed-by: Danilo Krummrich Reviewed-by: Lyude Paul Reviewed-by: Thomas Weißschuh Reviewed-by: Zijun Hu Link: https://lore.kernel.org/r/2025021026-atlantic-gibberish-3f0c@gregkh Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-api/infrastructure.rst | 6 + drivers/base/Makefile | 2 +- drivers/base/base.h | 1 + drivers/base/faux.c | 232 ++++++++++++++++++++ drivers/base/init.c | 1 + include/linux/device/faux.h | 69 ++++++ 6 files changed, 310 insertions(+), 1 deletion(-) create mode 100644 drivers/base/faux.c create mode 100644 include/linux/device/faux.h diff --git a/Documentation/driver-api/infrastructure.rst b/Documentation/driver-api/infrastructure.rst index 3d52dfdfa9fdf..35e36fee4238a 100644 --- a/Documentation/driver-api/infrastructure.rst +++ b/Documentation/driver-api/infrastructure.rst @@ -41,6 +41,12 @@ Device Drivers Base .. kernel-doc:: drivers/base/class.c :export: +.. kernel-doc:: include/linux/device/faux.h + :internal: + +.. 
kernel-doc:: drivers/base/faux.c + :export: + .. kernel-doc:: drivers/base/node.c :internal: diff --git a/drivers/base/Makefile b/drivers/base/Makefile index 7fb21768ca36d..8074a10183dcb 100644 --- a/drivers/base/Makefile +++ b/drivers/base/Makefile @@ -6,7 +6,7 @@ obj-y := component.o core.o bus.o dd.o syscore.o \ cpu.o firmware.o init.o map.o devres.o \ attribute_container.o transport_class.o \ topology.o container.o property.o cacheinfo.o \ - swnode.o + swnode.o faux.o obj-$(CONFIG_AUXILIARY_BUS) += auxiliary.o obj-$(CONFIG_DEVTMPFS) += devtmpfs.o obj-y += power/ diff --git a/drivers/base/base.h b/drivers/base/base.h index 8cf04a557bdb0..0042e4774b0ce 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -137,6 +137,7 @@ int hypervisor_init(void); static inline int hypervisor_init(void) { return 0; } #endif int platform_bus_init(void); +int faux_bus_init(void); void cpu_dev_init(void); void container_dev_init(void); #ifdef CONFIG_AUXILIARY_BUS diff --git a/drivers/base/faux.c b/drivers/base/faux.c new file mode 100644 index 0000000000000..531e9d789ee04 --- /dev/null +++ b/drivers/base/faux.c @@ -0,0 +1,232 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2025 Greg Kroah-Hartman + * Copyright (c) 2025 The Linux Foundation + * + * A "simple" faux bus that allows devices to be created and added + * automatically to it. This is to be used whenever you need to create a + * device that is not associated with any "real" system resources, and do + * not want to have to deal with a bus/driver binding logic. It is + * intended to be very simple, with only a create and a destroy function + * available. + */ +#include +#include +#include +#include +#include +#include +#include "base.h" + +/* + * Internal wrapper structure so we can hold a pointer to the + * faux_device_ops for this device. + */ +struct faux_object { + struct faux_device faux_dev; + const struct faux_device_ops *faux_ops; +}; +#define to_faux_object(dev) container_of_const(dev, struct faux_object, faux_dev.dev) + +static struct device faux_bus_root = { + .init_name = "faux", +}; + +static int faux_match(struct device *dev, const struct device_driver *drv) +{ + /* Match always succeeds, we only have one driver */ + return 1; +} + +static int faux_probe(struct device *dev) +{ + struct faux_object *faux_obj = to_faux_object(dev); + struct faux_device *faux_dev = &faux_obj->faux_dev; + const struct faux_device_ops *faux_ops = faux_obj->faux_ops; + int ret = 0; + + if (faux_ops && faux_ops->probe) + ret = faux_ops->probe(faux_dev); + + return ret; +} + +static void faux_remove(struct device *dev) +{ + struct faux_object *faux_obj = to_faux_object(dev); + struct faux_device *faux_dev = &faux_obj->faux_dev; + const struct faux_device_ops *faux_ops = faux_obj->faux_ops; + + if (faux_ops && faux_ops->remove) + faux_ops->remove(faux_dev); +} + +static const struct bus_type faux_bus_type = { + .name = "faux", + .match = faux_match, + .probe = faux_probe, + .remove = faux_remove, +}; + +static struct device_driver faux_driver = { + .name = "faux_driver", + .bus = &faux_bus_type, + .probe_type = PROBE_FORCE_SYNCHRONOUS, +}; + +static void faux_device_release(struct device *dev) +{ + struct faux_object *faux_obj = to_faux_object(dev); + + kfree(faux_obj); +} + +/** + * faux_device_create_with_groups - Create and register with the driver + * core a faux device and populate the device with an initial + * set of sysfs attributes. + * @name: The name of the device we are adding, must be unique for + * all faux devices. 
+ * @parent: Pointer to a potential parent struct device. If set to + * NULL, the device will be created in the "root" of the faux + * device tree in sysfs. + * @faux_ops: struct faux_device_ops that the new device will call back + * into, can be NULL. + * @groups: The set of sysfs attributes that will be created for this + * device when it is registered with the driver core. + * + * Create a new faux device and register it in the driver core properly. + * If present, callbacks in @faux_ops will be called with the device that + * for the caller to do something with at the proper time given the + * device's lifecycle. + * + * Note, when this function is called, the functions specified in struct + * faux_ops can be called before the function returns, so be prepared for + * everything to be properly initialized before that point in time. + * + * Return: + * * NULL if an error happened with creating the device + * * pointer to a valid struct faux_device that is registered with sysfs + */ +struct faux_device *faux_device_create_with_groups(const char *name, + struct device *parent, + const struct faux_device_ops *faux_ops, + const struct attribute_group **groups) +{ + struct faux_object *faux_obj; + struct faux_device *faux_dev; + struct device *dev; + int ret; + + faux_obj = kzalloc(sizeof(*faux_obj), GFP_KERNEL); + if (!faux_obj) + return NULL; + + /* Save off the callbacks so we can use them in the future */ + faux_obj->faux_ops = faux_ops; + + /* Initialize the device portion and register it with the driver core */ + faux_dev = &faux_obj->faux_dev; + dev = &faux_dev->dev; + + device_initialize(dev); + dev->release = faux_device_release; + if (parent) + dev->parent = parent; + else + dev->parent = &faux_bus_root; + dev->bus = &faux_bus_type; + dev->groups = groups; + dev_set_name(dev, "%s", name); + + ret = device_add(dev); + if (ret) { + pr_err("%s: device_add for faux device '%s' failed with %d\n", + __func__, name, ret); + put_device(dev); + return NULL; + } + + return faux_dev; +} +EXPORT_SYMBOL_GPL(faux_device_create_with_groups); + +/** + * faux_device_create - create and register with the driver core a faux device + * @name: The name of the device we are adding, must be unique for all + * faux devices. + * @parent: Pointer to a potential parent struct device. If set to + * NULL, the device will be created in the "root" of the faux + * device tree in sysfs. + * @faux_ops: struct faux_device_ops that the new device will call back + * into, can be NULL. + * + * Create a new faux device and register it in the driver core properly. + * If present, callbacks in @faux_ops will be called with the device that + * for the caller to do something with at the proper time given the + * device's lifecycle. + * + * Note, when this function is called, the functions specified in struct + * faux_ops can be called before the function returns, so be prepared for + * everything to be properly initialized before that point in time. 
+ * + * Return: + * * NULL if an error happened with creating the device + * * pointer to a valid struct faux_device that is registered with sysfs + */ +struct faux_device *faux_device_create(const char *name, + struct device *parent, + const struct faux_device_ops *faux_ops) +{ + return faux_device_create_with_groups(name, parent, faux_ops, NULL); +} +EXPORT_SYMBOL_GPL(faux_device_create); + +/** + * faux_device_destroy - destroy a faux device + * @faux_dev: faux device to destroy + * + * Unregisters and cleans up a device that was created with a call to + * faux_device_create() + */ +void faux_device_destroy(struct faux_device *faux_dev) +{ + struct device *dev = &faux_dev->dev; + + if (!faux_dev) + return; + + device_del(dev); + + /* The final put_device() will clean up the memory we allocated for this device. */ + put_device(dev); +} +EXPORT_SYMBOL_GPL(faux_device_destroy); + +int __init faux_bus_init(void) +{ + int ret; + + ret = device_register(&faux_bus_root); + if (ret) { + put_device(&faux_bus_root); + return ret; + } + + ret = bus_register(&faux_bus_type); + if (ret) + goto error_bus; + + ret = driver_register(&faux_driver); + if (ret) + goto error_driver; + + return ret; + +error_driver: + bus_unregister(&faux_bus_type); + +error_bus: + device_unregister(&faux_bus_root); + return ret; +} diff --git a/drivers/base/init.c b/drivers/base/init.c index c4954835128cf..9d2b06d65dfc6 100644 --- a/drivers/base/init.c +++ b/drivers/base/init.c @@ -32,6 +32,7 @@ void __init driver_init(void) /* These are also core pieces, but must come after the * core core pieces. */ + faux_bus_init(); of_core_init(); platform_bus_init(); auxiliary_bus_init(); diff --git a/include/linux/device/faux.h b/include/linux/device/faux.h new file mode 100644 index 0000000000000..9f43c0e46aa45 --- /dev/null +++ b/include/linux/device/faux.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2025 Greg Kroah-Hartman + * Copyright (c) 2025 The Linux Foundation + * + * A "simple" faux bus that allows devices to be created and added + * automatically to it. This is to be used whenever you need to create a + * device that is not associated with any "real" system resources, and do + * not want to have to deal with a bus/driver binding logic. It is + * intended to be very simple, with only a create and a destroy function + * available. + */ +#ifndef _FAUX_DEVICE_H_ +#define _FAUX_DEVICE_H_ + +#include +#include + +/** + * struct faux_device - a "faux" device + * @dev: internal struct device of the object + * + * A simple faux device that can be created/destroyed. To be used when a + * driver only needs to have a device to "hang" something off. This can be + * used for downloading firmware or other basic tasks. Use this instead of + * a struct platform_device if the device has no resources assigned to + * it at all. + */ +struct faux_device { + struct device dev; +}; +#define to_faux_device(x) container_of_const((x), struct faux_device, dev) + +/** + * struct faux_device_ops - a set of callbacks for a struct faux_device + * @probe: called when a faux device is probed by the driver core + * before the device is fully bound to the internal faux bus + * code. If probe succeeds, return 0, otherwise return a + * negative error number to stop the probe sequence from + * succeeding. + * @remove: called when a faux device is removed from the system + * + * Both @probe and @remove are optional, if not needed, set to NULL. 
+ */ +struct faux_device_ops { + int (*probe)(struct faux_device *faux_dev); + void (*remove)(struct faux_device *faux_dev); +}; + +struct faux_device *faux_device_create(const char *name, + struct device *parent, + const struct faux_device_ops *faux_ops); +struct faux_device *faux_device_create_with_groups(const char *name, + struct device *parent, + const struct faux_device_ops *faux_ops, + const struct attribute_group **groups); +void faux_device_destroy(struct faux_device *faux_dev); + +static inline void *faux_device_get_drvdata(const struct faux_device *faux_dev) +{ + return dev_get_drvdata(&faux_dev->dev); +} + +static inline void faux_device_set_drvdata(struct faux_device *faux_dev, void *data) +{ + dev_set_drvdata(&faux_dev->dev, data); +} + +#endif /* _FAUX_DEVICE_H_ */ -- GitLab From 78418f300d3999f1cf8a9ac71065bf2eca61f4dd Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Mon, 10 Feb 2025 13:30:26 +0100 Subject: [PATCH 543/989] rust/kernel: Add faux device bindings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This introduces a module for working with faux devices in rust, along with adding sample code to show how the API is used. Unlike other types of devices, we don't provide any hooks for device probe/removal - since these are optional for the faux API and are unnecessary in rust. Signed-off-by: Lyude Paul Cc: Maíra Canal Cc: Danilo Krummrich Cc: Miguel Ojeda Acked-by: Danilo Krummrich Link: https://lore.kernel.org/r/2025021026-exert-accent-b4c6@gregkh Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 2 + rust/bindings/bindings_helper.h | 1 + rust/kernel/faux.rs | 67 ++++++++++++++++++++++++++++++++ rust/kernel/lib.rs | 1 + samples/rust/Kconfig | 10 +++++ samples/rust/Makefile | 1 + samples/rust/rust_driver_faux.rs | 29 ++++++++++++++ 7 files changed, 111 insertions(+) create mode 100644 rust/kernel/faux.rs create mode 100644 samples/rust/rust_driver_faux.rs diff --git a/MAINTAINERS b/MAINTAINERS index 25c86f47353de..19ea159b23091 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7116,8 +7116,10 @@ F: rust/kernel/device.rs F: rust/kernel/device_id.rs F: rust/kernel/devres.rs F: rust/kernel/driver.rs +F: rust/kernel/faux.rs F: rust/kernel/platform.rs F: samples/rust/rust_driver_platform.rs +F: samples/rust/rust_driver_faux.rs DRIVERS FOR OMAP ADAPTIVE VOLTAGE SCALING (AVS) M: Nishanth Menon diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index 55354e4dec14e..f46cf3bb70695 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/rust/kernel/faux.rs b/rust/kernel/faux.rs new file mode 100644 index 0000000000000..5acc0c02d451f --- /dev/null +++ b/rust/kernel/faux.rs @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0-only + +//! Abstractions for the faux bus. +//! +//! This module provides bindings for working with faux devices in kernel modules. +//! +//! C header: [`include/linux/device/faux.h`] + +use crate::{bindings, device, error::code::*, prelude::*}; +use core::ptr::{addr_of_mut, null, null_mut, NonNull}; + +/// The registration of a faux device. +/// +/// This type represents the registration of a [`struct faux_device`]. When an instance of this type +/// is dropped, its respective faux device will be unregistered from the system. +/// +/// # Invariants +/// +/// `self.0` always holds a valid pointer to an initialized and registered [`struct faux_device`]. 
+/// +/// [`struct faux_device`]: srctree/include/linux/device/faux.h +#[repr(transparent)] +pub struct Registration(NonNull); + +impl Registration { + /// Create and register a new faux device with the given name. + pub fn new(name: &CStr) -> Result { + // SAFETY: + // - `name` is copied by this function into its own storage + // - `faux_ops` is safe to leave NULL according to the C API + let dev = unsafe { bindings::faux_device_create(name.as_char_ptr(), null_mut(), null()) }; + + // The above function will return either a valid device, or NULL on failure + // INVARIANT: The device will remain registered until faux_device_destroy() is called, which + // happens in our Drop implementation. + Ok(Self(NonNull::new(dev).ok_or(ENODEV)?)) + } + + fn as_raw(&self) -> *mut bindings::faux_device { + self.0.as_ptr() + } +} + +impl AsRef for Registration { + fn as_ref(&self) -> &device::Device { + // SAFETY: The underlying `device` in `faux_device` is guaranteed by the C API to be + // a valid initialized `device`. + unsafe { device::Device::as_ref(addr_of_mut!((*self.as_raw()).dev)) } + } +} + +impl Drop for Registration { + fn drop(&mut self) { + // SAFETY: `self.0` is a valid registered faux_device via our type invariants. + unsafe { bindings::faux_device_destroy(self.as_raw()) } + } +} + +// SAFETY: The faux device API is thread-safe as guaranteed by the device core, as long as +// faux_device_destroy() is guaranteed to only be called once - which is guaranteed by our type not +// having Copy/Clone. +unsafe impl Send for Registration {} + +// SAFETY: The faux device API is thread-safe as guaranteed by the device core, as long as +// faux_device_destroy() is guaranteed to only be called once - which is guaranteed by our type not +// having Copy/Clone. +unsafe impl Sync for Registration {} diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index 496ed32b0911a..398242f92a961 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -46,6 +46,7 @@ pub mod devres; pub mod driver; pub mod error; +pub mod faux; #[cfg(CONFIG_RUST_FW_LOADER_ABSTRACTIONS)] pub mod firmware; pub mod fs; diff --git a/samples/rust/Kconfig b/samples/rust/Kconfig index 918dbead2c0b4..3b6eae84b2977 100644 --- a/samples/rust/Kconfig +++ b/samples/rust/Kconfig @@ -61,6 +61,16 @@ config SAMPLE_RUST_DRIVER_PLATFORM If unsure, say N. +config SAMPLE_RUST_DRIVER_FAUX + tristate "Faux Driver" + help + This option builds the Rust Faux driver sample. + + To compile this as a module, choose M here: + the module will be called rust_driver_faux. + + If unsure, say N. + config SAMPLE_RUST_HOSTPROGS bool "Host programs" help diff --git a/samples/rust/Makefile b/samples/rust/Makefile index 5a8ab0df0567c..0dbc6d90f1ef9 100644 --- a/samples/rust/Makefile +++ b/samples/rust/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_SAMPLE_RUST_MISC_DEVICE) += rust_misc_device.o obj-$(CONFIG_SAMPLE_RUST_PRINT) += rust_print.o obj-$(CONFIG_SAMPLE_RUST_DRIVER_PCI) += rust_driver_pci.o obj-$(CONFIG_SAMPLE_RUST_DRIVER_PLATFORM) += rust_driver_platform.o +obj-$(CONFIG_SAMPLE_RUST_DRIVER_FAUX) += rust_driver_faux.o rust_print-y := rust_print_main.o rust_print_events.o diff --git a/samples/rust/rust_driver_faux.rs b/samples/rust/rust_driver_faux.rs new file mode 100644 index 0000000000000..048c6cb98b29a --- /dev/null +++ b/samples/rust/rust_driver_faux.rs @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0-only + +//! Rust faux device sample. + +use kernel::{c_str, faux, prelude::*, Module}; + +module! 
{ + type: SampleModule, + name: "rust_faux_driver", + author: "Lyude Paul", + description: "Rust faux device sample", + license: "GPL", +} + +struct SampleModule { + _reg: faux::Registration, +} + +impl Module for SampleModule { + fn init(_module: &'static ThisModule) -> Result { + pr_info!("Initialising Rust Faux Device Sample\n"); + + let reg = faux::Registration::new(c_str!("rust-faux-sample-device"))?; + + dev_info!(reg.as_ref(), "Hello from faux device!\n"); + + Ok(Self { _reg: reg }) + } +} -- GitLab From b4f82f9ed43aefa79bec2504ae8c29be0c0f5d1d Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 16 Jan 2025 10:35:03 -0500 Subject: [PATCH 544/989] Bluetooth: L2CAP: Fix slab-use-after-free Read in l2cap_send_cmd After the hci sync command releases l2cap_conn, the hci receive data work queue references the released l2cap_conn when sending to the upper layer. Add hci dev lock to the hci receive data work queue to synchronize the two. [1] BUG: KASAN: slab-use-after-free in l2cap_send_cmd+0x187/0x8d0 net/bluetooth/l2cap_core.c:954 Read of size 8 at addr ffff8880271a4000 by task kworker/u9:2/5837 CPU: 0 UID: 0 PID: 5837 Comm: kworker/u9:2 Not tainted 6.13.0-rc5-syzkaller-00163-gab75170520d4 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Workqueue: hci1 hci_rx_work Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0x169/0x550 mm/kasan/report.c:489 kasan_report+0x143/0x180 mm/kasan/report.c:602 l2cap_build_cmd net/bluetooth/l2cap_core.c:2964 [inline] l2cap_send_cmd+0x187/0x8d0 net/bluetooth/l2cap_core.c:954 l2cap_sig_send_rej net/bluetooth/l2cap_core.c:5502 [inline] l2cap_sig_channel net/bluetooth/l2cap_core.c:5538 [inline] l2cap_recv_frame+0x221f/0x10db0 net/bluetooth/l2cap_core.c:6817 hci_acldata_packet net/bluetooth/hci_core.c:3797 [inline] hci_rx_work+0x508/0xdb0 net/bluetooth/hci_core.c:4040 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0xa66/0x1840 kernel/workqueue.c:3310 worker_thread+0x870/0xd30 kernel/workqueue.c:3391 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Allocated by task 5837: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x98/0xb0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __kmalloc_cache_noprof+0x243/0x390 mm/slub.c:4329 kmalloc_noprof include/linux/slab.h:901 [inline] kzalloc_noprof include/linux/slab.h:1037 [inline] l2cap_conn_add+0xa9/0x8e0 net/bluetooth/l2cap_core.c:6860 l2cap_connect_cfm+0x115/0x1090 net/bluetooth/l2cap_core.c:7239 hci_connect_cfm include/net/bluetooth/hci_core.h:2057 [inline] hci_remote_features_evt+0x68e/0xac0 net/bluetooth/hci_event.c:3726 hci_event_func net/bluetooth/hci_event.c:7473 [inline] hci_event_packet+0xac2/0x1540 net/bluetooth/hci_event.c:7525 hci_rx_work+0x3f3/0xdb0 net/bluetooth/hci_core.c:4035 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0xa66/0x1840 kernel/workqueue.c:3310 worker_thread+0x870/0xd30 kernel/workqueue.c:3391 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Freed by task 54: kasan_save_stack mm/kasan/common.c:47 [inline] 
kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x40/0x50 mm/kasan/generic.c:582 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x59/0x70 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2353 [inline] slab_free mm/slub.c:4613 [inline] kfree+0x196/0x430 mm/slub.c:4761 l2cap_connect_cfm+0xcc/0x1090 net/bluetooth/l2cap_core.c:7235 hci_connect_cfm include/net/bluetooth/hci_core.h:2057 [inline] hci_conn_failed+0x287/0x400 net/bluetooth/hci_conn.c:1266 hci_abort_conn_sync+0x56c/0x11f0 net/bluetooth/hci_sync.c:5603 hci_cmd_sync_work+0x22b/0x400 net/bluetooth/hci_sync.c:332 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0xa66/0x1840 kernel/workqueue.c:3310 worker_thread+0x870/0xd30 kernel/workqueue.c:3391 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Reported-by: syzbot+31c2f641b850a348a734@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=31c2f641b850a348a734 Tested-by: syzbot+31c2f641b850a348a734@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 39 +++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 27b4c4a2ba1fd..adb8c33ac5953 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -948,6 +948,16 @@ static u8 l2cap_get_ident(struct l2cap_conn *conn) return id; } +static void l2cap_send_acl(struct l2cap_conn *conn, struct sk_buff *skb, + u8 flags) +{ + /* Check if the hcon still valid before attempting to send */ + if (hci_conn_valid(conn->hcon->hdev, conn->hcon)) + hci_send_acl(conn->hchan, skb, flags); + else + kfree_skb(skb); +} + static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len, void *data) { @@ -970,7 +980,7 @@ static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len, bt_cb(skb)->force_active = BT_POWER_FORCE_ACTIVE_ON; skb->priority = HCI_PRIO_MAX; - hci_send_acl(conn->hchan, skb, flags); + l2cap_send_acl(conn, skb, flags); } static void l2cap_do_send(struct l2cap_chan *chan, struct sk_buff *skb) @@ -1792,13 +1802,10 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) mutex_unlock(&conn->chan_lock); - hci_chan_del(conn->hchan); - if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) cancel_delayed_work_sync(&conn->info_timer); hcon->l2cap_data = NULL; - conn->hchan = NULL; l2cap_conn_put(conn); } @@ -1806,6 +1813,7 @@ static void l2cap_conn_free(struct kref *ref) { struct l2cap_conn *conn = container_of(ref, struct l2cap_conn, ref); + hci_chan_del(conn->hchan); hci_conn_put(conn->hcon); kfree(conn); } @@ -7466,14 +7474,33 @@ static void l2cap_recv_reset(struct l2cap_conn *conn) conn->rx_len = 0; } +static struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *c) +{ + BT_DBG("conn %p orig refcnt %u", c, kref_read(&c->ref)); + + if (!kref_get_unless_zero(&c->ref)) + return NULL; + + return c; +} + void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) { - struct l2cap_conn *conn = hcon->l2cap_data; + struct l2cap_conn *conn; int len; + /* Lock hdev to access l2cap_data to avoid race with l2cap_conn_del */ + hci_dev_lock(hcon->hdev); + + conn = hcon->l2cap_data; + if (!conn) conn = l2cap_conn_add(hcon); + conn = l2cap_conn_hold_unless_zero(conn); 
+ + hci_dev_unlock(hcon->hdev); + if (!conn) goto drop; @@ -7565,6 +7592,8 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) break; } + l2cap_conn_put(conn); + drop: kfree_skb(skb); } -- GitLab From 872274b992839ff64fe560767fe7ee5f942ccdb1 Mon Sep 17 00:00:00 2001 From: Kiran K Date: Fri, 31 Jan 2025 18:30:19 +0530 Subject: [PATCH 545/989] Bluetooth: btintel_pcie: Fix a potential race condition On HCI_OP_RESET command, firmware raises alive interrupt. Driver needs to wait for this before sending other command. This patch fixes the potential miss of alive interrupt due to which HCI_OP_RESET can timeout. Expected flow: If tx command is HCI_OP_RESET, 1. set data->gp0_received = false 2. send HCI_OP_RESET 3. wait for alive interrupt Actual flow having potential race: If tx command is HCI_OP_RESET, 1. send HCI_OP_RESET 1a. Firmware raises alive interrupt here and in ISR data->gp0_received is set to true 2. set data->gp0_received = false 3. wait for alive interrupt Signed-off-by: Kiran K Fixes: 05c200c8f029 ("Bluetooth: btintel_pcie: Add handshake between driver and firmware") Reported-by: Bjorn Helgaas Closes: https://patchwork.kernel.org/project/bluetooth/patch/20241001104451.626964-1-kiran.k@intel.com/ Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btintel_pcie.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/bluetooth/btintel_pcie.c b/drivers/bluetooth/btintel_pcie.c index 2b79952f3628d..091ffe3e14954 100644 --- a/drivers/bluetooth/btintel_pcie.c +++ b/drivers/bluetooth/btintel_pcie.c @@ -1320,6 +1320,10 @@ static int btintel_pcie_send_frame(struct hci_dev *hdev, if (opcode == 0xfc01) btintel_pcie_inject_cmd_complete(hdev, opcode); } + /* Firmware raises alive interrupt on HCI_OP_RESET */ + if (opcode == HCI_OP_RESET) + data->gp0_received = false; + hdev->stat.cmd_tx++; break; case HCI_ACLDATA_PKT: @@ -1357,7 +1361,6 @@ static int btintel_pcie_send_frame(struct hci_dev *hdev, opcode, btintel_pcie_alivectxt_state2str(old_ctxt), btintel_pcie_alivectxt_state2str(data->alive_intr_ctxt)); if (opcode == HCI_OP_RESET) { - data->gp0_received = false; ret = wait_event_timeout(data->gp0_wait_q, data->gp0_received, msecs_to_jiffies(BTINTEL_DEFAULT_INTR_TIMEOUT_MS)); -- GitLab From ab4eedb790cae44313759b50fe47da285e2519d5 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 6 Feb 2025 15:54:45 -0500 Subject: [PATCH 546/989] Bluetooth: L2CAP: Fix corrupted list in hci_chan_del This fixes the following trace by reworking the locking of l2cap_conn so instead of only locking when changing the chan_l list this promotes chan_lock to a general lock of l2cap_conn so whenever it is being held it would prevents the likes of l2cap_conn_del to run: list_del corruption, ffff888021297e00->prev is LIST_POISON2 (dead000000000122) ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:61! 
Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 1 UID: 0 PID: 5896 Comm: syz-executor213 Not tainted 6.14.0-rc1-next-20250204-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 12/27/2024 RIP: 0010:__list_del_entry_valid_or_report+0x12c/0x190 lib/list_debug.c:59 Code: 8c 4c 89 fe 48 89 da e8 32 8c 37 fc 90 0f 0b 48 89 df e8 27 9f 14 fd 48 c7 c7 a0 c0 60 8c 4c 89 fe 48 89 da e8 15 8c 37 fc 90 <0f> 0b 4c 89 e7 e8 0a 9f 14 fd 42 80 3c 2b 00 74 08 4c 89 e7 e8 cb RSP: 0018:ffffc90003f6f998 EFLAGS: 00010246 RAX: 000000000000004e RBX: dead000000000122 RCX: 01454d423f7fbf00 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: dffffc0000000000 R08: ffffffff819f077c R09: 1ffff920007eded0 R10: dffffc0000000000 R11: fffff520007eded1 R12: dead000000000122 R13: dffffc0000000000 R14: ffff8880352248d8 R15: ffff888021297e00 FS: 00007f7ace6686c0(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f7aceeeb1d0 CR3: 000000003527c000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __list_del_entry_valid include/linux/list.h:124 [inline] __list_del_entry include/linux/list.h:215 [inline] list_del_rcu include/linux/rculist.h:168 [inline] hci_chan_del+0x70/0x1b0 net/bluetooth/hci_conn.c:2858 l2cap_conn_free net/bluetooth/l2cap_core.c:1816 [inline] kref_put include/linux/kref.h:65 [inline] l2cap_conn_put+0x70/0xe0 net/bluetooth/l2cap_core.c:1830 l2cap_sock_shutdown+0xa8a/0x1020 net/bluetooth/l2cap_sock.c:1377 l2cap_sock_release+0x79/0x1d0 net/bluetooth/l2cap_sock.c:1416 __sock_release net/socket.c:642 [inline] sock_close+0xbc/0x240 net/socket.c:1393 __fput+0x3e9/0x9f0 fs/file_table.c:448 task_work_run+0x24f/0x310 kernel/task_work.c:227 ptrace_notify+0x2d2/0x380 kernel/signal.c:2522 ptrace_report_syscall include/linux/ptrace.h:415 [inline] ptrace_report_syscall_exit include/linux/ptrace.h:477 [inline] syscall_exit_work+0xc7/0x1d0 kernel/entry/common.c:173 syscall_exit_to_user_mode_prepare kernel/entry/common.c:200 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:205 [inline] syscall_exit_to_user_mode+0x24a/0x340 kernel/entry/common.c:218 do_syscall_64+0x100/0x230 arch/x86/entry/common.c:89 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f7aceeaf449 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 41 19 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f7ace668218 EFLAGS: 00000246 ORIG_RAX: 000000000000002a RAX: fffffffffffffffc RBX: 00007f7acef39328 RCX: 00007f7aceeaf449 RDX: 000000000000000e RSI: 0000000020000100 RDI: 0000000000000004 RBP: 00007f7acef39320 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000003 R13: 0000000000000004 R14: 00007f7ace668670 R15: 000000000000000b Modules linked in: ---[ end trace 0000000000000000 ]--- RIP: 0010:__list_del_entry_valid_or_report+0x12c/0x190 lib/list_debug.c:59 Code: 8c 4c 89 fe 48 89 da e8 32 8c 37 fc 90 0f 0b 48 89 df e8 27 9f 14 fd 48 c7 c7 a0 c0 60 8c 4c 89 fe 48 89 da e8 15 8c 37 fc 90 <0f> 0b 4c 89 e7 e8 0a 9f 14 fd 42 80 3c 2b 00 74 08 4c 89 e7 e8 cb RSP: 0018:ffffc90003f6f998 EFLAGS: 00010246 RAX: 000000000000004e RBX: dead000000000122 RCX: 01454d423f7fbf00 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: 
dffffc0000000000 R08: ffffffff819f077c R09: 1ffff920007eded0 R10: dffffc0000000000 R11: fffff520007eded1 R12: dead000000000122 R13: dffffc0000000000 R14: ffff8880352248d8 R15: ffff888021297e00 FS: 00007f7ace6686c0(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f7acef05b08 CR3: 000000003527c000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Reported-by: syzbot+10bd8fe6741eedd2be2e@syzkaller.appspotmail.com Tested-by: syzbot+10bd8fe6741eedd2be2e@syzkaller.appspotmail.com Fixes: b4f82f9ed43a ("Bluetooth: L2CAP: Fix slab-use-after-free Read in l2cap_send_cmd") Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Dan Carpenter --- include/net/bluetooth/l2cap.h | 3 +- net/bluetooth/l2cap_core.c | 138 ++++++++++++---------------------- net/bluetooth/l2cap_sock.c | 15 ++-- 3 files changed, 58 insertions(+), 98 deletions(-) diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index d9c767cf773de..9189354c568f4 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -668,7 +668,7 @@ struct l2cap_conn { struct l2cap_chan *smp; struct list_head chan_l; - struct mutex chan_lock; + struct mutex lock; struct kref ref; struct list_head users; }; @@ -970,6 +970,7 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err); void l2cap_send_conn_req(struct l2cap_chan *chan); struct l2cap_conn *l2cap_conn_get(struct l2cap_conn *conn); +struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *conn); void l2cap_conn_put(struct l2cap_conn *conn); int l2cap_register_user(struct l2cap_conn *conn, struct l2cap_user *user); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index adb8c33ac5953..fec11e576f310 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -119,7 +119,6 @@ static struct l2cap_chan *l2cap_get_chan_by_scid(struct l2cap_conn *conn, { struct l2cap_chan *c; - mutex_lock(&conn->chan_lock); c = __l2cap_get_chan_by_scid(conn, cid); if (c) { /* Only lock if chan reference is not 0 */ @@ -127,7 +126,6 @@ static struct l2cap_chan *l2cap_get_chan_by_scid(struct l2cap_conn *conn, if (c) l2cap_chan_lock(c); } - mutex_unlock(&conn->chan_lock); return c; } @@ -140,7 +138,6 @@ static struct l2cap_chan *l2cap_get_chan_by_dcid(struct l2cap_conn *conn, { struct l2cap_chan *c; - mutex_lock(&conn->chan_lock); c = __l2cap_get_chan_by_dcid(conn, cid); if (c) { /* Only lock if chan reference is not 0 */ @@ -148,7 +145,6 @@ static struct l2cap_chan *l2cap_get_chan_by_dcid(struct l2cap_conn *conn, if (c) l2cap_chan_lock(c); } - mutex_unlock(&conn->chan_lock); return c; } @@ -418,7 +414,7 @@ static void l2cap_chan_timeout(struct work_struct *work) if (!conn) return; - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); /* __set_chan_timer() calls l2cap_chan_hold(chan) while scheduling * this work. No need to call l2cap_chan_hold(chan) here again. 
*/ @@ -439,7 +435,7 @@ static void l2cap_chan_timeout(struct work_struct *work) l2cap_chan_unlock(chan); l2cap_chan_put(chan); - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } struct l2cap_chan *l2cap_chan_create(void) @@ -641,9 +637,9 @@ void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan) void l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan) { - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); __l2cap_chan_add(conn, chan); - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } void l2cap_chan_del(struct l2cap_chan *chan, int err) @@ -731,9 +727,9 @@ void l2cap_chan_list(struct l2cap_conn *conn, l2cap_chan_func_t func, if (!conn) return; - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); __l2cap_chan_list(conn, func, data); - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } EXPORT_SYMBOL_GPL(l2cap_chan_list); @@ -745,7 +741,7 @@ static void l2cap_conn_update_id_addr(struct work_struct *work) struct hci_conn *hcon = conn->hcon; struct l2cap_chan *chan; - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); list_for_each_entry(chan, &conn->chan_l, list) { l2cap_chan_lock(chan); @@ -754,7 +750,7 @@ static void l2cap_conn_update_id_addr(struct work_struct *work) l2cap_chan_unlock(chan); } - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } static void l2cap_chan_le_connect_reject(struct l2cap_chan *chan) @@ -1507,8 +1503,6 @@ static void l2cap_conn_start(struct l2cap_conn *conn) BT_DBG("conn %p", conn); - mutex_lock(&conn->chan_lock); - list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) { l2cap_chan_lock(chan); @@ -1577,8 +1571,6 @@ static void l2cap_conn_start(struct l2cap_conn *conn) l2cap_chan_unlock(chan); } - - mutex_unlock(&conn->chan_lock); } static void l2cap_le_conn_ready(struct l2cap_conn *conn) @@ -1624,7 +1616,7 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) if (hcon->type == ACL_LINK) l2cap_request_info(conn); - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); list_for_each_entry(chan, &conn->chan_l, list) { @@ -1642,7 +1634,7 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) l2cap_chan_unlock(chan); } - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); if (hcon->type == LE_LINK) l2cap_le_conn_ready(conn); @@ -1657,14 +1649,10 @@ static void l2cap_conn_unreliable(struct l2cap_conn *conn, int err) BT_DBG("conn %p", conn); - mutex_lock(&conn->chan_lock); - list_for_each_entry(chan, &conn->chan_l, list) { if (test_bit(FLAG_FORCE_RELIABLE, &chan->flags)) l2cap_chan_set_err(chan, err); } - - mutex_unlock(&conn->chan_lock); } static void l2cap_info_timeout(struct work_struct *work) @@ -1675,7 +1663,9 @@ static void l2cap_info_timeout(struct work_struct *work) conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; conn->info_ident = 0; + mutex_lock(&conn->lock); l2cap_conn_start(conn); + mutex_unlock(&conn->lock); } /* @@ -1767,6 +1757,8 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) BT_DBG("hcon %p conn %p, err %d", hcon, conn, err); + mutex_lock(&conn->lock); + kfree_skb(conn->rx_skb); skb_queue_purge(&conn->pending_rx); @@ -1785,8 +1777,6 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) /* Force the connection to be immediately dropped */ hcon->disc_timeout = 0; - mutex_lock(&conn->chan_lock); - /* Kill channels */ list_for_each_entry_safe(chan, l, &conn->chan_l, list) { l2cap_chan_hold(chan); @@ -1800,12 +1790,14 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) l2cap_chan_put(chan); } - 
mutex_unlock(&conn->chan_lock); - if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) cancel_delayed_work_sync(&conn->info_timer); + hci_chan_del(conn->hchan); + conn->hchan = NULL; + hcon->l2cap_data = NULL; + mutex_unlock(&conn->lock); l2cap_conn_put(conn); } @@ -1813,7 +1805,6 @@ static void l2cap_conn_free(struct kref *ref) { struct l2cap_conn *conn = container_of(ref, struct l2cap_conn, ref); - hci_chan_del(conn->hchan); hci_conn_put(conn->hcon); kfree(conn); } @@ -2924,8 +2915,6 @@ static void l2cap_raw_recv(struct l2cap_conn *conn, struct sk_buff *skb) BT_DBG("conn %p", conn); - mutex_lock(&conn->chan_lock); - list_for_each_entry(chan, &conn->chan_l, list) { if (chan->chan_type != L2CAP_CHAN_RAW) continue; @@ -2940,8 +2929,6 @@ static void l2cap_raw_recv(struct l2cap_conn *conn, struct sk_buff *skb) if (chan->ops->recv(chan, nskb)) kfree_skb(nskb); } - - mutex_unlock(&conn->chan_lock); } /* ---- L2CAP signalling commands ---- */ @@ -3960,7 +3947,6 @@ static void l2cap_connect(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, goto response; } - mutex_lock(&conn->chan_lock); l2cap_chan_lock(pchan); /* Check if the ACL is secure enough (if not SDP) */ @@ -4067,7 +4053,6 @@ static void l2cap_connect(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, } l2cap_chan_unlock(pchan); - mutex_unlock(&conn->chan_lock); l2cap_chan_put(pchan); } @@ -4106,27 +4091,19 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn, BT_DBG("dcid 0x%4.4x scid 0x%4.4x result 0x%2.2x status 0x%2.2x", dcid, scid, result, status); - mutex_lock(&conn->chan_lock); - if (scid) { chan = __l2cap_get_chan_by_scid(conn, scid); - if (!chan) { - err = -EBADSLT; - goto unlock; - } + if (!chan) + return -EBADSLT; } else { chan = __l2cap_get_chan_by_ident(conn, cmd->ident); - if (!chan) { - err = -EBADSLT; - goto unlock; - } + if (!chan) + return -EBADSLT; } chan = l2cap_chan_hold_unless_zero(chan); - if (!chan) { - err = -EBADSLT; - goto unlock; - } + if (!chan) + return -EBADSLT; err = 0; @@ -4164,9 +4141,6 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn, l2cap_chan_unlock(chan); l2cap_chan_put(chan); -unlock: - mutex_unlock(&conn->chan_lock); - return err; } @@ -4454,11 +4428,7 @@ static inline int l2cap_disconnect_req(struct l2cap_conn *conn, chan->ops->set_shutdown(chan); - l2cap_chan_unlock(chan); - mutex_lock(&conn->chan_lock); - l2cap_chan_lock(chan); l2cap_chan_del(chan, ECONNRESET); - mutex_unlock(&conn->chan_lock); chan->ops->close(chan); @@ -4495,11 +4465,7 @@ static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn, return 0; } - l2cap_chan_unlock(chan); - mutex_lock(&conn->chan_lock); - l2cap_chan_lock(chan); l2cap_chan_del(chan, 0); - mutex_unlock(&conn->chan_lock); chan->ops->close(chan); @@ -4697,13 +4663,9 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn, BT_DBG("dcid 0x%4.4x mtu %u mps %u credits %u result 0x%2.2x", dcid, mtu, mps, credits, result); - mutex_lock(&conn->chan_lock); - chan = __l2cap_get_chan_by_ident(conn, cmd->ident); - if (!chan) { - err = -EBADSLT; - goto unlock; - } + if (!chan) + return -EBADSLT; err = 0; @@ -4751,9 +4713,6 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn, l2cap_chan_unlock(chan); -unlock: - mutex_unlock(&conn->chan_lock); - return err; } @@ -4865,7 +4824,6 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, goto response; } - mutex_lock(&conn->chan_lock); l2cap_chan_lock(pchan); if (!smp_sufficient_security(conn->hcon, pchan->sec_level, @@ -4931,7 +4889,6 @@ static int l2cap_le_connect_req(struct l2cap_conn 
*conn, response_unlock: l2cap_chan_unlock(pchan); - mutex_unlock(&conn->chan_lock); l2cap_chan_put(pchan); if (result == L2CAP_CR_PEND) @@ -5065,7 +5022,6 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, goto response; } - mutex_lock(&conn->chan_lock); l2cap_chan_lock(pchan); if (!smp_sufficient_security(conn->hcon, pchan->sec_level, @@ -5140,7 +5096,6 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, unlock: l2cap_chan_unlock(pchan); - mutex_unlock(&conn->chan_lock); l2cap_chan_put(pchan); response: @@ -5177,8 +5132,6 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn, BT_DBG("mtu %u mps %u credits %u result 0x%4.4x", mtu, mps, credits, result); - mutex_lock(&conn->chan_lock); - cmd_len -= sizeof(*rsp); list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) { @@ -5264,8 +5217,6 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn, l2cap_chan_unlock(chan); } - mutex_unlock(&conn->chan_lock); - return err; } @@ -5378,8 +5329,6 @@ static inline int l2cap_le_command_rej(struct l2cap_conn *conn, if (cmd_len < sizeof(*rej)) return -EPROTO; - mutex_lock(&conn->chan_lock); - chan = __l2cap_get_chan_by_ident(conn, cmd->ident); if (!chan) goto done; @@ -5394,7 +5343,6 @@ static inline int l2cap_le_command_rej(struct l2cap_conn *conn, l2cap_chan_put(chan); done: - mutex_unlock(&conn->chan_lock); return 0; } @@ -6849,8 +6797,12 @@ static void process_pending_rx(struct work_struct *work) BT_DBG(""); + mutex_lock(&conn->lock); + while ((skb = skb_dequeue(&conn->pending_rx))) l2cap_recv_frame(conn, skb); + + mutex_unlock(&conn->lock); } static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) @@ -6889,7 +6841,7 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) conn->local_fixed_chan |= L2CAP_FC_SMP_BREDR; mutex_init(&conn->ident_lock); - mutex_init(&conn->chan_lock); + mutex_init(&conn->lock); INIT_LIST_HEAD(&conn->chan_l); INIT_LIST_HEAD(&conn->users); @@ -7080,7 +7032,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, } } - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); l2cap_chan_lock(chan); if (cid && __l2cap_get_chan_by_dcid(conn, cid)) { @@ -7121,7 +7073,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, chan_unlock: l2cap_chan_unlock(chan); - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); done: hci_dev_unlock(hdev); hci_dev_put(hdev); @@ -7333,7 +7285,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) BT_DBG("conn %p status 0x%2.2x encrypt %u", conn, status, encrypt); - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); list_for_each_entry(chan, &conn->chan_l, list) { l2cap_chan_lock(chan); @@ -7407,7 +7359,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) l2cap_chan_unlock(chan); } - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } /* Append fragment into frame respecting the maximum len of rx_skb */ @@ -7474,8 +7426,11 @@ static void l2cap_recv_reset(struct l2cap_conn *conn) conn->rx_len = 0; } -static struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *c) +struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *c) { + if (!c) + return NULL; + BT_DBG("conn %p orig refcnt %u", c, kref_read(&c->ref)); if (!kref_get_unless_zero(&c->ref)) @@ -7501,11 +7456,15 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) hci_dev_unlock(hcon->hdev); - if (!conn) - goto drop; + if (!conn) { + kfree_skb(skb); + return; + 
} BT_DBG("conn %p len %u flags 0x%x", conn, skb->len, flags); + mutex_lock(&conn->lock); + switch (flags) { case ACL_START: case ACL_START_NO_FLUSH: @@ -7530,7 +7489,7 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) if (len == skb->len) { /* Complete frame received */ l2cap_recv_frame(conn, skb); - return; + goto unlock; } BT_DBG("Start: total len %d, frag len %u", len, skb->len); @@ -7592,10 +7551,11 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) break; } - l2cap_conn_put(conn); - drop: kfree_skb(skb); +unlock: + mutex_unlock(&conn->lock); + l2cap_conn_put(conn); } static struct hci_cb l2cap_cb = { diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 46ea0bee2259f..acd11b268b98a 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1326,9 +1326,10 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) /* prevent sk structure from being freed whilst unlocked */ sock_hold(sk); - chan = l2cap_pi(sk)->chan; /* prevent chan structure from being freed whilst unlocked */ - l2cap_chan_hold(chan); + chan = l2cap_chan_hold_unless_zero(l2cap_pi(sk)->chan); + if (!chan) + goto shutdown_already; BT_DBG("chan %p state %s", chan, state_to_string(chan->state)); @@ -1358,22 +1359,20 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) release_sock(sk); l2cap_chan_lock(chan); - conn = chan->conn; - if (conn) - /* prevent conn structure from being freed */ - l2cap_conn_get(conn); + /* prevent conn structure from being freed */ + conn = l2cap_conn_hold_unless_zero(chan->conn); l2cap_chan_unlock(chan); if (conn) /* mutex lock must be taken before l2cap_chan_lock() */ - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); l2cap_chan_lock(chan); l2cap_chan_close(chan, 0); l2cap_chan_unlock(chan); if (conn) { - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); l2cap_conn_put(conn); } -- GitLab From 5bef3ac184b5626ea62385d6b82a1992b89d7940 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Feb 2025 13:49:28 +0000 Subject: [PATCH 547/989] team: better TEAM_OPTION_TYPE_STRING validation syzbot reported following splat [1] Make sure user-provided data contains one nul byte. 
[1] BUG: KMSAN: uninit-value in string_nocheck lib/vsprintf.c:633 [inline] BUG: KMSAN: uninit-value in string+0x3ec/0x5f0 lib/vsprintf.c:714 string_nocheck lib/vsprintf.c:633 [inline] string+0x3ec/0x5f0 lib/vsprintf.c:714 vsnprintf+0xa5d/0x1960 lib/vsprintf.c:2843 __request_module+0x252/0x9f0 kernel/module/kmod.c:149 team_mode_get drivers/net/team/team_core.c:480 [inline] team_change_mode drivers/net/team/team_core.c:607 [inline] team_mode_option_set+0x437/0x970 drivers/net/team/team_core.c:1401 team_option_set drivers/net/team/team_core.c:375 [inline] team_nl_options_set_doit+0x1339/0x1f90 drivers/net/team/team_core.c:2662 genl_family_rcv_msg_doit net/netlink/genetlink.c:1115 [inline] genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0x1214/0x12c0 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x375/0x650 net/netlink/af_netlink.c:2543 genl_rcv+0x40/0x60 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1322 [inline] netlink_unicast+0xf52/0x1260 net/netlink/af_netlink.c:1348 netlink_sendmsg+0x10da/0x11e0 net/netlink/af_netlink.c:1892 sock_sendmsg_nosec net/socket.c:718 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:733 ____sys_sendmsg+0x877/0xb60 net/socket.c:2573 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2627 __sys_sendmsg net/socket.c:2659 [inline] __do_sys_sendmsg net/socket.c:2664 [inline] __se_sys_sendmsg net/socket.c:2662 [inline] __x64_sys_sendmsg+0x212/0x3c0 net/socket.c:2662 x64_sys_call+0x2ed6/0x3c30 arch/x86/include/generated/asm/syscalls_64.h:47 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 3d249d4ca7d0 ("net: introduce ethernet teaming device") Reported-by: syzbot+1fcd957a82e3a1baa94d@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=1fcd957a82e3a1baa94d Signed-off-by: Eric Dumazet Reviewed-by: Jiri Pirko Link: https://patch.msgid.link/20250212134928.1541609-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/team/team_core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/team/team_core.c b/drivers/net/team/team_core.c index dc7cbd6a9798a..f4019815f4736 100644 --- a/drivers/net/team/team_core.c +++ b/drivers/net/team/team_core.c @@ -2639,7 +2639,9 @@ int team_nl_options_set_doit(struct sk_buff *skb, struct genl_info *info) ctx.data.u32_val = nla_get_u32(attr_data); break; case TEAM_OPTION_TYPE_STRING: - if (nla_len(attr_data) > TEAM_STRING_MAX_LEN) { + if (nla_len(attr_data) > TEAM_STRING_MAX_LEN || + !memchr(nla_data(attr_data), '\0', + nla_len(attr_data))) { err = -EINVAL; goto team_put; } -- GitLab From a527750d877fd334de87eef81f1cb5f0f0ca3373 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Feb 2025 14:10:21 +0000 Subject: [PATCH 548/989] ipv6: mcast: add RCU protection to mld_newpack() mld_newpack() can be called without RTNL or RCU being held. Note that we no longer can use sock_alloc_send_skb() because ipv6.igmp_sk uses GFP_KERNEL allocations which can sleep. Instead use alloc_skb() and charge the net->ipv6.igmp_sk socket under RCU protection. 
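For reference, a minimal sketch of the ordering the hunk below implements (not the exact driver code; error handling trimmed): the sleeping GFP_KERNEL allocation is done before entering the RCU read-side section, and the per-netns socket is only looked up and charged under rcu_read_lock().

	struct sk_buff *skb;
	struct net *net;
	struct sock *sk;

	skb = alloc_skb(size, GFP_KERNEL);      /* may sleep, so outside RCU */
	if (!skb)
		return NULL;

	rcu_read_lock();
	net = dev_net_rcu(dev);                 /* dev->nd_net is RCU protected */
	sk = net->ipv6.igmp_sk;
	skb_set_owner_w(skb, sk);               /* charge the skb to igmp_sk */
	/* ... build the MLD header while still inside the RCU section ... */
	rcu_read_unlock();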
Fixes: b8ad0cbc58f7 ("[NETNS][IPV6] mcast - handle several network namespace") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250212141021.1663666-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/mcast.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 81a739ebf7094..65831b4fee1fd 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1773,21 +1773,19 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) struct net_device *dev = idev->dev; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - struct net *net = dev_net(dev); const struct in6_addr *saddr; struct in6_addr addr_buf; struct mld2_report *pmr; struct sk_buff *skb; unsigned int size; struct sock *sk; - int err; + struct net *net; - sk = net->ipv6.igmp_sk; /* we assume size > sizeof(ra) here * Also try to not allocate high-order pages for big MTU */ size = min_t(int, mtu, PAGE_SIZE / 2) + hlen + tlen; - skb = sock_alloc_send_skb(sk, size, 1, &err); + skb = alloc_skb(size, GFP_KERNEL); if (!skb) return NULL; @@ -1795,6 +1793,12 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) skb_reserve(skb, hlen); skb_tailroom_reserve(skb, mtu, tlen); + rcu_read_lock(); + + net = dev_net_rcu(dev); + sk = net->ipv6.igmp_sk; + skb_set_owner_w(skb, sk); + if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { /* : * use unspecified address as the source address @@ -1806,6 +1810,8 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) ip6_mc_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0); + rcu_read_unlock(); + skb_put_data(skb, ra, sizeof(ra)); skb_set_transport_header(skb, skb_tail_pointer(skb) - skb->data); -- GitLab From fee5d688940690cc845937459e340e4e02598e90 Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Wed, 12 Feb 2025 23:23:11 +0800 Subject: [PATCH 549/989] mlxsw: Add return value check for mlxsw_sp_port_get_stats_raw() Add a check for the return value of mlxsw_sp_port_get_stats_raw() in __mlxsw_sp_port_get_stats(). If mlxsw_sp_port_get_stats_raw() returns an error, exit the function to prevent further processing with potentially invalid data. 
Fixes: 614d509aa1e7 ("mlxsw: Move ethtool_ops to spectrum_ethtool.c") Cc: stable@vger.kernel.org # 5.9+ Signed-off-by: Wentao Liang Reviewed-by: Petr Machata Link: https://patch.msgid.link/20250212152311.1332-1-vulab@iscas.ac.cn Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c index 2bed8c86b7cfc..3f64cdbabfa3c 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c @@ -768,7 +768,9 @@ static void __mlxsw_sp_port_get_stats(struct net_device *dev, err = mlxsw_sp_get_hw_stats_by_group(&hw_stats, &len, grp); if (err) return; - mlxsw_sp_port_get_stats_raw(dev, grp, prio, ppcnt_pl); + err = mlxsw_sp_port_get_stats_raw(dev, grp, prio, ppcnt_pl); + if (err) + return; for (i = 0; i < len; i++) { data[data_index + i] = hw_stats[i].getter(ppcnt_pl); if (!hw_stats[i].cells_bytes) -- GitLab From 0d0b752f2497471ddd2b32143d167d42e18a8f3c Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Wed, 12 Feb 2025 17:36:59 +0100 Subject: [PATCH 550/989] s390/qeth: move netif_napi_add_tx() and napi_enable() from under BH Like other drivers qeth is calling local_bh_enable() after napi_schedule() to kick-start softirqs [0]. Since netif_napi_add_tx() and napi_enable() now take the netdev_lock() mutex [1], move them out from under the BH protection. Same solution as in commit a60558644e20 ("wifi: mt76: move napi_enable() from under BH") Fixes: 1b23cdbd2bbc ("net: protect netdev->napi_list with netdev_lock()") Link: https://lore.kernel.org/netdev/20240612181900.4d9d18d0@kernel.org/ [0] Link: https://lore.kernel.org/netdev/20250115035319.559603-1-kuba@kernel.org/ [1] Signed-off-by: Alexandra Winter Acked-by: Joe Damato Link: https://patch.msgid.link/20250212163659.2287292-1-wintera@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/s390/net/qeth_core_main.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index a3adaec5504e4..20328d695ef92 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -7050,14 +7050,16 @@ int qeth_open(struct net_device *dev) card->data.state = CH_STATE_UP; netif_tx_start_all_queues(dev); - local_bh_disable(); qeth_for_each_output_queue(card, queue, i) { netif_napi_add_tx(dev, &queue->napi, qeth_tx_poll); napi_enable(&queue->napi); - napi_schedule(&queue->napi); } - napi_enable(&card->napi); + + local_bh_disable(); + qeth_for_each_output_queue(card, queue, i) { + napi_schedule(&queue->napi); + } napi_schedule(&card->napi); /* kick-start the NAPI softirq: */ local_bh_enable(); -- GitLab From 0760d62dad5d3e76c2aa175d9bc42b5f664967c2 Mon Sep 17 00:00:00 2001 From: Devaansh Kumar Date: Tue, 11 Feb 2025 22:48:48 +0530 Subject: [PATCH 551/989] sched_ext: selftests: Fix grammar in tests description Fixed grammar for a few tests of sched_ext. 
Signed-off-by: Devaansh Kumar Signed-off-by: Tejun Heo --- tools/testing/selftests/sched_ext/init_enable_count.c | 2 +- tools/testing/selftests/sched_ext/maybe_null.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/sched_ext/init_enable_count.c b/tools/testing/selftests/sched_ext/init_enable_count.c index 0f3eddc7a17a0..eddf9e0e26e7f 100644 --- a/tools/testing/selftests/sched_ext/init_enable_count.c +++ b/tools/testing/selftests/sched_ext/init_enable_count.c @@ -150,7 +150,7 @@ static enum scx_test_status run(void *ctx) struct scx_test init_enable_count = { .name = "init_enable_count", - .description = "Verify we do the correct amount of counting of init, " + .description = "Verify we correctly count the occurrences of init, " "enable, etc callbacks.", .run = run, }; diff --git a/tools/testing/selftests/sched_ext/maybe_null.c b/tools/testing/selftests/sched_ext/maybe_null.c index 31cfafb0cf65a..aacf0c58ca4fa 100644 --- a/tools/testing/selftests/sched_ext/maybe_null.c +++ b/tools/testing/selftests/sched_ext/maybe_null.c @@ -43,7 +43,7 @@ static enum scx_test_status run(void *ctx) struct scx_test maybe_null = { .name = "maybe_null", - .description = "Verify if PTR_MAYBE_NULL work for .dispatch", + .description = "Verify if PTR_MAYBE_NULL works for .dispatch", .run = run, }; REGISTER_SCX_TEST(&maybe_null) -- GitLab From a8972d5a49b408248294b5ecbdd0a085e4726349 Mon Sep 17 00:00:00 2001 From: Hugo Villeneuve Date: Fri, 27 Sep 2024 09:53:05 -0400 Subject: [PATCH 552/989] drm: panel: jd9365da-h3: fix reset signal polarity In jadard_prepare() a reset pulse is generated with the following statements (delays omitted for clarity): gpiod_set_value(jadard->reset, 1); --> Deassert reset gpiod_set_value(jadard->reset, 0); --> Assert reset for 10ms gpiod_set_value(jadard->reset, 1); --> Deassert reset However, specifying a second argument of "0" to gpiod_set_value() means to deassert the GPIO, and "1" means to assert it. If the reset signal is defined as GPIO_ACTIVE_LOW in the DTS, the above statements will incorrectly generate the reset pulse (inverted) and leave it asserted (LOW) at the end of jadard_prepare(). Fix reset behavior by inverting gpiod_set_value() second argument in jadard_prepare(). Also modify second argument to devm_gpiod_get() in jadard_dsi_probe() to assert the reset when probing. Do not modify it in jadard_unprepare() as it is already properly asserted with "1", which seems to be the intended behavior.
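As an aside, a hedged sketch of the corrected pulse with the logical/physical mapping spelled out (assuming the reset line is flagged GPIO_ACTIVE_LOW in the DTS, as described above, and was requested with GPIOD_OUT_HIGH at probe time):

	gpiod_set_value(reset, 0);      /* logical 0 = deassert, line driven high */
	msleep(5);
	gpiod_set_value(reset, 1);      /* logical 1 = assert reset, line driven low */
	msleep(10);
	gpiod_set_value(reset, 0);      /* deassert again, panel leaves reset */
	msleep(130);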
Fixes: 6b818c533dd8 ("drm: panel: Add Jadard JD9365DA-H3 DSI panel") Cc: stable@vger.kernel.org Signed-off-by: Hugo Villeneuve Reviewed-by: Neil Armstrong Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20240927135306.857617-1-hugo@hugovil.com Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20240927135306.857617-1-hugo@hugovil.com --- drivers/gpu/drm/panel/panel-jadard-jd9365da-h3.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/panel/panel-jadard-jd9365da-h3.c b/drivers/gpu/drm/panel/panel-jadard-jd9365da-h3.c index 45d09e6fa667f..7d68a8acfe2ea 100644 --- a/drivers/gpu/drm/panel/panel-jadard-jd9365da-h3.c +++ b/drivers/gpu/drm/panel/panel-jadard-jd9365da-h3.c @@ -109,13 +109,13 @@ static int jadard_prepare(struct drm_panel *panel) if (jadard->desc->lp11_to_reset_delay_ms) msleep(jadard->desc->lp11_to_reset_delay_ms); - gpiod_set_value(jadard->reset, 1); + gpiod_set_value(jadard->reset, 0); msleep(5); - gpiod_set_value(jadard->reset, 0); + gpiod_set_value(jadard->reset, 1); msleep(10); - gpiod_set_value(jadard->reset, 1); + gpiod_set_value(jadard->reset, 0); msleep(130); ret = jadard->desc->init(jadard); @@ -1130,7 +1130,7 @@ static int jadard_dsi_probe(struct mipi_dsi_device *dsi) dsi->format = desc->format; dsi->lanes = desc->lanes; - jadard->reset = devm_gpiod_get(dev, "reset", GPIOD_OUT_LOW); + jadard->reset = devm_gpiod_get(dev, "reset", GPIOD_OUT_HIGH); if (IS_ERR(jadard->reset)) { DRM_DEV_ERROR(&dsi->dev, "failed to get our reset GPIO\n"); return PTR_ERR(jadard->reset); -- GitLab From 2e2006c91c842c551521434466f9b4324719c9a7 Mon Sep 17 00:00:00 2001 From: Chuyi Zhou Date: Wed, 12 Feb 2025 15:19:36 +0800 Subject: [PATCH 553/989] sched_ext: Fix the incorrect bpf_list kfunc API in common.bpf.h. Now BPF only supports the bpf_list_push_{front,back}_impl kfuncs, not bpf_list_push_{front,back}. This patch fixes this issue. Without this patch, if we use bpf_list kfunc in scx, the BPF verifier would complain: libbpf: extern (func ksym) 'bpf_list_push_back': not found in kernel or module BTFs libbpf: failed to load object 'scx_foo' libbpf: failed to load BPF skeleton 'scx_foo': -EINVAL With this patch, the bpf list kfunc will work as expected.
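Illustrative expansion of the fixed wrappers (a sketch based on the header change below): a scx scheduler keeps calling the short helpers, and the macros forward to the _impl kfuncs that the verifier actually recognizes.

	/* BPF-side caller keeps using the short form ... */
	bpf_list_push_back(&head, &n->node);
	/* ... which now expands to the kfunc known to the kernel BTF: */
	bpf_list_push_back_impl(&head, &n->node, NULL, 0);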
Signed-off-by: Chuyi Zhou Fixes: 2a52ca7c98960 ("sched_ext: Add scx_simple and scx_example_qmap example schedulers") Signed-off-by: Tejun Heo --- tools/sched_ext/include/scx/common.bpf.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index d72b60a0c582c..7849405614b15 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -270,8 +270,16 @@ void bpf_obj_drop_impl(void *kptr, void *meta) __ksym; #define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL)) #define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL) -void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; -void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; +int bpf_list_push_front_impl(struct bpf_list_head *head, + struct bpf_list_node *node, + void *meta, __u64 off) __ksym; +#define bpf_list_push_front(head, node) bpf_list_push_front_impl(head, node, NULL, 0) + +int bpf_list_push_back_impl(struct bpf_list_head *head, + struct bpf_list_node *node, + void *meta, __u64 off) __ksym; +#define bpf_list_push_back(head, node) bpf_list_push_back_impl(head, node, NULL, 0) + struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym; struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym; struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, -- GitLab From 0892b840318daa6ae739b7cdec5ecdfca4006689 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 13 Feb 2025 08:49:44 -0800 Subject: [PATCH 554/989] Reapply "net: skb: introduce and use a single page frag cache" This reverts commit 011b0335903832facca86cd8ed05d7d8d94c9c76. Sabrina reports that the revert may trigger warnings due to intervening changes, especially the ability to rise MAX_SKB_FRAGS. Let's drop it and revisit once that part is also ironed out. Fixes: 011b03359038 ("Revert "net: skb: introduce and use a single page frag cache"") Reported-by: Sabrina Dubroca Link: https://lore.kernel.org/6bf54579233038bc0e76056c5ea459872ce362ab.1739375933.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + net/core/dev.c | 17 ------- net/core/skbuff.c | 103 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 99 insertions(+), 22 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 365f0e2098d13..c0a86afb85daa 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4115,6 +4115,7 @@ void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); +void napi_get_frags_check(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); static inline void napi_free_frags(struct napi_struct *napi) diff --git a/net/core/dev.c b/net/core/dev.c index 55e356a68db66..b91658e8aedb4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6920,23 +6920,6 @@ netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) list_add_rcu(&napi->dev_list, higher); /* adds after higher */ } -/* Double check that napi_get_frags() allocates skbs with - * skb->head being backed by slab, not a page fragment. 
- * This is to make sure bug fixed in 3226b158e67c - * ("net: avoid 32 x truesize under-estimation for tiny skbs") - * does not accidentally come back. - */ -static void napi_get_frags_check(struct napi_struct *napi) -{ - struct sk_buff *skb; - - local_bh_disable(); - skb = napi_get_frags(napi); - WARN_ON_ONCE(skb && skb->head_frag); - napi_free_frags(napi); - local_bh_enable(); -} - void netif_napi_add_weight_locked(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6a99c453397fc..a441613a1e6c1 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -220,9 +220,67 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) #define NAPI_SKB_CACHE_BULK 16 #define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) +#if PAGE_SIZE == SZ_4K + +#define NAPI_HAS_SMALL_PAGE_FRAG 1 +#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc) + +/* specialized page frag allocator using a single order 0 page + * and slicing it into 1K sized fragment. Constrained to systems + * with a very limited amount of 1K fragments fitting a single + * page - to avoid excessive truesize underestimation + */ + +struct page_frag_1k { + void *va; + u16 offset; + bool pfmemalloc; +}; + +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp) +{ + struct page *page; + int offset; + + offset = nc->offset - SZ_1K; + if (likely(offset >= 0)) + goto use_frag; + + page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); + if (!page) + return NULL; + + nc->va = page_address(page); + nc->pfmemalloc = page_is_pfmemalloc(page); + offset = PAGE_SIZE - SZ_1K; + page_ref_add(page, offset / SZ_1K); + +use_frag: + nc->offset = offset; + return nc->va + offset; +} +#else + +/* the small page is actually unused in this build; add dummy helpers + * to please the compiler and avoid later preprocessor's conditionals + */ +#define NAPI_HAS_SMALL_PAGE_FRAG 0 +#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false + +struct page_frag_1k { +}; + +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) +{ + return NULL; +} + +#endif + struct napi_alloc_cache { local_lock_t bh_lock; struct page_frag_cache page; + struct page_frag_1k page_small; unsigned int skb_count; void *skb_cache[NAPI_SKB_CACHE_SIZE]; }; @@ -232,6 +290,23 @@ static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; +/* Double check that napi_get_frags() allocates skbs with + * skb->head being backed by slab, not a page fragment. + * This is to make sure bug fixed in 3226b158e67c + * ("net: avoid 32 x truesize under-estimation for tiny skbs") + * does not accidentally come back. + */ +void napi_get_frags_check(struct napi_struct *napi) +{ + struct sk_buff *skb; + + local_bh_disable(); + skb = napi_get_frags(napi); + WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); + napi_free_frags(napi); + local_bh_enable(); +} + void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); @@ -738,8 +813,10 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. 
+ * When the small frag allocator is available, prefer it over kmalloc + * for small fragments */ - if (len <= SKB_WITH_OVERHEAD(1024) || + if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, @@ -749,16 +826,32 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) goto skb_success; } - len = SKB_HEAD_ALIGN(len); - if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc = this_cpu_ptr(&napi_alloc_cache); + if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { + /* we are artificially inflating the allocation size, but + * that is not as bad as it may look like, as: + * - 'len' less than GRO_MAX_HEAD makes little sense + * - On most systems, larger 'len' values lead to fragment + * size above 512 bytes + * - kmalloc would use the kmalloc-1k slab for such values + * - Builds with smaller GRO_MAX_HEAD will very likely do + * little networking, as that implies no WiFi and no + * tunnels support, and 32 bits arches. + */ + len = SZ_1K; - data = page_frag_alloc(&nc->page, len, gfp_mask); - pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); + data = page_frag_alloc_1k(&nc->page_small, gfp_mask); + pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small); + } else { + len = SKB_HEAD_ALIGN(len); + + data = page_frag_alloc(&nc->page, len, gfp_mask); + pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); + } local_unlock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!data)) -- GitLab From f5717c93a1b999970f3a64d771a1a9ee68cc37d0 Mon Sep 17 00:00:00 2001 From: Chuyi Zhou Date: Wed, 12 Feb 2025 21:09:35 +0800 Subject: [PATCH 555/989] sched_ext: Use SCX_CALL_OP_TASK in task_tick_scx Now when we use scx_bpf_task_cgroup() in ops.tick() to get the cgroup of the current task, the following error will occur: scx_foo[3795244] triggered exit kind 1024: runtime error (called on a task not being operated on) The reason is that we are using SCX_CALL_OP() instead of SCX_CALL_OP_TASK() when calling ops.tick(), which triggers the error during the subsequent scx_kf_allowed_on_arg_tasks() check. SCX_CALL_OP_TASK() was first introduced in commit 36454023f50b ("sched_ext: Track tasks that are subjects of the in-flight SCX operation") to ensure task's rq lock is held when accessing task's sched_group. Since ops.tick() is marked as SCX_KF_TERMINAL and task_tick_scx() is protected by the rq lock, we can use SCX_CALL_OP_TASK() to avoid the above issue. Similarly, the same changes should be made for ops.disable() and ops.exit_task(), as they are also protected by task_rq_lock() and it's safe to access the task's task_group. 
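For illustration, a minimal BPF-side ops.tick() of the kind that hit the error (scheduler name and body are hypothetical): with the callback now dispatched via SCX_CALL_OP_TASK(), p is tracked as the task being operated on and scx_bpf_task_cgroup(p) passes the scx_kf_allowed_on_arg_tasks() check.

	void BPF_STRUCT_OPS(foo_tick, struct task_struct *p)
	{
		struct cgroup *cgrp = scx_bpf_task_cgroup(p);

		/* ... per-cgroup accounting would go here ... */

		bpf_cgroup_release(cgrp);
	}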
Fixes: 36454023f50b ("sched_ext: Track tasks that are subjects of the in-flight SCX operation") Signed-off-by: Chuyi Zhou Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 54edd0e2132a6..5a81d9a1e31f2 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3899,7 +3899,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) curr->scx.slice = 0; touch_core_sched(rq, curr); } else if (SCX_HAS_OP(tick)) { - SCX_CALL_OP(SCX_KF_REST, tick, curr); + SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr); } if (!curr->scx.slice) @@ -4046,7 +4046,7 @@ static void scx_ops_disable_task(struct task_struct *p) WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); if (SCX_HAS_OP(disable)) - SCX_CALL_OP(SCX_KF_REST, disable, p); + SCX_CALL_OP_TASK(SCX_KF_REST, disable, p); scx_set_task_state(p, SCX_TASK_READY); } @@ -4075,7 +4075,7 @@ static void scx_ops_exit_task(struct task_struct *p) } if (SCX_HAS_OP(exit_task)) - SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args); + SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args); scx_set_task_state(p, SCX_TASK_NONE); } -- GitLab From d6211ebbdaa541af197b50b8dd8f22642ce0b87f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 13 Feb 2025 08:24:23 -0700 Subject: [PATCH 556/989] io_uring/uring_cmd: unconditionally copy SQEs at prep time This isn't generally necessary, but conditions have been observed where SQE data is accessed from the original SQE after prep has been done and outside of the initial issue. Opcode prep handlers must ensure that any SQE related data is stable beyond the prep phase, but uring_cmd is a bit special in how it handles the SQE which makes it susceptible to reading stale data. If the application has reused the SQE before the original completes, then that can lead to data corruption. Down the line we can relax this again once uring_cmd has been sanitized a bit, and avoid unnecessarily copying the SQE. Fixes: 5eff57fa9f3a ("io_uring/uring_cmd: defer SQE copying until it's needed") Reported-by: Caleb Sander Mateos Reviewed-by: Caleb Sander Mateos Reviewed-by: Li Zetao Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 8af7780407b7e..e6701b7aa1474 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -165,15 +165,6 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2, } EXPORT_SYMBOL_GPL(io_uring_cmd_done); -static void io_uring_cmd_cache_sqes(struct io_kiocb *req) -{ - struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - struct io_uring_cmd_data *cache = req->async_data; - - memcpy(cache->sqes, ioucmd->sqe, uring_sqe_size(req->ctx)); - ioucmd->sqe = cache->sqes; -} - static int io_uring_cmd_prep_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -185,10 +176,15 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, return -ENOMEM; cache->op_data = NULL; - ioucmd->sqe = sqe; - /* defer memcpy until we need it */ - if (unlikely(req->flags & REQ_F_FORCE_ASYNC)) - io_uring_cmd_cache_sqes(req); + /* + * Unconditionally cache the SQE for now - this is only needed for + * requests that go async, but prep handlers must ensure that any + * sqe data is stable beyond prep. 
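A rough userspace-side illustration of the hazard being defended against (liburing; request setup elided): once io_uring_submit() returns, the application may legitimately reuse the SQ slot for the next request even though the first has not completed, so any SQE data the kernel still needs after prep must already have been copied.

	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
	/* fill in a uring_cmd SQE for request A ... */
	io_uring_submit(&ring);                 /* kernel consumes the SQ slot here */

	sqe = io_uring_get_sqe(&ring);          /* may hand back the same slot for B */
	/* if A later goes async and the kernel re-read its SQE at this point,
	 * without a copy made at prep time it would see B's data instead
	 */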
Since uring_cmd is special in + * that it doesn't read in per-op data, play it safe and ensure that + * any SQE data is stable beyond prep. This can later get relaxed. + */ + memcpy(cache->sqes, sqe, uring_sqe_size(req->ctx)); + ioucmd->sqe = cache->sqes; return 0; } @@ -251,16 +247,8 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) } ret = file->f_op->uring_cmd(ioucmd, issue_flags); - if (ret == -EAGAIN) { - struct io_uring_cmd_data *cache = req->async_data; - - if (ioucmd->sqe != cache->sqes) - io_uring_cmd_cache_sqes(req); - return -EAGAIN; - } else if (ret == -EIOCBQUEUED) { - return -EIOCBQUEUED; - } - + if (ret == -EAGAIN || ret == -EIOCBQUEUED) + return ret; if (ret < 0) req_set_fail(req); io_req_uring_cleanup(req, issue_flags); -- GitLab From fbe8f2fa971c537571994a0df532c511c4fb5537 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 12 Feb 2025 09:11:07 -0800 Subject: [PATCH 557/989] md/raid*: Fix the set_queue_limits implementations queue_limits_cancel_update() must only be called if queue_limits_start_update() is called first. Remove the queue_limits_cancel_update() calls from the raid*_set_limits() functions because there is no corresponding queue_limits_start_update() call. Cc: Christoph Hellwig Fixes: c6e56cf6b2e7 ("block: move integrity information into queue_limits") Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/linux-raid/20250212171108.3483150-1-bvanassche@acm.org/ Signed-off-by: Yu Kuai --- drivers/md/raid0.c | 4 +--- drivers/md/raid1.c | 4 +--- drivers/md/raid10.c | 4 +--- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 8fc9339b00c72..70bcc3cdf2cd1 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -386,10 +386,8 @@ static int raid0_set_limits(struct mddev *mddev) lim.io_opt = lim.io_min * mddev->raid_disks; lim.features |= BLK_FEAT_ATOMIC_WRITES; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); - if (err) { - queue_limits_cancel_update(mddev->gendisk->queue); + if (err) return err; - } return queue_limits_set(mddev->gendisk->queue, &lim); } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 9d57a88dbd261..10ea3af40991d 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3219,10 +3219,8 @@ static int raid1_set_limits(struct mddev *mddev) lim.max_write_zeroes_sectors = 0; lim.features |= BLK_FEAT_ATOMIC_WRITES; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); - if (err) { - queue_limits_cancel_update(mddev->gendisk->queue); + if (err) return err; - } return queue_limits_set(mddev->gendisk->queue, &lim); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index efe93b9791677..15b9ae5bf84d8 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4020,10 +4020,8 @@ static int raid10_set_queue_limits(struct mddev *mddev) lim.io_opt = lim.io_min * raid10_nr_stripes(conf); lim.features |= BLK_FEAT_ATOMIC_WRITES; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); - if (err) { - queue_limits_cancel_update(mddev->gendisk->queue); + if (err) return err; - } return queue_limits_set(mddev->gendisk->queue, &lim); } -- GitLab From cd57e4327707126dca3f9517b84274c001d4c184 Mon Sep 17 00:00:00 2001 From: Pei Xiao Date: Tue, 11 Feb 2025 10:29:48 +0800 Subject: [PATCH 558/989] phy: freescale: fsl-samsung-hdmi: Limit PLL lock detection clock divider to valid range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FIELD_PREP() checks 
that a value fits into the available bitfield, but the index div equals to 4,is out of range. which gcc complains about: In function ‘fsl_samsung_hdmi_phy_configure_pll_lock_det’, inlined from ‘fsl_samsung_hdmi_phy_configure’ at drivers/phy/freescale/phy-fsl-samsung-hdmi.c :470:2: ././include/linux/compiler_types.h:542:38: error: call to ‘__compiletime_assert_538’ declared with attribute error: FIELD_PREP: value too large for the field 542 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) | ^ ././include/linux/compiler_types.h:523:4: note: in definition of macro ‘__compiletime_assert’ 523 | prefix ## suffix(); | ^~~~~~ ././include/linux/compiler_types.h:542:2: note: in expansion of macro ‘_compiletime_assert’ 542 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) REG12_CK_DIV_MASK only two bit, limit div to range 0~3, so build error will fix. Fixes: d567679f2b6a ("phy: freescale: fsl-samsung-hdmi: Clean up fld_tg_code calculation") Signed-off-by: Pei Xiao Changlog: Reviewed-by: Adam Ford Link: https://lore.kernel.org/r/tencent_6F503D43467AA99DD8CC59B8F645F0725B0A@qq.com Signed-off-by: Vinod Koul --- drivers/phy/freescale/phy-fsl-samsung-hdmi.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/phy/freescale/phy-fsl-samsung-hdmi.c b/drivers/phy/freescale/phy-fsl-samsung-hdmi.c index 45004f598e4dc..e4c0a82d16d9e 100644 --- a/drivers/phy/freescale/phy-fsl-samsung-hdmi.c +++ b/drivers/phy/freescale/phy-fsl-samsung-hdmi.c @@ -325,7 +325,7 @@ to_fsl_samsung_hdmi_phy(struct clk_hw *hw) return container_of(hw, struct fsl_samsung_hdmi_phy, hw); } -static void +static int fsl_samsung_hdmi_phy_configure_pll_lock_det(struct fsl_samsung_hdmi_phy *phy, const struct phy_config *cfg) { @@ -341,6 +341,9 @@ fsl_samsung_hdmi_phy_configure_pll_lock_det(struct fsl_samsung_hdmi_phy *phy, break; } + if (unlikely(div == 4)) + return -EINVAL; + writeb(FIELD_PREP(REG12_CK_DIV_MASK, div), phy->regs + PHY_REG(12)); /* @@ -364,6 +367,8 @@ fsl_samsung_hdmi_phy_configure_pll_lock_det(struct fsl_samsung_hdmi_phy *phy, FIELD_PREP(REG14_RP_CODE_MASK, 2) | FIELD_PREP(REG14_TG_CODE_HIGH_MASK, fld_tg_code >> 8), phy->regs + PHY_REG(14)); + + return 0; } static unsigned long fsl_samsung_hdmi_phy_find_pms(unsigned long fout, u8 *p, u16 *m, u8 *s) @@ -466,7 +471,11 @@ static int fsl_samsung_hdmi_phy_configure(struct fsl_samsung_hdmi_phy *phy, writeb(REG21_SEL_TX_CK_INV | FIELD_PREP(REG21_PMS_S_MASK, cfg->pll_div_regs[2] >> 4), phy->regs + PHY_REG(21)); - fsl_samsung_hdmi_phy_configure_pll_lock_det(phy, cfg); + ret = fsl_samsung_hdmi_phy_configure_pll_lock_det(phy, cfg); + if (ret) { + dev_err(phy->dev, "pixclock too large\n"); + return ret; + } writeb(REG33_FIX_DA | REG33_MODE_SET_DONE, phy->regs + PHY_REG(33)); -- GitLab From 7b4aebeecbbd5b5fe73e35fad3f62ed21aa7ef44 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 13 Feb 2025 17:56:46 +0200 Subject: [PATCH 559/989] gpiolib: Fix crash on error in gpiochip_get_ngpios() The gpiochip_get_ngpios() uses chip_*() macros to print messages. However these macros rely on gpiodev to be initialised and set, which is not the case when called via bgpio_init(). In such a case the printing messages will crash on NULL pointer dereference. Replace chip_*() macros by the respective dev_*() ones to avoid such crash. 
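For context, the expansion that makes the old logging unsafe (macro body paraphrased from drivers/gpio/gpiolib.h, so treat the exact form as an assumption): chip_err(gc, ...) boils down to dev_err(&gc->gpiodev->dev, ...), and gc->gpiodev is still NULL when gpiochip_get_ngpios() is reached from bgpio_init().

	/* old form: dereferences gc->gpiodev, which bgpio_init() has not set yet */
	chip_err(gc, "tried to insert a GPIO chip with zero lines\n");

	/* new form: logs against the parent device passed in by the caller */
	dev_err(dev, "tried to insert a GPIO chip with zero lines\n");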
Fixes: 55b2395e4e92 ("gpio: mmio: handle "ngpios" properly in bgpio_init()") Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250213155646.2882324-1-andriy.shevchenko@linux.intel.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 679ed764cb143..ca2f58a2cd45e 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -904,13 +904,13 @@ int gpiochip_get_ngpios(struct gpio_chip *gc, struct device *dev) } if (gc->ngpio == 0) { - chip_err(gc, "tried to insert a GPIO chip with zero lines\n"); + dev_err(dev, "tried to insert a GPIO chip with zero lines\n"); return -EINVAL; } if (gc->ngpio > FASTPATH_NGPIO) - chip_warn(gc, "line cnt %u is greater than fast path cnt %u\n", - gc->ngpio, FASTPATH_NGPIO); + dev_warn(dev, "line cnt %u is greater than fast path cnt %u\n", + gc->ngpio, FASTPATH_NGPIO); return 0; } -- GitLab From fbc7e61195e23f744814e78524b73b59faa54ab4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 10 Feb 2025 19:52:19 +0000 Subject: [PATCH 560/989] KVM: arm64: Unconditionally save+flush host FPSIMD/SVE/SME state There are several problems with the way hyp code lazily saves the host's FPSIMD/SVE state, including: * Host SVE being discarded unexpectedly due to inconsistent configuration of TIF_SVE and CPACR_ELx.ZEN. This has been seen to result in QEMU crashes where SVE is used by memmove(), as reported by Eric Auger: https://issues.redhat.com/browse/RHEL-68997 * Host SVE state is discarded *after* modification by ptrace, which was an unintentional ptrace ABI change introduced with lazy discarding of SVE state. * The host FPMR value can be discarded when running a non-protected VM, where FPMR support is not exposed to a VM, and that VM uses FPSIMD/SVE. In these cases the hyp code does not save the host's FPMR before unbinding the host's FPSIMD/SVE/SME state, leaving a stale value in memory. Avoid these by eagerly saving and "flushing" the host's FPSIMD/SVE/SME state when loading a vCPU such that KVM does not need to save any of the host's FPSIMD/SVE/SME state. For clarity, fpsimd_kvm_prepare() is removed and the necessary call to fpsimd_save_and_flush_cpu_state() is placed in kvm_arch_vcpu_load_fp(). As 'fpsimd_state' and 'fpmr_ptr' should not be used, they are set to NULL; all uses of these will be removed in subsequent patches. Historical problems go back at least as far as v5.17, e.g. erroneous assumptions about TIF_SVE being clear in commit: 8383741ab2e773a9 ("KVM: arm64: Get rid of host SVE tracking/saving") ... and so this eager save+flush probably needs to be backported to ALL stable trees. 
Fixes: 93ae6b01bafee8fa ("KVM: arm64: Discard any SVE state when entering KVM guests") Fixes: 8c845e2731041f0f ("arm64/sve: Leave SVE enabled on syscall if we don't context switch") Fixes: ef3be86021c3bdf3 ("KVM: arm64: Add save/restore support for FPMR") Reported-by: Eric Auger Reported-by: Wilco Dijkstra Reviewed-by: Mark Brown Tested-by: Mark Brown Tested-by: Eric Auger Acked-by: Will Deacon Cc: Catalin Marinas Cc: Florian Weimer Cc: Fuad Tabba Cc: Jeremy Linton Cc: Marc Zyngier Cc: Oliver Upton Cc: Paolo Bonzini Signed-off-by: Mark Rutland Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250210195226.1215254-2-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kernel/fpsimd.c | 25 ------------------------- arch/arm64/kvm/fpsimd.c | 35 ++++++++++------------------------- 2 files changed, 10 insertions(+), 50 deletions(-) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 8c4c1a2186cc5..ec68d520b7ca7 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1694,31 +1694,6 @@ void fpsimd_signal_preserve_current_state(void) sve_to_fpsimd(current); } -/* - * Called by KVM when entering the guest. - */ -void fpsimd_kvm_prepare(void) -{ - if (!system_supports_sve()) - return; - - /* - * KVM does not save host SVE state since we can only enter - * the guest from a syscall so the ABI means that only the - * non-saved SVE state needs to be saved. If we have left - * SVE enabled for performance reasons then update the task - * state to be FPSIMD only. - */ - get_cpu_fpsimd_context(); - - if (test_and_clear_thread_flag(TIF_SVE)) { - sve_to_fpsimd(current); - current->thread.fp_type = FP_STATE_FPSIMD; - } - - put_cpu_fpsimd_context(); -} - /* * Associate current's FPSIMD context with this cpu * The caller must have ownership of the cpu FPSIMD context before calling diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 4d3d1a2eb1570..ceeb0a4893aa7 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -54,16 +54,18 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) if (!system_supports_fpsimd()) return; - fpsimd_kvm_prepare(); - /* - * We will check TIF_FOREIGN_FPSTATE just before entering the - * guest in kvm_arch_vcpu_ctxflush_fp() and override this to - * FP_STATE_FREE if the flag set. + * Ensure that any host FPSIMD/SVE/SME state is saved and unbound such + * that the host kernel is responsible for restoring this state upon + * return to userspace, and the hyp code doesn't need to save anything. + * + * When the host may use SME, fpsimd_save_and_flush_cpu_state() ensures + * that PSTATE.{SM,ZA} == {0,0}. */ - *host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED; - *host_data_ptr(fpsimd_state) = kern_hyp_va(¤t->thread.uw.fpsimd_state); - *host_data_ptr(fpmr_ptr) = kern_hyp_va(¤t->thread.uw.fpmr); + fpsimd_save_and_flush_cpu_state(); + *host_data_ptr(fp_owner) = FP_STATE_FREE; + *host_data_ptr(fpsimd_state) = NULL; + *host_data_ptr(fpmr_ptr) = NULL; host_data_clear_flag(HOST_SVE_ENABLED); if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) @@ -73,23 +75,6 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) host_data_clear_flag(HOST_SME_ENABLED); if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN) host_data_set_flag(HOST_SME_ENABLED); - - /* - * If PSTATE.SM is enabled then save any pending FP - * state and disable PSTATE.SM. 
If we leave PSTATE.SM - * enabled and the guest does not enable SME via - * CPACR_EL1.SMEN then operations that should be valid - * may generate SME traps from EL1 to EL1 which we - * can't intercept and which would confuse the guest. - * - * Do the same for PSTATE.ZA in the case where there - * is state in the registers which has not already - * been saved, this is very unlikely to happen. - */ - if (read_sysreg_s(SYS_SVCR) & (SVCR_SM_MASK | SVCR_ZA_MASK)) { - *host_data_ptr(fp_owner) = FP_STATE_FREE; - fpsimd_save_and_flush_cpu_state(); - } } /* -- GitLab From 8eca7f6d5100b6997df4f532090bc3f7e0203bef Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 10 Feb 2025 19:52:20 +0000 Subject: [PATCH 561/989] KVM: arm64: Remove host FPSIMD saving for non-protected KVM Now that the host eagerly saves its own FPSIMD/SVE/SME state, non-protected KVM never needs to save the host FPSIMD/SVE/SME state, and the code to do this is never used. Protected KVM still needs to save/restore the host FPSIMD/SVE state to avoid leaking guest state to the host (and to avoid revealing to the host whether the guest used FPSIMD/SVE/SME), and that code needs to be retained. Remove the unused code and data structures. To avoid the need for a stub copy of kvm_hyp_save_fpsimd_host() in the VHE hyp code, the nVHE/hVHE version is moved into the shared switch header, where it is only invoked when KVM is in protected mode. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Tested-by: Mark Brown Acked-by: Will Deacon Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Oliver Upton Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250210195226.1215254-3-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 20 +++++------------- arch/arm64/kvm/arm.c | 8 ------- arch/arm64/kvm/fpsimd.c | 2 -- arch/arm64/kvm/hyp/include/hyp/switch.h | 25 ++++++++++++++++++++-- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 2 +- arch/arm64/kvm/hyp/nvhe/switch.c | 28 ------------------------- arch/arm64/kvm/hyp/vhe/switch.c | 8 ------- 7 files changed, 29 insertions(+), 64 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 7cfa024de4e34..f56c07568591f 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -624,23 +624,13 @@ struct kvm_host_data { struct kvm_cpu_context host_ctxt; /* - * All pointers in this union are hyp VA. + * Hyp VA. * sve_state is only used in pKVM and if system_supports_sve(). */ - union { - struct user_fpsimd_state *fpsimd_state; - struct cpu_sve_state *sve_state; - }; - - union { - /* HYP VA pointer to the host storage for FPMR */ - u64 *fpmr_ptr; - /* - * Used by pKVM only, as it needs to provide storage - * for the host - */ - u64 fpmr; - }; + struct cpu_sve_state *sve_state; + + /* Used by pKVM only. 
*/ + u64 fpmr; /* Ownership of the FP regs */ enum { diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 62c650c2f7b67..4b7389ad94f55 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2481,14 +2481,6 @@ static void finalize_init_hyp_mode(void) per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = kern_hyp_va(sve_state); } - } else { - for_each_possible_cpu(cpu) { - struct user_fpsimd_state *fpsimd_state; - - fpsimd_state = &per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->host_ctxt.fp_regs; - per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->fpsimd_state = - kern_hyp_va(fpsimd_state); - } } } diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index ceeb0a4893aa7..332cb3904e68b 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -64,8 +64,6 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) */ fpsimd_save_and_flush_cpu_state(); *host_data_ptr(fp_owner) = FP_STATE_FREE; - *host_data_ptr(fpsimd_state) = NULL; - *host_data_ptr(fpmr_ptr) = NULL; host_data_clear_flag(HOST_SVE_ENABLED); if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index f838a45665f26..c5b8a11ac4f50 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -375,7 +375,28 @@ static inline void __hyp_sve_save_host(void) true); } -static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu); +static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) +{ + /* + * Non-protected kvm relies on the host restoring its sve state. + * Protected kvm restores the host's sve state as not to reveal that + * fpsimd was used by a guest nor leak upper sve bits. + */ + if (system_supports_sve()) { + __hyp_sve_save_host(); + + /* Re-enable SVE traps if not supported for the guest vcpu. */ + if (!vcpu_has_sve(vcpu)) + cpacr_clear_set(CPACR_EL1_ZEN, 0); + + } else { + __fpsimd_save_state(host_data_ptr(host_ctxt.fp_regs)); + } + + if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) + *host_data_ptr(fpmr) = read_sysreg_s(SYS_FPMR); +} + /* * We trap the first access to the FP/SIMD to save the host context and @@ -425,7 +446,7 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) isb(); /* Write out the host state if it's in the registers */ - if (host_owns_fp_regs()) + if (is_protected_kvm_enabled() && host_owns_fp_regs()) kvm_hyp_save_fpsimd_host(vcpu); /* Restore the guest state */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 6e12c070832f7..1a334a38d8fd2 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -83,7 +83,7 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) if (system_supports_sve()) __hyp_sve_restore_host(); else - __fpsimd_restore_state(*host_data_ptr(fpsimd_state)); + __fpsimd_restore_state(host_data_ptr(host_ctxt.fp_regs)); if (has_fpmr) write_sysreg_s(*host_data_ptr(fpmr), SYS_FPMR); diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 6c846d033d24a..7a2d189176249 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -192,34 +192,6 @@ static bool kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu, u64 *exit_code) kvm_handle_pvm_sysreg(vcpu, exit_code)); } -static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) -{ - /* - * Non-protected kvm relies on the host restoring its sve state. 
- * Protected kvm restores the host's sve state as not to reveal that - * fpsimd was used by a guest nor leak upper sve bits. - */ - if (unlikely(is_protected_kvm_enabled() && system_supports_sve())) { - __hyp_sve_save_host(); - - /* Re-enable SVE traps if not supported for the guest vcpu. */ - if (!vcpu_has_sve(vcpu)) - cpacr_clear_set(CPACR_EL1_ZEN, 0); - - } else { - __fpsimd_save_state(*host_data_ptr(fpsimd_state)); - } - - if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) { - u64 val = read_sysreg_s(SYS_FPMR); - - if (unlikely(is_protected_kvm_enabled())) - *host_data_ptr(fpmr) = val; - else - **host_data_ptr(fpmr_ptr) = val; - } -} - static const exit_handler_fn hyp_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = NULL, [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index b5b9dbaf1fdd6..e8a07d4bb546b 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -413,14 +413,6 @@ static bool kvm_hyp_handle_eret(struct kvm_vcpu *vcpu, u64 *exit_code) return true; } -static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) -{ - __fpsimd_save_state(*host_data_ptr(fpsimd_state)); - - if (kvm_has_fpmr(vcpu->kvm)) - **host_data_ptr(fpmr_ptr) = read_sysreg_s(SYS_FPMR); -} - static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code) { int ret = -EINVAL; -- GitLab From 459f059be702056d91537b99a129994aa6ccdd35 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 10 Feb 2025 19:52:21 +0000 Subject: [PATCH 562/989] KVM: arm64: Remove VHE host restore of CPACR_EL1.ZEN When KVM is in VHE mode, the host kernel tries to save and restore the configuration of CPACR_EL1.ZEN (i.e. CPTR_EL2.ZEN when HCR_EL2.E2H=1) across kvm_arch_vcpu_load_fp() and kvm_arch_vcpu_put_fp(), since the configuration may be clobbered by hyp when running a vCPU. This logic is currently redundant. The VHE hyp code unconditionally configures CPTR_EL2.ZEN to 0b01 when returning to the host, permitting host kernel usage of SVE. Now that the host eagerly saves and unbinds its own FPSIMD/SVE/SME state, there's no need to save/restore the state of the EL0 SVE trap. The kernel can safely save/restore state without trapping, as described above, and will restore userspace state (including trap controls) before returning to userspace. Remove the redundant logic. 
Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Tested-by: Mark Brown Acked-by: Will Deacon Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Oliver Upton Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250210195226.1215254-4-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/kvm/fpsimd.c | 16 ---------------- 2 files changed, 17 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index f56c07568591f..ed6841bf21b22 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -615,7 +615,6 @@ struct cpu_sve_state { struct kvm_host_data { #define KVM_HOST_DATA_FLAG_HAS_SPE 0 #define KVM_HOST_DATA_FLAG_HAS_TRBE 1 -#define KVM_HOST_DATA_FLAG_HOST_SVE_ENABLED 2 #define KVM_HOST_DATA_FLAG_HOST_SME_ENABLED 3 #define KVM_HOST_DATA_FLAG_TRBE_ENABLED 4 #define KVM_HOST_DATA_FLAG_EL1_TRACING_CONFIGURED 5 diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 332cb3904e68b..4ff0dee1a403f 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -65,10 +65,6 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) fpsimd_save_and_flush_cpu_state(); *host_data_ptr(fp_owner) = FP_STATE_FREE; - host_data_clear_flag(HOST_SVE_ENABLED); - if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) - host_data_set_flag(HOST_SVE_ENABLED); - if (system_supports_sme()) { host_data_clear_flag(HOST_SME_ENABLED); if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN) @@ -202,18 +198,6 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) * when needed. */ fpsimd_save_and_flush_cpu_state(); - } else if (has_vhe() && system_supports_sve()) { - /* - * The FPSIMD/SVE state in the CPU has not been touched, and we - * have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been - * reset by kvm_reset_cptr_el2() in the Hyp code, disabling SVE - * for EL0. To avoid spurious traps, restore the trap state - * seen by kvm_arch_vcpu_load_fp(): - */ - if (host_data_test_flag(HOST_SVE_ENABLED)) - sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_ZEN_EL0EN); - else - sysreg_clear_set(CPACR_EL1, CPACR_EL1_ZEN_EL0EN, 0); } local_irq_restore(flags); -- GitLab From 407a99c4654e8ea65393f412c421a55cac539f5b Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 10 Feb 2025 19:52:22 +0000 Subject: [PATCH 563/989] KVM: arm64: Remove VHE host restore of CPACR_EL1.SMEN When KVM is in VHE mode, the host kernel tries to save and restore the configuration of CPACR_EL1.SMEN (i.e. CPTR_EL2.SMEN when HCR_EL2.E2H=1) across kvm_arch_vcpu_load_fp() and kvm_arch_vcpu_put_fp(), since the configuration may be clobbered by hyp when running a vCPU. This logic has historically been broken, and is currently redundant. This logic was originally introduced in commit: 861262ab86270206 ("KVM: arm64: Handle SME host state when running guests") At the time, the VHE hyp code would reset CPTR_EL2.SMEN to 0b00 when returning to the host, trapping host access to SME state. Unfortunately, this was unsafe as the host could take a softirq before calling kvm_arch_vcpu_put_fp(), and if a softirq handler were to use kernel mode NEON the resulting attempt to save the live FPSIMD/SVE/SME state would result in a fatal trap. That issue was limited to VHE mode. For nVHE/hVHE modes, KVM always saved/restored the host kernel's CPACR_EL1 value, and configured CPTR_EL2.TSM to 0b0, ensuring that host usage of SME would not be trapped. 
The issue above was incidentally fixed by commit: 375110ab51dec5dc ("KVM: arm64: Fix resetting SME trap values on reset for (h)VHE") That commit changed the VHE hyp code to configure CPTR_EL2.SMEN to 0b01 when returning to the host, permitting host kernel usage of SME, avoiding the issue described above. At the time, this was not identified as a fix for commit 861262ab86270206. Now that the host eagerly saves and unbinds its own FPSIMD/SVE/SME state, there's no need to save/restore the state of the EL0 SME trap. The kernel can safely save/restore state without trapping, as described above, and will restore userspace state (including trap controls) before returning to userspace. Remove the redundant logic. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Tested-by: Mark Brown Acked-by: Will Deacon Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Oliver Upton Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250210195226.1215254-5-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/kvm/fpsimd.c | 21 --------------------- 2 files changed, 22 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index ed6841bf21b22..c77acc9904576 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -615,7 +615,6 @@ struct cpu_sve_state { struct kvm_host_data { #define KVM_HOST_DATA_FLAG_HAS_SPE 0 #define KVM_HOST_DATA_FLAG_HAS_TRBE 1 -#define KVM_HOST_DATA_FLAG_HOST_SME_ENABLED 3 #define KVM_HOST_DATA_FLAG_TRBE_ENABLED 4 #define KVM_HOST_DATA_FLAG_EL1_TRACING_CONFIGURED 5 unsigned long flags; diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 4ff0dee1a403f..f64724197958e 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -65,12 +65,6 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) fpsimd_save_and_flush_cpu_state(); *host_data_ptr(fp_owner) = FP_STATE_FREE; - if (system_supports_sme()) { - host_data_clear_flag(HOST_SME_ENABLED); - if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN) - host_data_set_flag(HOST_SME_ENABLED); - } - /* * If normal guests gain SME support, maintain this behavior for pKVM * guests, which don't support SME. @@ -141,21 +135,6 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) local_irq_save(flags); - /* - * If we have VHE then the Hyp code will reset CPACR_EL1 to - * the default value and we need to reenable SME. - */ - if (has_vhe() && system_supports_sme()) { - /* Also restore EL0 state seen on entry */ - if (host_data_test_flag(HOST_SME_ENABLED)) - sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_SMEN); - else - sysreg_clear_set(CPACR_EL1, - CPACR_EL1_SMEN_EL0EN, - CPACR_EL1_SMEN_EL1EN); - isb(); - } - if (guest_owns_fp_regs()) { if (vcpu_has_sve(vcpu)) { u64 zcr = read_sysreg_el1(SYS_ZCR); -- GitLab From ee14db31a9c84e65f5adfd45598760d851f1d817 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 10 Feb 2025 19:52:23 +0000 Subject: [PATCH 564/989] KVM: arm64: Refactor CPTR trap deactivation For historical reasons, the VHE and nVHE/hVHE implementations of __activate_cptr_traps() pair with a common implementation of __kvm_reset_cptr_el2(), which ideally would be named __deactivate_cptr_traps(). Rename __kvm_reset_cptr_el2() to __deactivate_cptr_traps(), and split it into separate VHE and nVHE/hVHE variants so that each can be paired with its corresponding implementation of __activate_cptr_traps(). At the same time, fold kvm_write_cptr_el2() into its callers. 
This makes it clear in-context whether a write is made to the CPACR_EL1 encoding or the CPTR_EL2 encoding, and removes the possibility of confusion as to whether kvm_write_cptr_el2() reformats the sysreg fields as cpacr_clear_set() does. In the nVHE/hVHE implementation of __activate_cptr_traps(), placing the sysreg writes within the if-else blocks requires that the call to __activate_traps_fpsimd32() is moved earlier, but as this was always called before writing to CPTR_EL2/CPACR_EL1, this should not result in a functional change. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Tested-by: Mark Brown Acked-by: Will Deacon Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Oliver Upton Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250210195226.1215254-6-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_emulate.h | 42 ---------------------------- arch/arm64/kvm/hyp/nvhe/switch.c | 35 ++++++++++++++++++++--- arch/arm64/kvm/hyp/vhe/switch.c | 12 +++++++- 3 files changed, 42 insertions(+), 47 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 47f2cf408eeda..78ec1ef2cfe82 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -605,48 +605,6 @@ static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu) __cpacr_to_cptr_set(clr, set));\ } while (0) -static __always_inline void kvm_write_cptr_el2(u64 val) -{ - if (has_vhe() || has_hvhe()) - write_sysreg(val, cpacr_el1); - else - write_sysreg(val, cptr_el2); -} - -/* Resets the value of cptr_el2 when returning to the host. */ -static __always_inline void __kvm_reset_cptr_el2(struct kvm *kvm) -{ - u64 val; - - if (has_vhe()) { - val = (CPACR_EL1_FPEN | CPACR_EL1_ZEN_EL1EN); - if (cpus_have_final_cap(ARM64_SME)) - val |= CPACR_EL1_SMEN_EL1EN; - } else if (has_hvhe()) { - val = CPACR_EL1_FPEN; - - if (!kvm_has_sve(kvm) || !guest_owns_fp_regs()) - val |= CPACR_EL1_ZEN; - if (cpus_have_final_cap(ARM64_SME)) - val |= CPACR_EL1_SMEN; - } else { - val = CPTR_NVHE_EL2_RES1; - - if (kvm_has_sve(kvm) && guest_owns_fp_regs()) - val |= CPTR_EL2_TZ; - if (!cpus_have_final_cap(ARM64_SME)) - val |= CPTR_EL2_TSM; - } - - kvm_write_cptr_el2(val); -} - -#ifdef __KVM_NVHE_HYPERVISOR__ -#define kvm_reset_cptr_el2(v) __kvm_reset_cptr_el2(kern_hyp_va((v)->kvm)) -#else -#define kvm_reset_cptr_el2(v) __kvm_reset_cptr_el2((v)->kvm) -#endif - /* * Returns a 'sanitised' view of CPTR_EL2, translating from nVHE to the VHE * format if E2H isn't set. 
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 7a2d189176249..5d79f63a4f861 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -39,6 +39,9 @@ static void __activate_cptr_traps(struct kvm_vcpu *vcpu) { u64 val = CPTR_EL2_TAM; /* Same bit irrespective of E2H */ + if (!guest_owns_fp_regs()) + __activate_traps_fpsimd32(vcpu); + if (has_hvhe()) { val |= CPACR_EL1_TTA; @@ -47,6 +50,8 @@ static void __activate_cptr_traps(struct kvm_vcpu *vcpu) if (vcpu_has_sve(vcpu)) val |= CPACR_EL1_ZEN; } + + write_sysreg(val, cpacr_el1); } else { val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1; @@ -61,12 +66,34 @@ static void __activate_cptr_traps(struct kvm_vcpu *vcpu) if (!guest_owns_fp_regs()) val |= CPTR_EL2_TFP; + + write_sysreg(val, cptr_el2); } +} - if (!guest_owns_fp_regs()) - __activate_traps_fpsimd32(vcpu); +static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = kern_hyp_va(vcpu->kvm); - kvm_write_cptr_el2(val); + if (has_hvhe()) { + u64 val = CPACR_EL1_FPEN; + + if (!kvm_has_sve(kvm) || !guest_owns_fp_regs()) + val |= CPACR_EL1_ZEN; + if (cpus_have_final_cap(ARM64_SME)) + val |= CPACR_EL1_SMEN; + + write_sysreg(val, cpacr_el1); + } else { + u64 val = CPTR_NVHE_EL2_RES1; + + if (kvm_has_sve(kvm) && guest_owns_fp_regs()) + val |= CPTR_EL2_TZ; + if (!cpus_have_final_cap(ARM64_SME)) + val |= CPTR_EL2_TSM; + + write_sysreg(val, cptr_el2); + } } static void __activate_traps(struct kvm_vcpu *vcpu) @@ -119,7 +146,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2); - kvm_reset_cptr_el2(vcpu); + __deactivate_cptr_traps(vcpu); write_sysreg(__kvm_hyp_host_vector, vbar_el2); } diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index e8a07d4bb546b..4748b1947ffa0 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -136,6 +136,16 @@ static void __activate_cptr_traps(struct kvm_vcpu *vcpu) write_sysreg(val, cpacr_el1); } +static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) +{ + u64 val = CPACR_EL1_FPEN | CPACR_EL1_ZEN_EL1EN; + + if (cpus_have_final_cap(ARM64_SME)) + val |= CPACR_EL1_SMEN_EL1EN; + + write_sysreg(val, cpacr_el1); +} + static void __activate_traps(struct kvm_vcpu *vcpu) { u64 val; @@ -207,7 +217,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) */ asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); - kvm_reset_cptr_el2(vcpu); + __deactivate_cptr_traps(vcpu); if (!arm64_kernel_unmapped_at_el0()) host_vectors = __this_cpu_read(this_cpu_vector); -- GitLab From 9b66195063c5a145843547b1d692bd189be85287 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 10 Feb 2025 19:52:24 +0000 Subject: [PATCH 565/989] KVM: arm64: Refactor exit handlers The hyp exit handling logic is largely shared between VHE and nVHE/hVHE, with common logic in arch/arm64/kvm/hyp/include/hyp/switch.h. The code in the header depends on function definitions provided by arch/arm64/kvm/hyp/vhe/switch.c and arch/arm64/kvm/hyp/nvhe/switch.c when they include the header. This is an unusual header dependency, and prevents the use of arch/arm64/kvm/hyp/include/hyp/switch.h in other files as this would result in compiler warnings regarding missing definitions, e.g. 
| In file included from arch/arm64/kvm/hyp/nvhe/hyp-main.c:8: | ./arch/arm64/kvm/hyp/include/hyp/switch.h:733:31: warning: 'kvm_get_exit_handler_array' used but never defined | 733 | static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu); | | ^~~~~~~~~~~~~~~~~~~~~~~~~~ | ./arch/arm64/kvm/hyp/include/hyp/switch.h:735:13: warning: 'early_exit_filter' used but never defined | 735 | static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code); | | ^~~~~~~~~~~~~~~~~ Refactor the logic such that the header doesn't depend on anything from the C files. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Tested-by: Mark Brown Acked-by: Will Deacon Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Oliver Upton Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250210195226.1215254-7-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/switch.h | 30 +++++-------------------- arch/arm64/kvm/hyp/nvhe/switch.c | 28 +++++++++++++---------- arch/arm64/kvm/hyp/vhe/switch.c | 9 ++++---- 3 files changed, 26 insertions(+), 41 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index c5b8a11ac4f50..46df5c2eeaf57 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -679,23 +679,16 @@ static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) typedef bool (*exit_handler_fn)(struct kvm_vcpu *, u64 *); -static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu); - -static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code); - /* * Allow the hypervisor to handle the exit with an exit handler if it has one. * * Returns true if the hypervisor handled the exit, and control should go back * to the guest, or false if it hasn't. */ -static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code, + const exit_handler_fn *handlers) { - const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu); - exit_handler_fn fn; - - fn = handlers[kvm_vcpu_trap_get_class(vcpu)]; - + exit_handler_fn fn = handlers[kvm_vcpu_trap_get_class(vcpu)]; if (fn) return fn(vcpu, exit_code); @@ -725,20 +718,9 @@ static inline void synchronize_vcpu_pstate(struct kvm_vcpu *vcpu, u64 *exit_code * the guest, false when we should restore the host state and return to the * main run loop. */ -static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool __fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code, + const exit_handler_fn *handlers) { - /* - * Save PSTATE early so that we can evaluate the vcpu mode - * early on. - */ - synchronize_vcpu_pstate(vcpu, exit_code); - - /* - * Check whether we want to repaint the state one way or - * another. - */ - early_exit_filter(vcpu, exit_code); - if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ) vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR); @@ -768,7 +750,7 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) goto exit; /* Check if there's an exit handler and allow it to handle the exit. 
*/ - if (kvm_hyp_handle_exit(vcpu, exit_code)) + if (kvm_hyp_handle_exit(vcpu, exit_code, handlers)) goto guest; exit: /* Return to the host kernel and handle the exit */ diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 5d79f63a4f861..69d7d3b4294a7 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -250,19 +250,21 @@ static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) return hyp_exit_handlers; } -/* - * Some guests (e.g., protected VMs) are not be allowed to run in AArch32. - * The ARMv8 architecture does not give the hypervisor a mechanism to prevent a - * guest from dropping to AArch32 EL0 if implemented by the CPU. If the - * hypervisor spots a guest in such a state ensure it is handled, and don't - * trust the host to spot or fix it. The check below is based on the one in - * kvm_arch_vcpu_ioctl_run(). - * - * Returns false if the guest ran in AArch32 when it shouldn't have, and - * thus should exit to the host, or true if a the guest run loop can continue. - */ -static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) { + const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu); + + synchronize_vcpu_pstate(vcpu, exit_code); + + /* + * Some guests (e.g., protected VMs) are not be allowed to run in + * AArch32. The ARMv8 architecture does not give the hypervisor a + * mechanism to prevent a guest from dropping to AArch32 EL0 if + * implemented by the CPU. If the hypervisor spots a guest in such a + * state ensure it is handled, and don't trust the host to spot or fix + * it. The check below is based on the one in + * kvm_arch_vcpu_ioctl_run(). + */ if (unlikely(vcpu_is_protected(vcpu) && vcpu_mode_is_32bit(vcpu))) { /* * As we have caught the guest red-handed, decide that it isn't @@ -275,6 +277,8 @@ static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) *exit_code &= BIT(ARM_EXIT_WITH_SERROR_BIT); *exit_code |= ARM_EXCEPTION_IL; } + + return __fixup_guest_exit(vcpu, exit_code, handlers); } /* Switch to the guest for legacy non-VHE systems */ diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 4748b1947ffa0..c854d84458892 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -540,13 +540,10 @@ static const exit_handler_fn hyp_exit_handlers[] = { [ESR_ELx_EC_MOPS] = kvm_hyp_handle_mops, }; -static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) +static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) { - return hyp_exit_handlers; -} + synchronize_vcpu_pstate(vcpu, exit_code); -static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) -{ /* * If we were in HYP context on entry, adjust the PSTATE view * so that the usual helpers work correctly. 
@@ -566,6 +563,8 @@ static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) *vcpu_cpsr(vcpu) &= ~(PSR_MODE_MASK | PSR_MODE32_BIT); *vcpu_cpsr(vcpu) |= mode; } + + return __fixup_guest_exit(vcpu, exit_code, hyp_exit_handlers); } /* Switch to the guest for VHE systems running in EL2 */ -- GitLab From f9dd00de1e53a47763dfad601635d18542c3836d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 10 Feb 2025 19:52:25 +0000 Subject: [PATCH 566/989] KVM: arm64: Mark some header functions as inline The shared hyp switch header has a number of static functions which might not be used by all files that include the header, and when unused they will provoke compiler warnings, e.g. | In file included from arch/arm64/kvm/hyp/nvhe/hyp-main.c:8: | ./arch/arm64/kvm/hyp/include/hyp/switch.h:703:13: warning: 'kvm_hyp_handle_dabt_low' defined but not used [-Wunused-function] | 703 | static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) | | ^~~~~~~~~~~~~~~~~~~~~~~ | ./arch/arm64/kvm/hyp/include/hyp/switch.h:682:13: warning: 'kvm_hyp_handle_cp15_32' defined but not used [-Wunused-function] | 682 | static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) | | ^~~~~~~~~~~~~~~~~~~~~~ | ./arch/arm64/kvm/hyp/include/hyp/switch.h:662:13: warning: 'kvm_hyp_handle_sysreg' defined but not used [-Wunused-function] | 662 | static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) | | ^~~~~~~~~~~~~~~~~~~~~ | ./arch/arm64/kvm/hyp/include/hyp/switch.h:458:13: warning: 'kvm_hyp_handle_fpsimd' defined but not used [-Wunused-function] | 458 | static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) | | ^~~~~~~~~~~~~~~~~~~~~ | ./arch/arm64/kvm/hyp/include/hyp/switch.h:329:13: warning: 'kvm_hyp_handle_mops' defined but not used [-Wunused-function] | 329 | static bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code) | | ^~~~~~~~~~~~~~~~~~~ Mark these functions as 'inline' to suppress this warning. This shouldn't result in any functional change. At the same time, avoid the use of __alias() in the header and alias kvm_hyp_handle_iabt_low() and kvm_hyp_handle_watchpt_low() to kvm_hyp_handle_memory_fault() using CPP, matching the style in the rest of the kernel. For consistency, kvm_hyp_handle_memory_fault() is also marked as 'inline'. 
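The underlying C behaviour, for reference: a function with internal linkage that a translation unit never calls triggers -Wunused-function, whereas a 'static inline' definition is exempt. A minimal illustration (not taken from the kernel sources):

        /* in a header included by several .c files */
        static bool helper_a(void) { return true; }        /* warns in any TU that never calls it */
        static inline bool helper_b(void) { return true; } /* never warns, even if unused */

Marking the shared hyp helpers 'inline' therefore silences the warnings without changing how they are compiled when they are actually used.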
Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Tested-by: Mark Brown Acked-by: Will Deacon Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Oliver Upton Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250210195226.1215254-8-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/switch.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 46df5c2eeaf57..163867f7f7c52 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -326,7 +326,7 @@ static inline bool __populate_fault_info(struct kvm_vcpu *vcpu) return __get_fault_info(vcpu->arch.fault.esr_el2, &vcpu->arch.fault); } -static bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code) { *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); arm64_mops_reset_regs(vcpu_gp_regs(vcpu), vcpu->arch.fault.esr_el2); @@ -404,7 +404,7 @@ static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) * If FP/SIMD is not implemented, handle the trap and inject an undefined * instruction exception to the guest. Similarly for trapped SVE accesses. */ -static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) { bool sve_guest; u8 esr_ec; @@ -608,7 +608,7 @@ static bool handle_ampere1_tcr(struct kvm_vcpu *vcpu) return true; } -static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) { if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) && handle_tx2_tvm(vcpu)) @@ -628,7 +628,7 @@ static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) return false; } -static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) { if (static_branch_unlikely(&vgic_v3_cpuif_trap) && __vgic_v3_perform_cpuif_access(vcpu) == 1) @@ -637,19 +637,18 @@ static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) return false; } -static bool kvm_hyp_handle_memory_fault(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_memory_fault(struct kvm_vcpu *vcpu, + u64 *exit_code) { if (!__populate_fault_info(vcpu)) return true; return false; } -static bool kvm_hyp_handle_iabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) - __alias(kvm_hyp_handle_memory_fault); -static bool kvm_hyp_handle_watchpt_low(struct kvm_vcpu *vcpu, u64 *exit_code) - __alias(kvm_hyp_handle_memory_fault); +#define kvm_hyp_handle_iabt_low kvm_hyp_handle_memory_fault +#define kvm_hyp_handle_watchpt_low kvm_hyp_handle_memory_fault -static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) +static inline bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) { if (kvm_hyp_handle_memory_fault(vcpu, exit_code)) return true; -- GitLab From 59419f10045bc955d2229819c7cf7a8b0b9c5b59 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 10 Feb 2025 19:52:26 +0000 Subject: [PATCH 567/989] KVM: arm64: Eagerly switch ZCR_EL{1,2} In non-protected KVM modes, while the guest FPSIMD/SVE/SME state is live on the CPU, the host's active SVE VL may differ from the guest's maximum SVE VL: * For VHE hosts, when a VM uses NV, ZCR_EL2 contains a value constrained by the guest 
hypervisor, which may be less than or equal to that guest's maximum VL. Note: in this case the value of ZCR_EL1 is immaterial due to E2H. * For nVHE/hVHE hosts, ZCR_EL1 contains a value written by the guest, which may be less than or greater than the guest's maximum VL. Note: in this case hyp code traps host SVE usage and lazily restores ZCR_EL2 to the host's maximum VL, which may be greater than the guest's maximum VL. This can be the case between exiting a guest and kvm_arch_vcpu_put_fp(). If a softirq is taken during this period and the softirq handler tries to use kernel-mode NEON, then the kernel will fail to save the guest's FPSIMD/SVE state, and will pend a SIGKILL for the current thread. This happens because kvm_arch_vcpu_ctxsync_fp() binds the guest's live FPSIMD/SVE state with the guest's maximum SVE VL, and fpsimd_save_user_state() verifies that the live SVE VL is as expected before attempting to save the register state: | if (WARN_ON(sve_get_vl() != vl)) { | force_signal_inject(SIGKILL, SI_KERNEL, 0, 0); | return; | } Fix this and make this a bit easier to reason about by always eagerly switching ZCR_EL{1,2} at hyp during guest<->host transitions. With this happening, there's no need to trap host SVE usage, and the nVHE/nVHE __deactivate_cptr_traps() logic can be simplified to enable host access to all present FPSIMD/SVE/SME features. In protected nVHE/hVHE modes, the host's state is always saved/restored by hyp, and the guest's state is saved prior to exit to the host, so from the host's PoV the guest never has live FPSIMD/SVE/SME state, and the host's ZCR_EL1 is never clobbered by hyp. Fixes: 8c8010d69c132273 ("KVM: arm64: Save/restore SVE state for nVHE") Fixes: 2e3cf82063a00ea0 ("KVM: arm64: nv: Ensure correct VL is loaded before saving SVE state") Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Tested-by: Mark Brown Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Oliver Upton Cc: Will Deacon Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250210195226.1215254-9-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/fpsimd.c | 30 ------------- arch/arm64/kvm/hyp/entry.S | 5 +++ arch/arm64/kvm/hyp/include/hyp/switch.h | 59 +++++++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 13 +++--- arch/arm64/kvm/hyp/nvhe/switch.c | 6 +-- arch/arm64/kvm/hyp/vhe/switch.c | 4 ++ 6 files changed, 76 insertions(+), 41 deletions(-) diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index f64724197958e..3cbb999419af7 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -136,36 +136,6 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) local_irq_save(flags); if (guest_owns_fp_regs()) { - if (vcpu_has_sve(vcpu)) { - u64 zcr = read_sysreg_el1(SYS_ZCR); - - /* - * If the vCPU is in the hyp context then ZCR_EL1 is - * loaded with its vEL2 counterpart. - */ - __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)) = zcr; - - /* - * Restore the VL that was saved when bound to the CPU, - * which is the maximum VL for the guest. Because the - * layout of the data when saving the sve state depends - * on the VL, we need to use a consistent (i.e., the - * maximum) VL. - * Note that this means that at guest exit ZCR_EL1 is - * not necessarily the same as on guest entry. - * - * ZCR_EL2 holds the guest hypervisor's VL when running - * a nested guest, which could be smaller than the - * max for the vCPU. Similar to above, we first need to - * switch to a VL consistent with the layout of the - * vCPU's SVE state. 
KVM support for NV implies VHE, so - * using the ZCR_EL1 alias is safe. - */ - if (!has_vhe() || (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))) - sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, - SYS_ZCR_EL1); - } - /* * Flush (save and invalidate) the fpsimd/sve state so that if * the host tries to use fpsimd/sve, it's not using stale data diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index 4433a234aa9ba..9f4e8d68ab505 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -44,6 +44,11 @@ alternative_if ARM64_HAS_RAS_EXTN alternative_else_nop_endif mrs x1, isr_el1 cbz x1, 1f + + // Ensure that __guest_enter() always provides a context + // synchronization event so that callers don't need ISBs for anything + // that would usually be synchonized by the ERET. + isb mov x0, #ARM_EXCEPTION_IRQ ret diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 163867f7f7c52..f5e882a358e2d 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -375,6 +375,65 @@ static inline void __hyp_sve_save_host(void) true); } +static inline void fpsimd_lazy_switch_to_guest(struct kvm_vcpu *vcpu) +{ + u64 zcr_el1, zcr_el2; + + if (!guest_owns_fp_regs()) + return; + + if (vcpu_has_sve(vcpu)) { + /* A guest hypervisor may restrict the effective max VL. */ + if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) + zcr_el2 = __vcpu_sys_reg(vcpu, ZCR_EL2); + else + zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; + + write_sysreg_el2(zcr_el2, SYS_ZCR); + + zcr_el1 = __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)); + write_sysreg_el1(zcr_el1, SYS_ZCR); + } +} + +static inline void fpsimd_lazy_switch_to_host(struct kvm_vcpu *vcpu) +{ + u64 zcr_el1, zcr_el2; + + if (!guest_owns_fp_regs()) + return; + + /* + * When the guest owns the FP regs, we know that guest+hyp traps for + * any FPSIMD/SVE/SME features exposed to the guest have been disabled + * by either fpsimd_lazy_switch_to_guest() or kvm_hyp_handle_fpsimd() + * prior to __guest_entry(). As __guest_entry() guarantees a context + * synchronization event, we don't need an ISB here to avoid taking + * traps for anything that was exposed to the guest. + */ + if (vcpu_has_sve(vcpu)) { + zcr_el1 = read_sysreg_el1(SYS_ZCR); + __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)) = zcr_el1; + + /* + * The guest's state is always saved using the guest's max VL. + * Ensure that the host has the guest's max VL active such that + * the host can save the guest's state lazily, but don't + * artificially restrict the host to the guest's max VL. + */ + if (has_vhe()) { + zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; + write_sysreg_el2(zcr_el2, SYS_ZCR); + } else { + zcr_el2 = sve_vq_from_vl(kvm_host_sve_max_vl) - 1; + write_sysreg_el2(zcr_el2, SYS_ZCR); + + zcr_el1 = vcpu_sve_max_vq(vcpu) - 1; + write_sysreg_el1(zcr_el1, SYS_ZCR); + } + } +} + static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) { /* diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 1a334a38d8fd2..2c37680d954cf 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -5,6 +5,7 @@ */ #include +#include #include #include @@ -224,8 +225,12 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) sync_hyp_vcpu(hyp_vcpu); } else { + struct kvm_vcpu *vcpu = kern_hyp_va(host_vcpu); + /* The host is fully trusted, run its vCPU directly. 
*/ - ret = __kvm_vcpu_run(kern_hyp_va(host_vcpu)); + fpsimd_lazy_switch_to_guest(vcpu); + ret = __kvm_vcpu_run(vcpu); + fpsimd_lazy_switch_to_host(vcpu); } out: cpu_reg(host_ctxt, 1) = ret; @@ -675,12 +680,6 @@ void handle_trap(struct kvm_cpu_context *host_ctxt) case ESR_ELx_EC_SMC64: handle_host_smc(host_ctxt); break; - case ESR_ELx_EC_SVE: - cpacr_clear_set(0, CPACR_EL1_ZEN); - isb(); - sve_cond_update_zcr_vq(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, - SYS_ZCR_EL2); - break; case ESR_ELx_EC_IABT_LOW: case ESR_ELx_EC_DABT_LOW: handle_host_mem_abort(host_ctxt); diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 69d7d3b4294a7..7d2ba6ef02618 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -73,12 +73,10 @@ static void __activate_cptr_traps(struct kvm_vcpu *vcpu) static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) { - struct kvm *kvm = kern_hyp_va(vcpu->kvm); - if (has_hvhe()) { u64 val = CPACR_EL1_FPEN; - if (!kvm_has_sve(kvm) || !guest_owns_fp_regs()) + if (cpus_have_final_cap(ARM64_SVE)) val |= CPACR_EL1_ZEN; if (cpus_have_final_cap(ARM64_SME)) val |= CPACR_EL1_SMEN; @@ -87,7 +85,7 @@ static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) } else { u64 val = CPTR_NVHE_EL2_RES1; - if (kvm_has_sve(kvm) && guest_owns_fp_regs()) + if (!cpus_have_final_cap(ARM64_SVE)) val |= CPTR_EL2_TZ; if (!cpus_have_final_cap(ARM64_SME)) val |= CPTR_EL2_TSM; diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index c854d84458892..647737d6e8d0b 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -579,6 +579,8 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) sysreg_save_host_state_vhe(host_ctxt); + fpsimd_lazy_switch_to_guest(vcpu); + /* * Note that ARM erratum 1165522 requires us to configure both stage 1 * and stage 2 translation for the guest context before we clear @@ -603,6 +605,8 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) __deactivate_traps(vcpu); + fpsimd_lazy_switch_to_host(vcpu); + sysreg_restore_host_state_vhe(host_ctxt); if (guest_owns_fp_regs()) -- GitLab From 332b7e6d62b7a3a988017f5184e547aa20e3a19a Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 13 Feb 2025 09:15:31 +0000 Subject: [PATCH 568/989] KVM: arm64: Simplify warning in kvm_arch_vcpu_load_fp() At the end of kvm_arch_vcpu_load_fp() we check that no bits are set in SVCR. We only check this for protected mode despite this mattering equally for non-protected mode, and the comment above this is confusing. Remove the comment and simplify the check, moving from WARN_ON() to WARN_ON_ONCE() to avoid spamming the log. Signed-off-by: Mark Rutland Signed-off-by: Marc Zyngier --- arch/arm64/kvm/fpsimd.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 3cbb999419af7..7f6e43d256915 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -65,12 +65,7 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) fpsimd_save_and_flush_cpu_state(); *host_data_ptr(fp_owner) = FP_STATE_FREE; - /* - * If normal guests gain SME support, maintain this behavior for pKVM - * guests, which don't support SME. 
- */ - WARN_ON(is_protected_kvm_enabled() && system_supports_sme() && - read_sysreg_s(SYS_SVCR)); + WARN_ON_ONCE(system_supports_sme() && read_sysreg_s(SYS_SVCR)); } /* -- GitLab From 65729da9ce37f5a2c62e2542ef03bc9ac6775a7d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 12 Feb 2025 17:34:54 +0000 Subject: [PATCH 569/989] KVM: arm64: Convert timer offset VA when accessed in HYP code Now that EL2 has gained some early timer emulation, it accesses the offsets pointed to by the timer structure, both of which live in the KVM structure. Of course, these are *kernel* pointers, so the dereferencing of these pointers in non-kernel code must be itself be offset. Given switch.h its own version of timer_get_offset() and use that instead. Fixes: b86fc215dc26d ("KVM: arm64: Handle counter access early in non-HYP context") Reported-by: Linux Kernel Functional Testing Reviewed-by: Oliver Upton Tested-by: Anders Roxell Link: https://lore.kernel.org/r/20250212173454.2864462-1-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/switch.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index f5e882a358e2d..23bbe28eaaf95 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -581,9 +581,22 @@ static inline bool handle_tx2_tvm(struct kvm_vcpu *vcpu) return true; } +/* Open-coded version of timer_get_offset() to allow for kern_hyp_va() */ +static inline u64 hyp_timer_get_offset(struct arch_timer_context *ctxt) +{ + u64 offset = 0; + + if (ctxt->offset.vm_offset) + offset += *kern_hyp_va(ctxt->offset.vm_offset); + if (ctxt->offset.vcpu_offset) + offset += *kern_hyp_va(ctxt->offset.vcpu_offset); + + return offset; +} + static inline u64 compute_counter_value(struct arch_timer_context *ctxt) { - return arch_timer_read_cntpct_el0() - timer_get_offset(ctxt); + return arch_timer_read_cntpct_el0() - hyp_timer_get_offset(ctxt); } static bool kvm_handle_cntxct(struct kvm_vcpu *vcpu) -- GitLab From b938731ed2d4eea8e268a27bfc600581fedae2a9 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 13 Feb 2025 15:36:14 +0000 Subject: [PATCH 570/989] KVM: arm64: Fix alignment of kvm_hyp_memcache allocations When allocating guest stage-2 page-table pages at EL2, pKVM can consume pages from the host-provided kvm_hyp_memcache. As pgtable.c expects zeroed pages, guest_s2_zalloc_page() actively implements this zeroing with a PAGE_SIZE memset. Unfortunately, we don't check the page alignment of the host-provided address before doing so, which could lead to the memset overrunning the page if the host was malicious. Fix this by simply force-aligning all kvm_hyp_memcache allocations to page boundaries. 
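As a quick numeric illustration of the masking (assuming 4 KiB pages, so PAGE_MASK == ~0xfffUL; the address below is made up):

        phys_addr_t head = 0x80001238;          /* hypothetical, misaligned, host-provided value */
        phys_addr_t page = head & PAGE_MASK;    /* 0x80001000 -- backed up to the page boundary */

With the head forced onto a page boundary, the PAGE_SIZE zeroing done by guest_s2_zalloc_page() can no longer run past the end of the page, whatever value the host supplied.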
Fixes: 60dfe093ec13 ("KVM: arm64: Instantiate guest stage-2 page-tables at EL2") Reported-by: Ben Simner Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20250213153615.3642515-1-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index c77acc9904576..3a7ec98ef1238 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -100,7 +100,7 @@ static inline void push_hyp_memcache(struct kvm_hyp_memcache *mc, static inline void *pop_hyp_memcache(struct kvm_hyp_memcache *mc, void *(*to_va)(phys_addr_t phys)) { - phys_addr_t *p = to_va(mc->head); + phys_addr_t *p = to_va(mc->head & PAGE_MASK); if (!mc->nr_pages) return NULL; -- GitLab From 540cda75884a6ba4c289980c84392261b1f61a9c Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Feb 2025 11:21:24 +0000 Subject: [PATCH 571/989] rxrpc: Fix ipv6 path MTU discovery rxrpc path MTU discovery currently only makes use of ICMPv4, but not ICMPv6, which means that pmtud for IPv6 doesn't work correctly. Fix it to check for ICMPv6 messages also. Fixes: eeaedc5449d9 ("rxrpc: Implement path-MTU probing using padded PING ACKs (RFC8899)") Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/3517283.1739359284@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- net/rxrpc/peer_event.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index e874c31fa9012..bc283da9ee402 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -169,6 +169,13 @@ void rxrpc_input_error(struct rxrpc_local *local, struct sk_buff *skb) goto out; } + if ((serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6 && + serr->ee.ee_type == ICMPV6_PKT_TOOBIG && + serr->ee.ee_code == 0)) { + rxrpc_adjust_mtu(peer, serr->ee.ee_info); + goto out; + } + rxrpc_store_error(peer, skb); out: rxrpc_put_peer(peer, rxrpc_peer_put_input_error); -- GitLab From 488fb6effe03e20f38d34da7425de77bbd3e2665 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Wed, 12 Feb 2025 16:17:51 +0100 Subject: [PATCH 572/989] net: pse-pd: Fix deadlock in current limit functions Fix a deadlock in pse_pi_get_current_limit and pse_pi_set_current_limit caused by consecutive mutex_lock calls. One in the function itself and another in pse_pi_get_voltage. Resolve the issue by using the unlocked version of pse_pi_get_voltage instead. 
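The failing pattern is a plain self-deadlock on a non-recursive mutex; sketched from the description above (call sites abbreviated, not a verbatim trace):

        pse_pi_set_current_limit()
          mutex_lock(&pcdev->lock)         <- first acquisition
          pse_pi_get_voltage()
            mutex_lock(&pcdev->lock)       <- same lock taken again, blocks forever

Calling _pse_pi_get_voltage() instead, which assumes the caller already holds pcdev->lock, removes the nested acquisition in both the get and set paths.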
Fixes: e0a5e2bba38a ("net: pse-pd: Use power limit at driver side instead of current limit") Signed-off-by: Kory Maincent Link: https://patch.msgid.link/20250212151751.1515008-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/pse-pd/pse_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/pse-pd/pse_core.c b/drivers/net/pse-pd/pse_core.c index 4f2a54afc4d09..4602e26eb8c86 100644 --- a/drivers/net/pse-pd/pse_core.c +++ b/drivers/net/pse-pd/pse_core.c @@ -319,7 +319,7 @@ static int pse_pi_get_current_limit(struct regulator_dev *rdev) goto out; mW = ret; - ret = pse_pi_get_voltage(rdev); + ret = _pse_pi_get_voltage(rdev); if (!ret) { dev_err(pcdev->dev, "Voltage null\n"); ret = -ERANGE; @@ -356,7 +356,7 @@ static int pse_pi_set_current_limit(struct regulator_dev *rdev, int min_uA, id = rdev_get_id(rdev); mutex_lock(&pcdev->lock); - ret = pse_pi_get_voltage(rdev); + ret = _pse_pi_get_voltage(rdev); if (!ret) { dev_err(pcdev->dev, "Voltage null\n"); ret = -ERANGE; -- GitLab From e6e3e0022ef8f1d584ee4d5b89dca02472c5eb1f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 12 Feb 2025 18:25:57 +0000 Subject: [PATCH 573/989] KVM: arm64: timer: Drop warning on failed interrupt signalling We currently spit out a warning if making a timer interrupt pending fails. But not only this is loud and easy to trigger from userspace, we also fail to do anything useful with that information. Dropping the warning is the easiest thing to do for now. We can always add error reporting if we really want in the future. Reported-by: Alexander Potapenko Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250212182558.2865232-2-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 231c0cd9c7b4b..70802e4c91cf5 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -447,21 +447,19 @@ static void kvm_timer_update_status(struct arch_timer_context *ctx, bool level) static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, struct arch_timer_context *timer_ctx) { - int ret; - kvm_timer_update_status(timer_ctx, new_level); timer_ctx->irq.level = new_level; trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_irq(timer_ctx), timer_ctx->irq.level); - if (!userspace_irqchip(vcpu->kvm)) { - ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu, - timer_irq(timer_ctx), - timer_ctx->irq.level, - timer_ctx); - WARN_ON(ret); - } + if (userspace_irqchip(vcpu->kvm)) + return; + + kvm_vgic_inject_irq(vcpu->kvm, vcpu, + timer_irq(timer_ctx), + timer_ctx->irq.level, + timer_ctx); } /* Only called for a fully emulated timer */ -- GitLab From b3aa9283c0c505b5cfd25f7d6cfd720de2adc807 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 12 Feb 2025 18:25:58 +0000 Subject: [PATCH 574/989] KVM: arm64: vgic: Hoist SGI/PPI alloc from vgic_init() to kvm_create_vgic() If userspace creates vcpus, then a vgic, we end-up in a situation where irqchip_in_kernel() will return true, but no private interrupt has been allocated for these vcpus. This situation will continue until userspace initialises the vgic, at which point we fix the early vcpus. Should a vcpu run or be initialised in the interval, bad things may happen. An obvious solution is to move this fix-up phase to the point where the vgic is created. 
This ensures that from that point onwards, all vcpus have their private interrupts, as new vcpus will directly allocate them. With that, we have the invariant that when irqchip_in_kernel() is true, all vcpus have their private interrupts. Reported-by: Alexander Potapenko Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250212182558.2865232-3-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-init.c | 74 ++++++++++++++++----------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index bc7e22ab5d812..775461cf2d2db 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -34,9 +34,9 @@ * * CPU Interface: * - * - kvm_vgic_vcpu_init(): initialization of static data that - * doesn't depend on any sizing information or emulation type. No - * allocation is allowed there. + * - kvm_vgic_vcpu_init(): initialization of static data that doesn't depend + * on any sizing information. Private interrupts are allocated if not + * already allocated at vgic-creation time. */ /* EARLY INIT */ @@ -58,6 +58,8 @@ void kvm_vgic_early_init(struct kvm *kvm) /* CREATION */ +static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type); + /** * kvm_vgic_create: triggered by the instantiation of the VGIC device by * user space, either through the legacy KVM_CREATE_IRQCHIP ioctl (v2 only) @@ -112,6 +114,22 @@ int kvm_vgic_create(struct kvm *kvm, u32 type) goto out_unlock; } + kvm_for_each_vcpu(i, vcpu, kvm) { + ret = vgic_allocate_private_irqs_locked(vcpu, type); + if (ret) + break; + } + + if (ret) { + kvm_for_each_vcpu(i, vcpu, kvm) { + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + kfree(vgic_cpu->private_irqs); + vgic_cpu->private_irqs = NULL; + } + + goto out_unlock; + } + kvm->arch.vgic.in_kernel = true; kvm->arch.vgic.vgic_model = type; @@ -180,7 +198,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) return 0; } -static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu) +static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; int i; @@ -218,17 +236,28 @@ static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu) /* PPIs */ irq->config = VGIC_CONFIG_LEVEL; } + + switch (type) { + case KVM_DEV_TYPE_ARM_VGIC_V3: + irq->group = 1; + irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu); + break; + case KVM_DEV_TYPE_ARM_VGIC_V2: + irq->group = 0; + irq->targets = BIT(vcpu->vcpu_id); + break; + } } return 0; } -static int vgic_allocate_private_irqs(struct kvm_vcpu *vcpu) +static int vgic_allocate_private_irqs(struct kvm_vcpu *vcpu, u32 type) { int ret; mutex_lock(&vcpu->kvm->arch.config_lock); - ret = vgic_allocate_private_irqs_locked(vcpu); + ret = vgic_allocate_private_irqs_locked(vcpu, type); mutex_unlock(&vcpu->kvm->arch.config_lock); return ret; @@ -258,7 +287,7 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) if (!irqchip_in_kernel(vcpu->kvm)) return 0; - ret = vgic_allocate_private_irqs(vcpu); + ret = vgic_allocate_private_irqs(vcpu, dist->vgic_model); if (ret) return ret; @@ -295,7 +324,7 @@ int vgic_init(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; struct kvm_vcpu *vcpu; - int ret = 0, i; + int ret = 0; unsigned long idx; lockdep_assert_held(&kvm->arch.config_lock); @@ -315,35 +344,6 @@ int vgic_init(struct kvm *kvm) if (ret) goto out; - /* Initialize groups on CPUs created before the VGIC type was known */ - 
kvm_for_each_vcpu(idx, vcpu, kvm) { - ret = vgic_allocate_private_irqs_locked(vcpu); - if (ret) - goto out; - - for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { - struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, i); - - switch (dist->vgic_model) { - case KVM_DEV_TYPE_ARM_VGIC_V3: - irq->group = 1; - irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu); - break; - case KVM_DEV_TYPE_ARM_VGIC_V2: - irq->group = 0; - irq->targets = 1U << idx; - break; - default: - ret = -EINVAL; - } - - vgic_put_irq(kvm, irq); - - if (ret) - goto out; - } - } - /* * If we have GICv4.1 enabled, unconditionally request enable the * v4 support so that we get HW-accelerated vSGIs. Otherwise, only -- GitLab From 320702a76186222426e5dc8efb9d68ba9d4ed0ab Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 13 Feb 2025 17:29:51 +0100 Subject: [PATCH 575/989] MAINTAINERS: delete entry for AXXIA I2C The maintainer's email address bounced and he wasn't active for 4 years. Delete this entry and fall back to the generic I2C host drivers entry. Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20250213162950.45596-2-wsa+renesas@sang-engineering.com Signed-off-by: Andi Shyti --- MAINTAINERS | 7 ------- 1 file changed, 7 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 25c86f47353de..fc332fbf3958c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3859,13 +3859,6 @@ W: https://ez.analog.com/linux-software-drivers F: Documentation/devicetree/bindings/pwm/adi,axi-pwmgen.yaml F: drivers/pwm/pwm-axi-pwmgen.c -AXXIA I2C CONTROLLER -M: Krzysztof Adamski -L: linux-i2c@vger.kernel.org -S: Maintained -F: Documentation/devicetree/bindings/i2c/i2c-axxia.txt -F: drivers/i2c/busses/i2c-axxia.c - AZ6007 DVB DRIVER M: Mauro Carvalho Chehab L: linux-media@vger.kernel.org -- GitLab From 7422c319fd805b956aab5ba93e0274517a8e3650 Mon Sep 17 00:00:00 2001 From: Mukesh Kumar Savaliya Date: Thu, 23 Jan 2025 14:11:47 +0530 Subject: [PATCH 576/989] MAINTAINERS: Add maintainer for Qualcomm's I2C GENI driver Add a new entry for the I2C QCOM GENI driver to the MAINTAINERS file. This entry includes the maintainer's name and contact information, ensuring proper maintainership and communication for the i2c-qcom-geni driver file. Signed-off-by: Mukesh Kumar Savaliya Link: https://lore.kernel.org/r/20250123084147.3632023-1-quic_msavaliy@quicinc.com Signed-off-by: Andi Shyti --- MAINTAINERS | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index fc332fbf3958c..db7e533c466c0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19503,6 +19503,15 @@ L: dmaengine@vger.kernel.org S: Supported F: drivers/dma/qcom/hidma* +QUALCOMM I2C QCOM GENI DRIVER +M: Mukesh Kumar Savaliya +M: Viken Dadhaniya +L: linux-i2c@vger.kernel.org +L: linux-arm-msm@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/i2c/qcom,i2c-geni-qcom.yaml +F: drivers/i2c/busses/i2c-qcom-geni.c + QUALCOMM I2C CCI DRIVER M: Loic Poulain M: Robert Foss -- GitLab From ef75966abf950c0539534effa4960caa29fb7167 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 27 Jan 2025 09:44:11 +0000 Subject: [PATCH 577/989] iommu/amd: Expicitly enable CNTRL.EPHEn bit in resume path With recent kernel, AMDGPU failed to resume after suspend on certain laptop. 
Sample log: ----------- Nov 14 11:52:19 Thinkbook kernel: iommu ivhd0: AMD-Vi: Event logged [ILLEGAL_DEV_TABLE_ENTRY device=0000:06:00.0 pasid=0x00000 address=0x135300000 flags=0x0080] Nov 14 11:52:19 Thinkbook kernel: AMD-Vi: DTE[0]: 7d90000000000003 Nov 14 11:52:19 Thinkbook kernel: AMD-Vi: DTE[1]: 0000100103fc0009 Nov 14 11:52:19 Thinkbook kernel: AMD-Vi: DTE[2]: 2000000117840013 Nov 14 11:52:19 Thinkbook kernel: AMD-Vi: DTE[3]: 0000000000000000 This is because CNTRL[EPHEn] is not set in the resume path. Fix this by setting CNTRL[EPHEn] to 1 in the resume path if EFR[EPHSUP] is set. Note: a better approach may be to save the control register in the suspend path and restore it in the resume path instead of setting individual bits. We will have a separate patch for that. Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219499 Fixes: c4cb23111103 ("iommu/amd: Add support for enable/disable IOPF") Tested-by: Hamish McIntyre-Bhatty Signed-off-by: Vasant Hegde Link: https://lore.kernel.org/r/20250127094411.5931-1-vasant.hegde@amd.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu_types.h | 1 + drivers/iommu/amd/init.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 0bbda60d3cdc7..23caea22f8dcd 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -175,6 +175,7 @@ #define CONTROL_GAM_EN 25 #define CONTROL_GALOG_EN 28 #define CONTROL_GAINT_EN 29 +#define CONTROL_EPH_EN 45 #define CONTROL_XT_EN 50 #define CONTROL_INTCAPXT_EN 51 #define CONTROL_IRTCACHEDIS 59 diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index c5cd92edada06..438848b0682fe 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -2653,6 +2653,10 @@ static void iommu_init_flags(struct amd_iommu *iommu) /* Set IOTLB invalidation timeout to 1s */ iommu_set_inv_tlb_timeout(iommu, CTRL_INV_TO_1S); + + /* Enable Enhanced Peripheral Page Request Handling */ + if (check_feature(FEATURE_EPHSUP)) + iommu_feature_enable(iommu, CONTROL_EPH_EN); } static void iommu_apply_resume_quirks(struct amd_iommu *iommu) -- GitLab From 78be7f04537fa35f6cc694879e9a475ca1984936 Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Tue, 28 Jan 2025 19:05:21 +0000 Subject: [PATCH 578/989] iommu: Fix a spelling error Fix spelling error IDENITY -> IDENTITY in drivers/iommu/iommu.c. Signed-off-by: Easwar Hariharan Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20250128190522.70800-1-eahariha@linux.microsoft.com [ joro: Add commit message ] Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 870c3cdbd0f62..60aed01e54f27 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1756,7 +1756,7 @@ static int iommu_get_def_domain_type(struct iommu_group *group, group->id); /* - * Try to recover, drivers are allowed to force IDENITY or DMA, IDENTITY + * Try to recover, drivers are allowed to force IDENTITY or DMA, IDENTITY * takes precedence. */ if (type == IOMMU_DOMAIN_IDENTITY) -- GitLab From 4a8991fe9cd0b6a509bab3d056700d3520601d86 Mon Sep 17 00:00:00 2001 From: Andrew Kreimer Date: Mon, 10 Feb 2025 13:20:04 +0200 Subject: [PATCH 579/989] iommu/exynos: Fix typos There are some typos in comments/messages: - modyfying -> modifying - Unabled -> Unable Fix them via codespell.
Signed-off-by: Andrew Kreimer Link: https://lore.kernel.org/r/20250210112027.29791-1-algonell@gmail.com Signed-off-by: Joerg Roedel --- drivers/iommu/exynos-iommu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index c666ecab955d2..69e23e017d9e5 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -249,7 +249,7 @@ struct exynos_iommu_domain { struct list_head clients; /* list of sysmmu_drvdata.domain_node */ sysmmu_pte_t *pgtable; /* lv1 page table, 16KB */ short *lv2entcnt; /* free lv2 entry counter for each section */ - spinlock_t lock; /* lock for modyfying list of clients */ + spinlock_t lock; /* lock for modifying list of clients */ spinlock_t pgtablelock; /* lock for modifying page table @ pgtable */ struct iommu_domain domain; /* generic domain data structure */ }; @@ -292,7 +292,7 @@ struct sysmmu_drvdata { struct clk *aclk; /* SYSMMU's aclk clock */ struct clk *pclk; /* SYSMMU's pclk clock */ struct clk *clk_master; /* master's device clock */ - spinlock_t lock; /* lock for modyfying state */ + spinlock_t lock; /* lock for modifying state */ bool active; /* current status */ struct exynos_iommu_domain *domain; /* domain we belong to */ struct list_head domain_node; /* node for domain clients list */ @@ -746,7 +746,7 @@ static int exynos_sysmmu_probe(struct platform_device *pdev) ret = devm_request_irq(dev, irq, exynos_sysmmu_irq, 0, dev_name(dev), data); if (ret) { - dev_err(dev, "Unabled to register handler of irq %d\n", irq); + dev_err(dev, "Unable to register handler of irq %d\n", irq); return ret; } -- GitLab From add43c4fbc92f8b48c1acd64e953af3b1be4cd9c Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 11 Feb 2025 08:55:12 +0800 Subject: [PATCH 580/989] iommu/vt-d: Make intel_iommu_drain_pasid_prq() cover faults for RID This driver supports page faults on PCI RID since commit <9f831c16c69e> ("iommu/vt-d: Remove the pasid present check in prq_event_thread") by allowing the reporting of page faults with the pasid_present field cleared to the upper layer for further handling. The fundamental assumption here is that the detach or replace operations act as a fence for page faults. This implies that all pending page faults associated with a specific RID or PASID are flushed when a domain is detached or replaced from a device RID or PASID. However, the intel_iommu_drain_pasid_prq() helper does not correctly handle faults for RID. This leads to faults potentially remaining pending in the iommu hardware queue even after the domain is detached, thereby violating the aforementioned assumption. Fix this issue by extending intel_iommu_drain_pasid_prq() to cover faults for RID. 
Fixes: 9f831c16c69e ("iommu/vt-d: Remove the pasid present check in prq_event_thread") Cc: stable@vger.kernel.org Suggested-by: Kevin Tian Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20250121023150.815972-1-baolu.lu@linux.intel.com Reviewed-by: Yi Liu Link: https://lore.kernel.org/r/20250211005512.985563-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/prq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/prq.c b/drivers/iommu/intel/prq.c index c2d792db52c3e..064194399b38b 100644 --- a/drivers/iommu/intel/prq.c +++ b/drivers/iommu/intel/prq.c @@ -87,7 +87,9 @@ void intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid) struct page_req_dsc *req; req = &iommu->prq[head / sizeof(*req)]; - if (!req->pasid_present || req->pasid != pasid) { + if (req->rid != sid || + (req->pasid_present && pasid != req->pasid) || + (!req->pasid_present && pasid != IOMMU_NO_PASID)) { head = (head + sizeof(*req)) & PRQ_RING_MASK; continue; } -- GitLab From e71f7f42e3c874ac3314b8f250e8416a706165af Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Sun, 2 Feb 2025 20:49:35 +0800 Subject: [PATCH 581/989] USB: pci-quirks: Fix HCCPARAMS register error for LS7A EHCI LS7A EHCI controller doesn't have extended capabilities, so the EECP (EHCI Extended Capabilities Pointer) field of HCCPARAMS register should be 0x0, but it reads as 0xa0 now. This is a hardware flaw and will be fixed in future, now just clear the EECP field to avoid error messages on boot: ...... [ 0.581675] pci 0000:00:04.1: EHCI: unrecognized capability ff [ 0.581699] pci 0000:00:04.1: EHCI: unrecognized capability ff [ 0.581716] pci 0000:00:04.1: EHCI: unrecognized capability ff [ 0.581851] pci 0000:00:04.1: EHCI: unrecognized capability ff ...... [ 0.581916] pci 0000:00:05.1: EHCI: unrecognized capability ff [ 0.581951] pci 0000:00:05.1: EHCI: unrecognized capability ff [ 0.582704] pci 0000:00:05.1: EHCI: unrecognized capability ff [ 0.582799] pci 0000:00:05.1: EHCI: unrecognized capability ff ...... Cc: stable Signed-off-by: Baoqi Zhang Signed-off-by: Huacai Chen Link: https://lore.kernel.org/r/20250202124935.480500-1-chenhuacai@loongson.cn Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/pci-quirks.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/usb/host/pci-quirks.c b/drivers/usb/host/pci-quirks.c index 1f9c1b1435d86..0404489c2f6a9 100644 --- a/drivers/usb/host/pci-quirks.c +++ b/drivers/usb/host/pci-quirks.c @@ -958,6 +958,15 @@ static void quirk_usb_disable_ehci(struct pci_dev *pdev) * booting from USB disk or using a usb keyboard */ hcc_params = readl(base + EHCI_HCC_PARAMS); + + /* LS7A EHCI controller doesn't have extended capabilities, the + * EECP (EHCI Extended Capabilities Pointer) field of HCCPARAMS + * register should be 0x0 but it reads as 0xa0. So clear it to + * avoid error messages on boot. + */ + if (pdev->vendor == PCI_VENDOR_ID_LOONGSON && pdev->device == 0x7a14) + hcc_params &= ~(0xffL << 8); + offset = (hcc_params >> 8) & 0xff; while (offset && --count) { pci_read_config_dword(pdev, offset, &cap); -- GitLab From c81d9fcd5b9402166048f377d4e5e0ee6f9ef26d Mon Sep 17 00:00:00 2001 From: Michal Pecio Date: Tue, 28 Jan 2025 10:45:29 +0100 Subject: [PATCH 582/989] usb: xhci: Restore xhci_pci support for Renesas HCs Some Renesas HCs require firmware upload to work, this is handled by the xhci_pci_renesas driver. 
Other variants of those chips load firmware from a SPI flash and are ready to work with xhci_pci alone. A refactor merged in v6.12 broke the latter configuration so that users are finding their hardware ignored by the normal driver and are forced to enable the firmware loader which isn't really necessary on their systems. Let xhci_pci work with those chips as before when the firmware loader is disabled by kernel configuration. Fixes: 25f51b76f90f ("xhci-pci: Make xhci-pci-renesas a proper modular driver") Cc: stable Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219616 Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219726 Signed-off-by: Michal Pecio Tested-by: Nicolai Buchwitz Link: https://lore.kernel.org/r/20250128104529.58a79bfc@foxbook Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-pci.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index 2d1e205c14c60..ad0ff356f6fa0 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -653,8 +653,8 @@ int xhci_pci_common_probe(struct pci_dev *dev, const struct pci_device_id *id) } EXPORT_SYMBOL_NS_GPL(xhci_pci_common_probe, "xhci"); -static const struct pci_device_id pci_ids_reject[] = { - /* handled by xhci-pci-renesas */ +/* handled by xhci-pci-renesas if enabled */ +static const struct pci_device_id pci_ids_renesas[] = { { PCI_DEVICE(PCI_VENDOR_ID_RENESAS, 0x0014) }, { PCI_DEVICE(PCI_VENDOR_ID_RENESAS, 0x0015) }, { /* end: all zeroes */ } @@ -662,7 +662,8 @@ static const struct pci_device_id pci_ids_reject[] = { static int xhci_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) { - if (pci_match_id(pci_ids_reject, dev)) + if (IS_ENABLED(CONFIG_USB_XHCI_PCI_RENESAS) && + pci_match_id(pci_ids_renesas, dev)) return -ENODEV; return xhci_pci_common_probe(dev, id); -- GitLab From e563b01208f4d1f609bcab13333b6c0e24ce6a01 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 12 Feb 2025 19:15:15 +0100 Subject: [PATCH 583/989] usb: cdc-acm: Check control transfer buffer size before access If the first fragment is shorter than struct usb_cdc_notification, we can't calculate an expected_size. Log an error and discard the notification instead of reading lengths from memory outside the received data, which can lead to memory corruption when the expected_size decreases between fragments, causing `expected_size - acm->nb_index` to wrap. This issue has been present since the beginning of git history; however, it only leads to memory corruption since commit ea2583529cd1 ("cdc-acm: reassemble fragmented notifications"). A mitigating factor is that acm_ctrl_irq() can only execute after userspace has opened /dev/ttyACM*; but if ModemManager is running, ModemManager will do that automatically depending on the USB device's vendor/product IDs and its other interfaces. 
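For illustration, the wrap itself is ordinary unsigned arithmetic; a minimal, self-contained sketch with made-up values (not taken from the driver) shows how a shrinking expected_size turns the remaining-bytes calculation into a huge copy length:

	#include <stdio.h>

	int main(void)
	{
		unsigned int expected_size = 8;	/* recomputed from a later, smaller fragment */
		unsigned int nb_index = 16;	/* bytes already reassembled */
		unsigned int remaining = expected_size - nb_index;

		/* prints 4294967288 rather than a negative number */
		printf("remaining = %u\n", remaining);
		return 0;
	}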
Cc: stable Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Jann Horn Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-acm.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index 6b37d1c47fce1..39c7db7bcd216 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -371,7 +371,7 @@ static void acm_process_notification(struct acm *acm, unsigned char *buf) static void acm_ctrl_irq(struct urb *urb) { struct acm *acm = urb->context; - struct usb_cdc_notification *dr = urb->transfer_buffer; + struct usb_cdc_notification *dr; unsigned int current_size = urb->actual_length; unsigned int expected_size, copy_size, alloc_size; int retval; @@ -398,9 +398,20 @@ static void acm_ctrl_irq(struct urb *urb) usb_mark_last_busy(acm->dev); - if (acm->nb_index) + if (acm->nb_index == 0) { + /* + * The first chunk of a message must contain at least the + * notification header with the length field, otherwise we + * can't get an expected_size. + */ + if (current_size < sizeof(struct usb_cdc_notification)) { + dev_dbg(&acm->control->dev, "urb too short\n"); + goto exit; + } + dr = urb->transfer_buffer; + } else { dr = (struct usb_cdc_notification *)acm->notification_buffer; - + } /* size = notification-header + (optional) data */ expected_size = sizeof(struct usb_cdc_notification) + le16_to_cpu(dr->wLength); -- GitLab From 12e712964f41d05ae034989892de445781c46730 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 12 Feb 2025 19:15:16 +0100 Subject: [PATCH 584/989] usb: cdc-acm: Fix handling of oversized fragments If we receive an initial fragment of size 8 bytes which specifies a wLength of 1 byte (so the reassembled message is supposed to be 9 bytes long), and we then receive a second fragment of size 9 bytes (which is not supposed to happen), we currently wrongly bypass the fragment reassembly code but still pass the pointer to the acm->notification_buffer to acm_process_notification(). Make this less wrong by always going through fragment reassembly when we expect more fragments. Before this patch, receiving an overlong fragment could lead to `newctrl` in acm_process_notification() being uninitialized data (instead of data coming from the device). Cc: stable Fixes: ea2583529cd1 ("cdc-acm: reassemble fragmented notifications") Signed-off-by: Jann Horn Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-acm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index 39c7db7bcd216..c70f349936238 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -416,7 +416,7 @@ static void acm_ctrl_irq(struct urb *urb) expected_size = sizeof(struct usb_cdc_notification) + le16_to_cpu(dr->wLength); - if (current_size < expected_size) { + if (acm->nb_index != 0 || current_size < expected_size) { /* notification is transmitted fragmented, reassemble */ if (acm->nb_size < expected_size) { u8 *new_buffer; -- GitLab From 7284922f3e4fa285dff1b8bb593aa9a0b8458f30 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sun, 9 Feb 2025 15:56:11 +0100 Subject: [PATCH 585/989] USB: cdc-acm: Fill in Renesas R-Car D3 USB Download mode quirk Add Renesas R-Car D3 USB Download mode quirk and update comments on all the other Renesas R-Car USB Download mode quirks to discern them from each other. This follows R-Car Series, 3rd Generation reference manual Rev.2.00 chapter 19.2.8 USB download mode . 
Fixes: 6d853c9e4104 ("usb: cdc-acm: Add DISABLE_ECHO for Renesas USB Download mode") Cc: stable Signed-off-by: Marek Vasut Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20250209145708.106914-1-marek.vasut+renesas@mailbox.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-acm.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index c70f349936238..c2ecfa3c83496 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -1738,13 +1738,16 @@ static const struct usb_device_id acm_ids[] = { { USB_DEVICE(0x0870, 0x0001), /* Metricom GS Modem */ .driver_info = NO_UNION_NORMAL, /* has no union descriptor */ }, - { USB_DEVICE(0x045b, 0x023c), /* Renesas USB Download mode */ + { USB_DEVICE(0x045b, 0x023c), /* Renesas R-Car H3 USB Download mode */ .driver_info = DISABLE_ECHO, /* Don't echo banner */ }, - { USB_DEVICE(0x045b, 0x0248), /* Renesas USB Download mode */ + { USB_DEVICE(0x045b, 0x0247), /* Renesas R-Car D3 USB Download mode */ .driver_info = DISABLE_ECHO, /* Don't echo banner */ }, - { USB_DEVICE(0x045b, 0x024D), /* Renesas USB Download mode */ + { USB_DEVICE(0x045b, 0x0248), /* Renesas R-Car M3-N USB Download mode */ + .driver_info = DISABLE_ECHO, /* Don't echo banner */ + }, + { USB_DEVICE(0x045b, 0x024D), /* Renesas R-Car E3 USB Download mode */ .driver_info = DISABLE_ECHO, /* Don't echo banner */ }, { USB_DEVICE(0x0e8d, 0x0003), /* FIREFLY, MediaTek Inc; andrey.arapov@gmail.com */ -- GitLab From 159daf1258227f44b26b5d38f4aa8f37b8cca663 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Thu, 6 Feb 2025 17:18:36 +0200 Subject: [PATCH 586/989] USB: Add USB_QUIRK_NO_LPM quirk for sony xperia xz1 smartphone The fastboot tool for communicating with Android bootloaders does not work reliably with this device if USB 2 Link Power Management (LPM) is enabled. Various fastboot commands are affected, including the following, which usually reproduces the problem within two tries: fastboot getvar kernel getvar:kernel FAILED (remote: 'GetVar Variable Not found') This issue was hidden on many systems up until commit 63a1f8454962 ("xhci: stored cached port capability values in one place") as the xhci driver failed to detect USB 2 LPM support if USB 3 ports were listed before USB 2 ports in the "supported protocol capabilities". Adding the quirk resolves the issue. No drawbacks are expected since the device uses different USB product IDs outside of fastboot mode, and since fastboot commands worked before, until LPM was enabled on the tested system by the aforementioned commit. Based on a patch from Forest from which most of the code and commit message is taken. 
Cc: stable Reported-by: Forest Closes: https://lore.kernel.org/hk8umj9lv4l4qguftdq1luqtdrpa1gks5l@sonic.net Tested-by: Forest Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20250206151836.51742-1-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/quirks.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 67732c791c933..59ed9768dae1f 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -435,6 +435,9 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x0c45, 0x7056), .driver_info = USB_QUIRK_IGNORE_REMOTE_WAKEUP }, + /* Sony Xperia XZ1 Compact (lilac) smartphone in fastboot mode */ + { USB_DEVICE(0x0fce, 0x0dde), .driver_info = USB_QUIRK_NO_LPM }, + /* Action Semiconductor flash disk */ { USB_DEVICE(0x10d6, 0x2200), .driver_info = USB_QUIRK_STRING_FETCH_255 }, -- GitLab From 89cb121e94612cd3bb3c74b0e772ead5b40b7a5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Tue, 11 Feb 2025 14:25:28 +0100 Subject: [PATCH 587/989] selftests/landlock: Enable the new CONFIG_AF_UNIX_OOB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 5155cbcdbf03 ("af_unix: Add a prompt to CONFIG_AF_UNIX_OOB"), the Landlock selftests's configuration is not enough to build a minimal kernel. Because scoped_signal_test checks with the MSG_OOB flag, we need to enable CONFIG_AF_UNIX_OOB for tests: # RUN fown.no_sandbox.sigurg_socket ... # scoped_signal_test.c:420:sigurg_socket:Expected 1 (1) == send(client_socket, ".", 1, MSG_OOB) (-1) # sigurg_socket: Test terminated by assertion # FAIL fown.no_sandbox.sigurg_socket ... Cc: Günther Noack Acked-by: Florent Revest Link: https://lore.kernel.org/r/20250211132531.1625566-1-mic@digikod.net Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/config | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/landlock/config b/tools/testing/selftests/landlock/config index 29af19c4e9f98..361f94f8cb0d4 100644 --- a/tools/testing/selftests/landlock/config +++ b/tools/testing/selftests/landlock/config @@ -1,3 +1,4 @@ +CONFIG_AF_UNIX_OOB=y CONFIG_CGROUPS=y CONFIG_CGROUP_SCHED=y CONFIG_INET=y -- GitLab From 143c9aae043a1dc174a75be52521192a0caa224b Mon Sep 17 00:00:00 2001 From: Tanya Agarwal Date: Fri, 24 Jan 2025 01:12:10 +0530 Subject: [PATCH 588/989] landlock: Fix grammar error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix grammar error in comments that were identified using the codespell tool. Signed-off-by: Tanya Agarwal Reviewed-by: Günther Noack Link: https://lore.kernel.org/r/20250123194208.2660-1-tanyaagarwal25699@gmail.com [mic: Simplify commit message] Signed-off-by: Mickaël Salaün --- security/landlock/ruleset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/landlock/ruleset.c b/security/landlock/ruleset.c index 241ce44375b6a..bff4e40a3093c 100644 --- a/security/landlock/ruleset.c +++ b/security/landlock/ruleset.c @@ -124,7 +124,7 @@ create_rule(const struct landlock_id id, return ERR_PTR(-ENOMEM); RB_CLEAR_NODE(&new_rule->node); if (is_object_pointer(id.type)) { - /* This should be catched by insert_rule(). */ + /* This should have been caught by insert_rule(). 
*/ WARN_ON_ONCE(!id.key.object); landlock_get_object(id.key.object); } -- GitLab From 192b7ff29b1fb0335a9b9107991e6286f462f361 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnther=20Noack?= Date: Fri, 24 Jan 2025 15:44:44 +0000 Subject: [PATCH 589/989] landlock: Minor typo and grammar fixes in IPC scoping documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix some whitespace, punctuation and minor grammar. * Add a missing sentence about the minimum ABI version, to stay in line with the section next to it. Cc: Tahera Fahimi Cc: Tanya Agarwal Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20250124154445.162841-1-gnoack@google.com [mic: Add newlines, update doc date] Signed-off-by: Mickaël Salaün --- Documentation/userspace-api/landlock.rst | 6 +++--- include/uapi/linux/landlock.h | 8 ++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Documentation/userspace-api/landlock.rst b/Documentation/userspace-api/landlock.rst index d639c61cb472a..ad587f53fe417 100644 --- a/Documentation/userspace-api/landlock.rst +++ b/Documentation/userspace-api/landlock.rst @@ -8,7 +8,7 @@ Landlock: unprivileged access control ===================================== :Author: Mickaël Salaün -:Date: October 2024 +:Date: January 2025 The goal of Landlock is to enable restriction of ambient rights (e.g. global filesystem or network access) for a set of processes. Because Landlock @@ -329,11 +329,11 @@ non-sandboxed process, we can specify this restriction with A sandboxed process can connect to a non-sandboxed process when its domain is not scoped. If a process's domain is scoped, it can only connect to sockets created by processes in the same scope. -Moreover, If a process is scoped to send signal to a non-scoped process, it can +Moreover, if a process is scoped to send signal to a non-scoped process, it can only send signals to processes in the same scope. A connected datagram socket behaves like a stream socket when its domain is -scoped, meaning if the domain is scoped after the socket is connected , it can +scoped, meaning if the domain is scoped after the socket is connected, it can still :manpage:`send(2)` data just like a stream socket. However, in the same scenario, a non-connected datagram socket cannot send data (with :manpage:`sendto(2)`) outside its scope. diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h index 33745642f7875..e1d2c27533b49 100644 --- a/include/uapi/linux/landlock.h +++ b/include/uapi/linux/landlock.h @@ -268,7 +268,9 @@ struct landlock_net_port_attr { * ~~~~~~~~~~~~~~~~ * * These flags enable to restrict a sandboxed process to a set of network - * actions. This is supported since the Landlock ABI version 4. + * actions. + * + * This is supported since Landlock ABI version 4. * * The following access rights apply to TCP port numbers: * @@ -291,11 +293,13 @@ struct landlock_net_port_attr { * Setting a flag for a ruleset will isolate the Landlock domain to forbid * connections to resources outside the domain. * + * This is supported since Landlock ABI version 6. + * * Scopes: * * - %LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET: Restrict a sandboxed process from * connecting to an abstract UNIX socket created by a process outside the - * related Landlock domain (e.g. a parent domain or a non-sandboxed process). + * related Landlock domain (e.g., a parent domain or a non-sandboxed process). 
* - %LANDLOCK_SCOPE_SIGNAL: Restrict a sandboxed process from sending a signal * to another process outside the domain. */ -- GitLab From 854277e2cc8c75dc3c216c82e72523258fcf65b9 Mon Sep 17 00:00:00 2001 From: Mikhail Ivanov Date: Wed, 5 Feb 2025 17:36:49 +0800 Subject: [PATCH 590/989] landlock: Fix non-TCP sockets restriction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use sk_is_tcp() to check if socket is TCP in bind(2) and connect(2) hooks. SMC, MPTCP, SCTP protocols are currently restricted by TCP access rights. The purpose of TCP access rights is to provide control over ports that can be used by userland to establish a TCP connection. Therefore, it is incorrect to deny bind(2) and connect(2) requests for a socket of another protocol. However, SMC, MPTCP and RDS implementations use TCP internal sockets to establish communication or even to exchange packets over a TCP connection [1]. Landlock rules that configure bind(2) and connect(2) usage for TCP sockets should not cover requests for sockets of such protocols. These protocols have different set of security issues and security properties, therefore, it is necessary to provide the userland with the ability to distinguish between them (eg. [2]). Control over TCP connection used by other protocols can be achieved with upcoming support of socket creation control [3]. [1] https://lore.kernel.org/all/62336067-18c2-3493-d0ec-6dd6a6d3a1b5@huawei-partners.com/ [2] https://lore.kernel.org/all/20241204.fahVio7eicim@digikod.net/ [3] https://lore.kernel.org/all/20240904104824.1844082-1-ivanov.mikhail1@huawei-partners.com/ Closes: https://github.com/landlock-lsm/linux/issues/40 Fixes: fff69fb03dde ("landlock: Support network rules with TCP bind and connect") Signed-off-by: Mikhail Ivanov Link: https://lore.kernel.org/r/20250205093651.1424339-2-ivanov.mikhail1@huawei-partners.com [mic: Format commit message to 72 columns] Signed-off-by: Mickaël Salaün --- security/landlock/net.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/security/landlock/net.c b/security/landlock/net.c index d5dcc4407a197..104b6c01fe503 100644 --- a/security/landlock/net.c +++ b/security/landlock/net.c @@ -63,8 +63,7 @@ static int current_check_access_socket(struct socket *const sock, if (WARN_ON_ONCE(dom->num_layers < 1)) return -EACCES; - /* Checks if it's a (potential) TCP socket. */ - if (sock->type != SOCK_STREAM) + if (!sk_is_tcp(sock->sk)) return 0; /* Checks for minimal header length to safely read sa_family. */ -- GitLab From f5534d511bcd273720f168386de74af76e148a9b Mon Sep 17 00:00:00 2001 From: Mikhail Ivanov Date: Wed, 5 Feb 2025 17:36:50 +0800 Subject: [PATCH 591/989] selftests/landlock: Test TCP accesses with protocol=IPPROTO_TCP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend protocol_variant structure with protocol field (Cf. socket(2)). Extend protocol fixture with TCP test suits with protocol=IPPROTO_TCP which can be used as an alias for IPPROTO_IP (=0) in socket(2). 
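For illustration, the IPPROTO_IP/IPPROTO_TCP aliasing that the new test variants rely on can be seen with a plain socket(2) call; this is a minimal userspace sketch, not part of the selftest itself:

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <unistd.h>

	int main(void)
	{
		/* protocol 0 (IPPROTO_IP) means "the default protocol for this
		 * domain/type pair", which for AF_INET/SOCK_STREAM is TCP, so
		 * both calls create the same kind of socket. */
		int a = socket(AF_INET, SOCK_STREAM, IPPROTO_IP);	/* 0 */
		int b = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);	/* 6 */

		if (a >= 0)
			close(a);
		if (b >= 0)
			close(b);
		return 0;
	}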
Signed-off-by: Mikhail Ivanov Link: https://lore.kernel.org/r/20250205093651.1424339-3-ivanov.mikhail1@huawei-partners.com Cc: # 6.7.x Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/common.h | 1 + tools/testing/selftests/landlock/net_test.c | 80 +++++++++++++++++---- 2 files changed, 67 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/landlock/common.h b/tools/testing/selftests/landlock/common.h index a604ea5d8297c..6064c9ac05329 100644 --- a/tools/testing/selftests/landlock/common.h +++ b/tools/testing/selftests/landlock/common.h @@ -207,6 +207,7 @@ enforce_ruleset(struct __test_metadata *const _metadata, const int ruleset_fd) struct protocol_variant { int domain; int type; + int protocol; }; struct service_fixture { diff --git a/tools/testing/selftests/landlock/net_test.c b/tools/testing/selftests/landlock/net_test.c index 4e0aeb53b225a..333263780fae4 100644 --- a/tools/testing/selftests/landlock/net_test.c +++ b/tools/testing/selftests/landlock/net_test.c @@ -85,18 +85,18 @@ static void setup_loopback(struct __test_metadata *const _metadata) clear_ambient_cap(_metadata, CAP_NET_ADMIN); } +static bool prot_is_tcp(const struct protocol_variant *const prot) +{ + return (prot->domain == AF_INET || prot->domain == AF_INET6) && + prot->type == SOCK_STREAM && + (prot->protocol == IPPROTO_TCP || prot->protocol == IPPROTO_IP); +} + static bool is_restricted(const struct protocol_variant *const prot, const enum sandbox_type sandbox) { - switch (prot->domain) { - case AF_INET: - case AF_INET6: - switch (prot->type) { - case SOCK_STREAM: - return sandbox == TCP_SANDBOX; - } - break; - } + if (sandbox == TCP_SANDBOX) + return prot_is_tcp(prot); return false; } @@ -105,7 +105,7 @@ static int socket_variant(const struct service_fixture *const srv) int ret; ret = socket(srv->protocol.domain, srv->protocol.type | SOCK_CLOEXEC, - 0); + srv->protocol.protocol); if (ret < 0) return -errno; return ret; @@ -290,22 +290,48 @@ FIXTURE_TEARDOWN(protocol) } /* clang-format off */ -FIXTURE_VARIANT_ADD(protocol, no_sandbox_with_ipv4_tcp) { +FIXTURE_VARIANT_ADD(protocol, no_sandbox_with_ipv4_tcp1) { /* clang-format on */ .sandbox = NO_SANDBOX, .prot = { .domain = AF_INET, .type = SOCK_STREAM, + /* IPPROTO_IP == 0 */ + .protocol = IPPROTO_IP, }, }; /* clang-format off */ -FIXTURE_VARIANT_ADD(protocol, no_sandbox_with_ipv6_tcp) { +FIXTURE_VARIANT_ADD(protocol, no_sandbox_with_ipv4_tcp2) { + /* clang-format on */ + .sandbox = NO_SANDBOX, + .prot = { + .domain = AF_INET, + .type = SOCK_STREAM, + .protocol = IPPROTO_TCP, + }, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(protocol, no_sandbox_with_ipv6_tcp1) { /* clang-format on */ .sandbox = NO_SANDBOX, .prot = { .domain = AF_INET6, .type = SOCK_STREAM, + /* IPPROTO_IP == 0 */ + .protocol = IPPROTO_IP, + }, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(protocol, no_sandbox_with_ipv6_tcp2) { + /* clang-format on */ + .sandbox = NO_SANDBOX, + .prot = { + .domain = AF_INET6, + .type = SOCK_STREAM, + .protocol = IPPROTO_TCP, }, }; @@ -350,22 +376,48 @@ FIXTURE_VARIANT_ADD(protocol, no_sandbox_with_unix_datagram) { }; /* clang-format off */ -FIXTURE_VARIANT_ADD(protocol, tcp_sandbox_with_ipv4_tcp) { +FIXTURE_VARIANT_ADD(protocol, tcp_sandbox_with_ipv4_tcp1) { + /* clang-format on */ + .sandbox = TCP_SANDBOX, + .prot = { + .domain = AF_INET, + .type = SOCK_STREAM, + /* IPPROTO_IP == 0 */ + .protocol = IPPROTO_IP, + }, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(protocol, tcp_sandbox_with_ipv4_tcp2) { /* clang-format on 
*/ .sandbox = TCP_SANDBOX, .prot = { .domain = AF_INET, .type = SOCK_STREAM, + .protocol = IPPROTO_TCP, + }, +}; + +/* clang-format off */ +FIXTURE_VARIANT_ADD(protocol, tcp_sandbox_with_ipv6_tcp1) { + /* clang-format on */ + .sandbox = TCP_SANDBOX, + .prot = { + .domain = AF_INET6, + .type = SOCK_STREAM, + /* IPPROTO_IP == 0 */ + .protocol = IPPROTO_IP, }, }; /* clang-format off */ -FIXTURE_VARIANT_ADD(protocol, tcp_sandbox_with_ipv6_tcp) { +FIXTURE_VARIANT_ADD(protocol, tcp_sandbox_with_ipv6_tcp2) { /* clang-format on */ .sandbox = TCP_SANDBOX, .prot = { .domain = AF_INET6, .type = SOCK_STREAM, + .protocol = IPPROTO_TCP, }, }; -- GitLab From 3d4033985ff508ef587ca11f1c8361ba57c7e09f Mon Sep 17 00:00:00 2001 From: Mikhail Ivanov Date: Wed, 5 Feb 2025 17:36:51 +0800 Subject: [PATCH 592/989] selftests/landlock: Test that MPTCP actions are not restricted MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend protocol fixture with test suits for MPTCP protocol. Add CONFIG_MPTCP and CONFIG_MPTCP_IPV6 options in config. Signed-off-by: Mikhail Ivanov Link: https://lore.kernel.org/r/20250205093651.1424339-4-ivanov.mikhail1@huawei-partners.com Cc: # 6.7.x Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/config | 2 + tools/testing/selftests/landlock/net_test.c | 44 +++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/tools/testing/selftests/landlock/config b/tools/testing/selftests/landlock/config index 361f94f8cb0d4..425de4c20271c 100644 --- a/tools/testing/selftests/landlock/config +++ b/tools/testing/selftests/landlock/config @@ -4,6 +4,8 @@ CONFIG_CGROUP_SCHED=y CONFIG_INET=y CONFIG_IPV6=y CONFIG_KEYS=y +CONFIG_MPTCP=y +CONFIG_MPTCP_IPV6=y CONFIG_NET=y CONFIG_NET_NS=y CONFIG_OVERLAY_FS=y diff --git a/tools/testing/selftests/landlock/net_test.c b/tools/testing/selftests/landlock/net_test.c index 333263780fae4..d9de0ee49ebc2 100644 --- a/tools/testing/selftests/landlock/net_test.c +++ b/tools/testing/selftests/landlock/net_test.c @@ -312,6 +312,17 @@ FIXTURE_VARIANT_ADD(protocol, no_sandbox_with_ipv4_tcp2) { }, }; +/* clang-format off */ +FIXTURE_VARIANT_ADD(protocol, no_sandbox_with_ipv4_mptcp) { + /* clang-format on */ + .sandbox = NO_SANDBOX, + .prot = { + .domain = AF_INET, + .type = SOCK_STREAM, + .protocol = IPPROTO_MPTCP, + }, +}; + /* clang-format off */ FIXTURE_VARIANT_ADD(protocol, no_sandbox_with_ipv6_tcp1) { /* clang-format on */ @@ -335,6 +346,17 @@ FIXTURE_VARIANT_ADD(protocol, no_sandbox_with_ipv6_tcp2) { }, }; +/* clang-format off */ +FIXTURE_VARIANT_ADD(protocol, no_sandbox_with_ipv6_mptcp) { + /* clang-format on */ + .sandbox = NO_SANDBOX, + .prot = { + .domain = AF_INET6, + .type = SOCK_STREAM, + .protocol = IPPROTO_MPTCP, + }, +}; + /* clang-format off */ FIXTURE_VARIANT_ADD(protocol, no_sandbox_with_ipv4_udp) { /* clang-format on */ @@ -398,6 +420,17 @@ FIXTURE_VARIANT_ADD(protocol, tcp_sandbox_with_ipv4_tcp2) { }, }; +/* clang-format off */ +FIXTURE_VARIANT_ADD(protocol, tcp_sandbox_with_ipv4_mptcp) { + /* clang-format on */ + .sandbox = TCP_SANDBOX, + .prot = { + .domain = AF_INET, + .type = SOCK_STREAM, + .protocol = IPPROTO_MPTCP, + }, +}; + /* clang-format off */ FIXTURE_VARIANT_ADD(protocol, tcp_sandbox_with_ipv6_tcp1) { /* clang-format on */ @@ -421,6 +454,17 @@ FIXTURE_VARIANT_ADD(protocol, tcp_sandbox_with_ipv6_tcp2) { }, }; +/* clang-format off */ +FIXTURE_VARIANT_ADD(protocol, tcp_sandbox_with_ipv6_mptcp) { + /* clang-format on */ + .sandbox = TCP_SANDBOX, + .prot = { + .domain = AF_INET6, + 
.type = SOCK_STREAM, + .protocol = IPPROTO_MPTCP, + }, +}; + /* clang-format off */ FIXTURE_VARIANT_ADD(protocol, tcp_sandbox_with_ipv4_udp) { /* clang-format on */ -- GitLab From 78332fdb956f18accfbca5993b10c5ed69f00a2c Mon Sep 17 00:00:00 2001 From: Bharadwaj Raju Date: Mon, 10 Feb 2025 21:40:57 +0530 Subject: [PATCH 593/989] selftests/landlock: Add binaries to .gitignore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building the test creates binaries 'wait-pipe' and 'sandbox-and-launch' which need to be gitignore'd. Signed-off-by: Bharadwaj Raju Link: https://lore.kernel.org/r/20250210161101.6024-1-bharadwaj.raju777@gmail.com [mic: Sort entries] Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/.gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/landlock/.gitignore b/tools/testing/selftests/landlock/.gitignore index 470203a7cd737..335b2b1a3463a 100644 --- a/tools/testing/selftests/landlock/.gitignore +++ b/tools/testing/selftests/landlock/.gitignore @@ -1,2 +1,4 @@ /*_test +/sandbox-and-launch /true +/wait-pipe -- GitLab From d3a8c28426fc1fb3252753a9f1db0d691ffc21b0 Mon Sep 17 00:00:00 2001 From: Selvarasu Ganesan Date: Sat, 1 Feb 2025 22:09:02 +0530 Subject: [PATCH 594/989] usb: dwc3: Fix timeout issue during controller enter/exit from halt state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a frequent timeout during controller enter/exit from halt state after toggling the run_stop bit by SW. This timeout occurs when performing frequent role switches between host and device, causing device enumeration issues due to the timeout. This issue was not present when USB2 suspend PHY was disabled by passing the SNPS quirks (snps,dis_u2_susphy_quirk and snps,dis_enblslpm_quirk) from the DTS. However, there is a requirement to enable USB2 suspend PHY by setting of GUSB2PHYCFG.ENBLSLPM and GUSB2PHYCFG.SUSPHY bits when controller starts in gadget or host mode results in the timeout issue. This commit addresses this timeout issue by ensuring that the bits GUSB2PHYCFG.ENBLSLPM and GUSB2PHYCFG.SUSPHY are cleared before starting the dwc3_gadget_run_stop sequence and restoring them after the dwc3_gadget_run_stop sequence is completed. Fixes: 72246da40f37 ("usb: Introduce DesignWare USB3 DRD Driver") Cc: stable Signed-off-by: Selvarasu Ganesan Acked-by: Thinh Nguyen Link: https://lore.kernel.org/r/20250201163903.459-1-selvarasu.g@samsung.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/gadget.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index d27af65eb08ae..ddd6b2ce57107 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -2629,10 +2629,38 @@ static int dwc3_gadget_run_stop(struct dwc3 *dwc, int is_on) { u32 reg; u32 timeout = 2000; + u32 saved_config = 0; if (pm_runtime_suspended(dwc->dev)) return 0; + /* + * When operating in USB 2.0 speeds (HS/FS), ensure that + * GUSB2PHYCFG.ENBLSLPM and GUSB2PHYCFG.SUSPHY are cleared before starting + * or stopping the controller. This resolves timeout issues that occur + * during frequent role switches between host and device modes. + * + * Save and clear these settings, then restore them after completing the + * controller start or stop sequence. + * + * This solution was discovered through experimentation as it is not + * mentioned in the dwc3 programming guide. 
It has been tested on an + * Exynos platforms. + */ + reg = dwc3_readl(dwc->regs, DWC3_GUSB2PHYCFG(0)); + if (reg & DWC3_GUSB2PHYCFG_SUSPHY) { + saved_config |= DWC3_GUSB2PHYCFG_SUSPHY; + reg &= ~DWC3_GUSB2PHYCFG_SUSPHY; + } + + if (reg & DWC3_GUSB2PHYCFG_ENBLSLPM) { + saved_config |= DWC3_GUSB2PHYCFG_ENBLSLPM; + reg &= ~DWC3_GUSB2PHYCFG_ENBLSLPM; + } + + if (saved_config) + dwc3_writel(dwc->regs, DWC3_GUSB2PHYCFG(0), reg); + reg = dwc3_readl(dwc->regs, DWC3_DCTL); if (is_on) { if (DWC3_VER_IS_WITHIN(DWC3, ANY, 187A)) { @@ -2660,6 +2688,12 @@ static int dwc3_gadget_run_stop(struct dwc3 *dwc, int is_on) reg &= DWC3_DSTS_DEVCTRLHLT; } while (--timeout && !(!is_on ^ !reg)); + if (saved_config) { + reg = dwc3_readl(dwc->regs, DWC3_GUSB2PHYCFG(0)); + reg |= saved_config; + dwc3_writel(dwc->regs, DWC3_GUSB2PHYCFG(0), reg); + } + if (!timeout) return -ETIMEDOUT; -- GitLab From 4aac0db5a0ebc599d4ad9bf5ebab78afa1f33e10 Mon Sep 17 00:00:00 2001 From: Stefan Eichenberger Date: Mon, 3 Feb 2025 11:58:24 +0100 Subject: [PATCH 595/989] usb: core: fix pipe creation for get_bMaxPacketSize0 When usb_control_msg is used in the get_bMaxPacketSize0 function, the USB pipe does not include the endpoint device number. This can cause failures when a usb hub port is reinitialized after encountering a bad cable connection. As a result, the system logs the following error messages: usb usb2-port1: cannot reset (err = -32) usb usb2-port1: Cannot enable. Maybe the USB cable is bad? usb usb2-port1: attempt power cycle usb 2-1: new high-speed USB device number 5 using ci_hdrc usb 2-1: device descriptor read/8, error -71 The problem began after commit 85d07c556216 ("USB: core: Unite old scheme and new scheme descriptor reads"). There usb_get_device_descriptor was replaced with get_bMaxPacketSize0. Unlike usb_get_device_descriptor, the get_bMaxPacketSize0 function uses the macro usb_rcvaddr0pipe, which does not include the endpoint device number. usb_get_device_descriptor, on the other hand, used the macro usb_rcvctrlpipe, which includes the endpoint device number. By modifying the get_bMaxPacketSize0 function to use usb_rcvctrlpipe instead of usb_rcvaddr0pipe, the issue can be resolved. This change will ensure that the endpoint device number is included in the USB pipe, preventing reinitialization failures. If the endpoint has not set the device number yet, it will still work because the device number is 0 in udev. 
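For context, the only difference between the two pipe macros is whether the device address is encoded into the pipe value; the definitions below are paraphrased from include/linux/usb.h (check the header for the authoritative versions):

	/* Paraphrased for illustration; see include/linux/usb.h. */
	static inline unsigned int __create_pipe(struct usb_device *dev,
						 unsigned int endpoint)
	{
		return (dev->devnum << 8) | (endpoint << 15);
	}

	/* usb_rcvaddr0pipe() hardcodes device address 0, while
	 * usb_rcvctrlpipe(udev, 0) carries udev->devnum, which is still 0
	 * for a not-yet-addressed device, so both work for the initial read. */
	#define usb_rcvctrlpipe(dev, endpoint) \
		((PIPE_CONTROL << 30) | __create_pipe(dev, endpoint) | USB_DIR_IN)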
Cc: stable Fixes: 85d07c556216 ("USB: core: Unite old scheme and new scheme descriptor reads") Signed-off-by: Stefan Eichenberger Reviewed-by: Alan Stern Link: https://lore.kernel.org/r/20250203105840.17539-1-eichest@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hub.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 0cd44f1fd56d2..a76bb50b62026 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -4709,7 +4709,6 @@ void usb_ep0_reinit(struct usb_device *udev) EXPORT_SYMBOL_GPL(usb_ep0_reinit); #define usb_sndaddr0pipe() (PIPE_CONTROL << 30) -#define usb_rcvaddr0pipe() ((PIPE_CONTROL << 30) | USB_DIR_IN) static int hub_set_address(struct usb_device *udev, int devnum) { @@ -4815,7 +4814,7 @@ static int get_bMaxPacketSize0(struct usb_device *udev, for (i = 0; i < GET_MAXPACKET0_TRIES; ++i) { /* Start with invalid values in case the transfer fails */ buf->bDescriptorType = buf->bMaxPacketSize0 = 0; - rc = usb_control_msg(udev, usb_rcvaddr0pipe(), + rc = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), USB_REQ_GET_DESCRIPTOR, USB_DIR_IN, USB_DT_DEVICE << 8, 0, buf, size, -- GitLab From 4ab37fcb42832cdd3e9d5e50653285ca84d6686f Mon Sep 17 00:00:00 2001 From: Jill Donahue Date: Tue, 11 Feb 2025 10:48:05 -0700 Subject: [PATCH 596/989] USB: gadget: f_midi: f_midi_complete to call queue_work When using USB MIDI, a lock is attempted to be acquired twice through a re-entrant call to f_midi_transmit, causing a deadlock. Fix it by using queue_work() to schedule the inner f_midi_transmit() via a high priority work queue from the completion handler. Link: https://lore.kernel.org/all/CAArt=LjxU0fUZOj06X+5tkeGT+6RbXzpWg1h4t4Fwa_KGVAX6g@mail.gmail.com/ Fixes: d5daf49b58661 ("USB: gadget: midi: add midi function driver") Cc: stable Signed-off-by: Jill Donahue Link: https://lore.kernel.org/r/20250211174805.1369265-1-jdonahue@fender.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_midi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/function/f_midi.c b/drivers/usb/gadget/function/f_midi.c index 47260d65066a8..da82598fcef8a 100644 --- a/drivers/usb/gadget/function/f_midi.c +++ b/drivers/usb/gadget/function/f_midi.c @@ -283,7 +283,7 @@ f_midi_complete(struct usb_ep *ep, struct usb_request *req) /* Our transmit completed. See if there's more to go. * f_midi_transmit eats req, don't queue it again. */ req->length = 0; - f_midi_transmit(midi); + queue_work(system_highpri_wq, &midi->work); return; } break; -- GitLab From 399a45e5237ca14037120b1b895bd38a3b4492ea Mon Sep 17 00:00:00 2001 From: Roy Luo Date: Tue, 4 Feb 2025 23:36:42 +0000 Subject: [PATCH 597/989] usb: gadget: core: flush gadget workqueue after device removal device_del() can lead to new work being scheduled in gadget->work workqueue. This is observed, for example, with the dwc3 driver with the following call stack: device_del() gadget_unbind_driver() usb_gadget_disconnect_locked() dwc3_gadget_pullup() dwc3_gadget_soft_disconnect() usb_gadget_set_state() schedule_work(&gadget->work) Move flush_work() after device_del() to ensure the workqueue is cleaned up. 
Fixes: 5702f75375aa9 ("usb: gadget: udc-core: move sysfs_notify() to a workqueue") Cc: stable Signed-off-by: Roy Luo Reviewed-by: Alan Stern Reviewed-by: Thinh Nguyen Link: https://lore.kernel.org/r/20250204233642.666991-1-royluo@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index a6f46364be65f..4b3d5075621aa 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -1543,8 +1543,8 @@ void usb_del_gadget(struct usb_gadget *gadget) kobject_uevent(&udc->dev.kobj, KOBJ_REMOVE); sysfs_remove_link(&udc->dev.kobj, "gadget"); - flush_work(&gadget->work); device_del(&gadget->dev); + flush_work(&gadget->work); ida_free(&gadget_id_numbers, gadget->id_number); cancel_work_sync(&udc->vbus_work); device_unregister(&udc->dev); -- GitLab From e169d96eecd447ff7fd7542ca5fa0911f5622054 Mon Sep 17 00:00:00 2001 From: Lei Huang Date: Wed, 12 Feb 2025 17:38:29 +0800 Subject: [PATCH 598/989] USB: quirks: add USB_QUIRK_NO_LPM quirk for Teclast dist A Teclast disk used on Huawei hisi platforms doesn't work well, losing connectivity intermittently if LPM is enabled. Add a quirk to disable LPM to resolve the issue. Signed-off-by: Lei Huang Cc: stable Link: https://lore.kernel.org/r/20250212093829.7379-1-huanglei814@163.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/quirks.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 59ed9768dae1f..dfcfc142bd5e1 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -528,6 +528,9 @@ static const struct usb_device_id usb_quirk_list[] = { /* Blackmagic Design UltraStudio SDI */ { USB_DEVICE(0x1edb, 0xbd4f), .driver_info = USB_QUIRK_NO_LPM }, + /* Teclast disk */ + { USB_DEVICE(0x1f75, 0x0917), .driver_info = USB_QUIRK_NO_LPM }, + /* Hauppauge HVR-950q */ { USB_DEVICE(0x2040, 0x7200), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, -- GitLab From e5644be4079750a0a0a5a7068fd90b97bf6fac55 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 12 Feb 2025 14:55:14 +0100 Subject: [PATCH 599/989] usb: gadget: uvc: Fix unstarted kthread worker The behaviour of kthread_create_worker() was recently changed to align with that of kthread_create(). The kthread worker is created but not woken by default. This is to allow the use of kthread_affine_preferred() and kthread_bind[_mask]() with kthread workers. In order to keep the old behaviour and wake the kthread up, kthread_run_worker() must be used. All the pre-existing users have been converted, except for UVC, which was introduced in the same merge window as the API change. This results in hangs: INFO: task UVCG:82 blocked for more than 491 seconds. Tainted: G T 6.13.0-rc2-00014-gb04e317b5226 #1 task:UVCG state:D stack:0 pid:82 Call Trace: __schedule schedule schedule_preempt_disabled kthread ? kthread_flush_work ret_from_fork ret_from_fork_asm entry_INT80_32 Fix this by converting the UVCG kworker to the new API.
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202502121025.55bfa801-lkp@intel.com Fixes: f0bbfbd16b3b ("usb: gadget: uvc: rework to enqueue in pump worker from encoded queue") Cc: stable Cc: Michael Grzeschik Signed-off-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20250212135514.30539-1-frederic@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/uvc_video.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/function/uvc_video.c b/drivers/usb/gadget/function/uvc_video.c index 79e223713d8b9..fb77b0b217901 100644 --- a/drivers/usb/gadget/function/uvc_video.c +++ b/drivers/usb/gadget/function/uvc_video.c @@ -818,7 +818,7 @@ int uvcg_video_init(struct uvc_video *video, struct uvc_device *uvc) return -EINVAL; /* Allocate a kthread for asynchronous hw submit handler. */ - video->kworker = kthread_create_worker(0, "UVCG"); + video->kworker = kthread_run_worker(0, "UVCG"); if (IS_ERR(video->kworker)) { uvcg_err(&video->uvc->func, "failed to create UVCG kworker\n"); return PTR_ERR(video->kworker); } -- GitLab From 634775a752a86784511018a108f3b530cc3399a7 Mon Sep 17 00:00:00 2001 From: Elson Roy Serrao Date: Thu, 6 Feb 2025 11:39:50 -0800 Subject: [PATCH 600/989] usb: roles: set switch registered flag early on The role switch registration and set_role() can happen in parallel as they are invoked independently of each other. There is a possibility that a driver might spend a significant amount of time in the usb_role_switch_register() API due to the presence of time-intensive operations like component_add(), which operate under a common mutex. This leads to a time window after allocating the switch and before setting the registered flag where the set role notifications are dropped. The timeline below summarizes this behavior: Thread1 | Thread2 usb_role_switch_register() | | | ---> allocate switch | | | ---> component_add() | usb_role_switch_set_role() | | | | | --> Drop role notifications | | since sw->registered | | flag is not set. | | --->Set registered flag.| To avoid this, set the registered flag early on in the switch register API. Fixes: b787a3e78175 ("usb: roles: don't get/set_role() when usb_role_switch is unregistered") Cc: stable Signed-off-by: Elson Roy Serrao Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20250206193950.22421-1-quic_eserrao@quicinc.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/roles/class.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/usb/roles/class.c b/drivers/usb/roles/class.c index c58a12c147f45..30482d4cf8267 100644 --- a/drivers/usb/roles/class.c +++ b/drivers/usb/roles/class.c @@ -387,8 +387,11 @@ usb_role_switch_register(struct device *parent, dev_set_name(&sw->dev, "%s-role-switch", desc->name ? desc->name : dev_name(parent)); + sw->registered = true; + ret = device_register(&sw->dev); if (ret) { + sw->registered = false; put_device(&sw->dev); return ERR_PTR(ret); } @@ -399,8 +402,6 @@ usb_role_switch_register(struct device *parent, dev_warn(&sw->dev, "failed to add component\n"); } - sw->registered = true; - /* TODO: Symlinks for the host port and the device controller.
*/ return sw; -- GitLab From 659f5d55feb75782bd46cf130da3c1f240afe9ba Mon Sep 17 00:00:00 2001 From: Jos Wang Date: Thu, 13 Feb 2025 21:49:21 +0800 Subject: [PATCH 601/989] usb: typec: tcpm: PSSourceOffTimer timeout in PR_Swap enters ERROR_RECOVERY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As PD2.0 spec ("6.5.6.2 PSSourceOffTimer"),the PSSourceOffTimer is used by the Policy Engine in Dual-Role Power device that is currently acting as a Sink to timeout on a PS_RDY Message during a Power Role Swap sequence. This condition leads to a Hard Reset for USB Type-A and Type-B Plugs and Error Recovery for Type-C plugs and return to USB Default Operation. Therefore, after PSSourceOffTimer timeout, the tcpm state machine should switch from PR_SWAP_SNK_SRC_SINK_OFF to ERROR_RECOVERY. This can also solve the test items in the USB power delivery compliance test: TEST.PD.PROT.SNK.12 PR_Swap – PSSourceOffTimer Timeout [1] https://usb.org/document-library/usb-power-delivery-compliance-test-specification-0/USB_PD3_CTS_Q4_2025_OR.zip Fixes: f0690a25a140 ("staging: typec: USB Type-C Port Manager (tcpm)") Cc: stable Signed-off-by: Jos Wang Reviewed-by: Heikki Krogerus Tested-by: Amit Sunil Dhamne Link: https://lore.kernel.org/r/20250213134921.3798-1-joswang1221@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index 47be450d2be35..6bf1a22c785af 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -5591,8 +5591,7 @@ static void run_state_machine(struct tcpm_port *port) tcpm_set_auto_vbus_discharge_threshold(port, TYPEC_PWR_MODE_USB, port->pps_data.active, 0); tcpm_set_charge(port, false); - tcpm_set_state(port, hard_reset_state(port), - port->timings.ps_src_off_time); + tcpm_set_state(port, ERROR_RECOVERY, port->timings.ps_src_off_time); break; case PR_SWAP_SNK_SRC_SOURCE_ON: tcpm_enable_auto_vbus_discharge(port, true); -- GitLab From 66314e9a57a050f95cb0ebac904f5ab047a8926e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 2 Feb 2025 16:50:14 -0800 Subject: [PATCH 602/989] xfs: fix online repair probing when CONFIG_XFS_ONLINE_REPAIR=n I received a report from the release engineering side of the house that xfs_scrub without the -n flag (aka fix it mode) would try to fix a broken filesystem even on a kernel that doesn't have online repair built into it: # xfs_scrub -dTvn /mnt/test EXPERIMENTAL xfs_scrub program in use! Use at your own risk! Phase 1: Find filesystem geometry. /mnt/test: using 1 threads to scrub. Phase 1: Memory used: 132k/0k (108k/25k), time: 0.00/ 0.00/ 0.00s Phase 4: Repair filesystem. Info: /mnt/test/some/victimdir directory entries: Attempting repair. (repair.c line 351) Corruption: /mnt/test/some/victimdir directory entries: Repair unsuccessful; offline repair required. (repair.c line 204) Source: https://blogs.oracle.com/linux/post/xfs-online-filesystem-repair It is strange that xfs_scrub doesn't refuse to run, because the kernel is supposed to return EOPNOTSUPP if we actually needed to run a repair, and xfs_io's repair subcommand will perror that. And yet: # xfs_io -x -c 'repair probe' /mnt/test # The first problem is commit dcb660f9222fd9 (4.15) which should have had xchk_probe set the CORRUPT OFLAG so that any of the repair machinery will get called at all. 
It turns out that some refactoring that happened in the 6.6-6.8 era broke the operation of this corner case. What we *really* want to happen is that all the predicates that would steer xfs_scrub_metadata() towards calling xrep_attempt() should function the same way that they do when repair is compiled in; and then xrep_attempt gets to return the fatal EOPNOTSUPP error code that causes the probe to fail. Instead, commit 8336a64eb75cba (6.6) started the failwhale swimming by hoisting OFLAG checking logic into a helper whose non-repair stub always returns false, causing scrub to return "repair not needed" when in fact the repair is not supported. Prior to that commit, the oflag checking that was open-coded in scrub.c worked correctly. Similarly, in commit 4bdfd7d15747b1 (6.8) we hoisted the IFLAG_REPAIR and ALREADY_FIXED logic into a helper whose non-repair stub always returns false, so we never enter the if test body that would have called xrep_attempt, let alone fail to decode the OFLAGs correctly. The final insult (yes, we're doing The Naked Gun now) is commit 48a72f60861f79 (6.8) in which we hoisted the "are we going to try a repair?" predicate into yet another function with a non-repair stub always returns false. Fix xchk_probe to trigger xrep_probe if repair is enabled, or return EOPNOTSUPP directly if it is not. For all the other scrub types, we need to fix the header predicates so that the ->repair functions (which are all xrep_notsupported) get called to return EOPNOTSUPP. Commit 48a72 is tagged here because the scrub code prior to LTS 6.12 are incomplete and not worth patching. Reported-by: David Flynn Cc: # v6.8 Fixes: 8336a64eb75c ("xfs: don't complain about unfixed metadata when repairs were injected") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/common.h | 5 ----- fs/xfs/scrub/repair.h | 11 ++++++++++- fs/xfs/scrub/scrub.c | 12 ++++++++++++ 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index bdcd40f0ec742..19877d99f255b 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -224,7 +224,6 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) bool xchk_dir_looks_zapped(struct xfs_inode *dp); bool xchk_pptr_looks_zapped(struct xfs_inode *ip); -#ifdef CONFIG_XFS_ONLINE_REPAIR /* Decide if a repair is required. */ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm) { @@ -244,10 +243,6 @@ static inline bool xchk_could_repair(const struct xfs_scrub *sc) return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && !(sc->flags & XREP_ALREADY_FIXED); } -#else -# define xchk_needs_repair(sc) (false) -# define xchk_could_repair(sc) (false) -#endif /* CONFIG_XFS_ONLINE_REPAIR */ int xchk_metadata_inode_forks(struct xfs_scrub *sc); diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 823c00d1a5026..af0a3a9e5ed97 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -191,7 +191,16 @@ int xrep_reset_metafile_resv(struct xfs_scrub *sc); #else #define xrep_ino_dqattach(sc) (0) -#define xrep_will_attempt(sc) (false) + +/* + * When online repair is not built into the kernel, we still want to attempt + * the repair so that the stub xrep_attempt below will return EOPNOTSUPP. 
+ */ +static inline bool xrep_will_attempt(const struct xfs_scrub *sc) +{ + return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || + xchk_needs_repair(sc->sm); +} static inline int xrep_attempt( diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 7567dd5cad14f..6fa9e3e5bab7b 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -149,6 +149,18 @@ xchk_probe( if (xchk_should_terminate(sc, &error)) return error; + /* + * If the caller is probing to see if repair works but repair isn't + * built into the kernel, return EOPNOTSUPP because that's the signal + * that userspace expects. If online repair is built in, set the + * CORRUPT flag (without any of the usual tracing/logging) to force us + * into xrep_probe. + */ + if (xchk_could_repair(sc)) { + if (!IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)) + return -EOPNOTSUPP; + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + } return 0; } -- GitLab From 6e33017c3276e3af7f79f61f3b3648e4a4c03d34 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 2 Feb 2025 16:50:14 -0800 Subject: [PATCH 603/989] xfs: fix data fork format filtering during inode repair Coverity noticed that xrep_dinode_bad_metabt_fork never runs because XFS_DINODE_FMT_META_BTREE is always filtered out in the mode selection switch of xrep_dinode_check_dfork. Metadata btrees are allowed only in the data forks of regular files, so add this case explicitly. I guess this got fubard during a refactoring prior to 6.13 and I didn't notice until now. :/ Coverity-id: 1617714 Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/inode_repair.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index 2f641b6d663eb..13ff1c933cb8f 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -1055,9 +1055,17 @@ xrep_dinode_check_dfork( return true; break; case S_IFREG: - if (fmt == XFS_DINODE_FMT_LOCAL) + switch (fmt) { + case XFS_DINODE_FMT_LOCAL: return true; - fallthrough; + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + case XFS_DINODE_FMT_META_BTREE: + break; + default: + return true; + } + break; case S_IFLNK: case S_IFDIR: switch (fmt) { -- GitLab From 9e00163c31676c6b43d2334fdf5b406232f42dee Mon Sep 17 00:00:00 2001 From: Lukas Herbolt Date: Mon, 3 Feb 2025 09:55:13 +0100 Subject: [PATCH 604/989] xfs: do not check NEEDSREPAIR if ro,norecovery mount. If there is corrutpion on the filesystem andxfs_repair fails to repair it. The last resort of getting the data is to use norecovery,ro mount. But if the NEEDSREPAIR is set the filesystem cannot be mounted. The flag must be cleared out manually using xfs_db, to get access to what left over of the corrupted fs. Signed-off-by: Lukas Herbolt Reviewed-by: Dave Chinner Reviewed-by: Eric Sandeen Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_super.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index d92d7a07ea89b..0055066fb1d98 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1661,8 +1661,12 @@ xfs_fs_fill_super( #endif } - /* Filesystem claims it needs repair, so refuse the mount. */ - if (xfs_has_needsrepair(mp)) { + /* + * Filesystem claims it needs repair, so refuse the mount unless + * norecovery is also specified, in which case the filesystem can + * be mounted with no risk of further damage. 
+ */ + if (xfs_has_needsrepair(mp) && !xfs_has_norecovery(mp)) { xfs_warn(mp, "Filesystem needs repair. Please run xfs_repair."); error = -EFSCORRUPTED; goto out_free_sb; -- GitLab From 9f0902091c332b2665951cfb970f60ae7cbdc0f3 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Mon, 3 Feb 2025 14:04:57 +0100 Subject: [PATCH 605/989] xfs: Do not allow norecovery mount with quotacheck Mounting a filesystem that requires quota state changing will generate a transaction. We already check for a read-only device; we should do that for norecovery too. A quotacheck on a norecovery mount, and with the right log size, will cause the mount process to hang on: [<0>] xlog_grant_head_wait+0x5d/0x2a0 [xfs] [<0>] xlog_grant_head_check+0x112/0x180 [xfs] [<0>] xfs_log_reserve+0xe3/0x260 [xfs] [<0>] xfs_trans_reserve+0x179/0x250 [xfs] [<0>] xfs_trans_alloc+0x101/0x260 [xfs] [<0>] xfs_sync_sb+0x3f/0x80 [xfs] [<0>] xfs_qm_mount_quotas+0xe3/0x2f0 [xfs] [<0>] xfs_mountfs+0x7ad/0xc20 [xfs] [<0>] xfs_fs_fill_super+0x762/0xa50 [xfs] [<0>] get_tree_bdev_flags+0x131/0x1d0 [<0>] vfs_get_tree+0x26/0xd0 [<0>] vfs_cmd_create+0x59/0xe0 [<0>] __do_sys_fsconfig+0x4e3/0x6b0 [<0>] do_syscall_64+0x82/0x160 [<0>] entry_SYSCALL_64_after_hwframe+0x76/0x7e This is caused by a transaction running with bogus initialized head/tail I initially hit this while running generic/050, with random log sizes, but I managed to reproduce it reliably here with the steps below: mkfs.xfs -f -lsize=1025M -f -b size=4096 -m crc=1,reflink=1,rmapbt=1, -i sparse=1 /dev/vdb2 > /dev/null mount -o usrquota,grpquota,prjquota /dev/vdb2 /mnt xfs_io -x -c 'shutdown -f' /mnt umount /mnt mount -o ro,norecovery,usrquota,grpquota,prjquota /dev/vdb2 /mnt Last mount hangs up As we add yet another validation if quota state is changing, this also add a new helper named xfs_qm_validate_state_change(), factoring the quota state changes out of xfs_qm_newmount() to reduce cluttering within it. Signed-off-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_qm_bhv.c | 55 ++++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 37f1230e75846..245d754f382a7 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -78,6 +78,28 @@ xfs_qm_statvfs( } } +STATIC int +xfs_qm_validate_state_change( + struct xfs_mount *mp, + uint uqd, + uint gqd, + uint pqd) +{ + int state; + + /* Is quota state changing? */ + state = ((uqd && !XFS_IS_UQUOTA_ON(mp)) || + (!uqd && XFS_IS_UQUOTA_ON(mp)) || + (gqd && !XFS_IS_GQUOTA_ON(mp)) || + (!gqd && XFS_IS_GQUOTA_ON(mp)) || + (pqd && !XFS_IS_PQUOTA_ON(mp)) || + (!pqd && XFS_IS_PQUOTA_ON(mp))); + + return state && + (xfs_dev_is_read_only(mp, "changing quota state") || + xfs_has_norecovery(mp)); +} + int xfs_qm_newmount( xfs_mount_t *mp, @@ -97,24 +119,25 @@ xfs_qm_newmount( } /* - * If the device itself is read-only, we can't allow - * the user to change the state of quota on the mount - - * this would generate a transaction on the ro device, - * which would lead to an I/O error and shutdown + * If the device itself is read-only and/or in norecovery + * mode, we can't allow the user to change the state of + * quota on the mount - this would generate a transaction + * on the ro device, which would lead to an I/O error and + * shutdown. 
*/ - if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) || - (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) || - (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || - (!gquotaondisk && XFS_IS_GQUOTA_ON(mp)) || - (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) || - (!pquotaondisk && XFS_IS_PQUOTA_ON(mp))) && - xfs_dev_is_read_only(mp, "changing quota state")) { - xfs_warn(mp, "please mount with%s%s%s%s.", - (!quotaondisk ? "out quota" : ""), - (uquotaondisk ? " usrquota" : ""), - (gquotaondisk ? " grpquota" : ""), - (pquotaondisk ? " prjquota" : "")); + if (xfs_qm_validate_state_change(mp, uquotaondisk, + gquotaondisk, pquotaondisk)) { + + if (xfs_has_metadir(mp)) + xfs_warn(mp, + "metadir enabled, please mount without any quota mount options"); + else + xfs_warn(mp, "please mount with%s%s%s%s.", + (!quotaondisk ? "out quota" : ""), + (uquotaondisk ? " usrquota" : ""), + (gquotaondisk ? " grpquota" : ""), + (pquotaondisk ? " prjquota" : "")); return -EPERM; } -- GitLab From 3cd6a8056f5a2e794c42fc2114ee2611e358b357 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:15:01 +0100 Subject: [PATCH 606/989] xfs: rename xfs_iomap_swapfile_activate to xfs_vm_swap_activate Match the method name and the naming convention or address_space operations. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_aops.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 67877c36ed11a..a80608e82c9b9 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -528,7 +528,7 @@ xfs_vm_readahead( } static int -xfs_iomap_swapfile_activate( +xfs_vm_swap_activate( struct swap_info_struct *sis, struct file *swap_file, sector_t *span) @@ -549,11 +549,11 @@ const struct address_space_operations xfs_address_space_operations = { .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, - .swap_activate = xfs_iomap_swapfile_activate, + .swap_activate = xfs_vm_swap_activate, }; const struct address_space_operations xfs_dax_aops = { .writepages = xfs_dax_writepages, .dirty_folio = noop_dirty_folio, - .swap_activate = xfs_iomap_swapfile_activate, + .swap_activate = xfs_vm_swap_activate, }; -- GitLab From 2d873efd174bae9005776937d5ac6a96050266db Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Feb 2025 07:15:00 +0100 Subject: [PATCH 607/989] xfs: flush inodegc before swapon Fix the brand new xfstest that tries to swapon on a recently unshared file and use the chance to document the other bit of magic in this function. The big comment is taken from a mailinglist post by Dave Chinner. Fixes: 5e672cd69f0a53 ("xfs: introduce xfs_inodegc_push()") Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_aops.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a80608e82c9b9..6d9965b546cbb 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -19,6 +19,7 @@ #include "xfs_reflink.h" #include "xfs_errortag.h" #include "xfs_error.h" +#include "xfs_icache.h" struct xfs_writepage_ctx { struct iomap_writepage_ctx ctx; @@ -533,7 +534,39 @@ xfs_vm_swap_activate( struct file *swap_file, sector_t *span) { - sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev; + struct xfs_inode *ip = XFS_I(file_inode(swap_file)); + + /* + * Swap file activation can race against concurrent shared extent + * removal in files that have been cloned. If this happens, + * iomap_swapfile_iter() can fail because it encountered a shared + * extent even though an operation is in progress to remove those + * shared extents. + * + * This race becomes problematic when we defer extent removal + * operations beyond the end of a syscall (i.e. use async background + * processing algorithms). Users think the extents are no longer + * shared, but iomap_swapfile_iter() still sees them as shared + * because the refcountbt entries for the extents being removed have + * not yet been updated. Hence the swapon call fails unexpectedly. + * + * The race condition is currently most obvious from the unlink() + * operation as extent removal is deferred until after the last + * reference to the inode goes away. We then process the extent + * removal asynchronously, hence triggers the "syscall completed but + * work not done" condition mentioned above. To close this race + * window, we need to flush any pending inodegc operations to ensure + * they have updated the refcountbt records before we try to map the + * swapfile. + */ + xfs_inodegc_flush(ip->i_mount); + + /* + * Direct the swap code to the correct block device when this file + * sits on the RT device. + */ + sis->bdev = xfs_inode_buftarg(ip)->bt_bdev; + return iomap_swapfile_activate(sis, swap_file, span, &xfs_read_iomap_ops); } -- GitLab From 9e512eaaf8f4008c44ede3dfc0fbc9d9c5118583 Mon Sep 17 00:00:00 2001 From: John Keeping Date: Sat, 8 Feb 2025 12:41:44 +0000 Subject: [PATCH 608/989] serial: 8250: Fix fifo underflow on flush When flushing the serial port's buffer, uart_flush_buffer() calls kfifo_reset() but if there is an outstanding DMA transfer then the completion function will consume data from the kfifo via uart_xmit_advance(), underflowing and leading to ongoing DMA as the driver tries to transmit another 2^32 bytes. This is readily reproduced with serial-generic and amidi sending even short messages as closing the device on exit will wait for the fifo to drain and in the underflow case amidi hangs for 30 seconds on exit in tty_wait_until_sent(). 
A trace of that gives: kworker/1:1-84 [001] 51.769423: bprint: serial8250_tx_dma: tx_size=3 fifo_len=3 amidi-763 [001] 51.769460: bprint: uart_flush_buffer: resetting fifo irq/21-fe530000-76 [000] 51.769474: bprint: __dma_tx_complete: tx_size=3 irq/21-fe530000-76 [000] 51.769479: bprint: serial8250_tx_dma: tx_size=4096 fifo_len=4294967293 irq/21-fe530000-76 [000] 51.781295: bprint: __dma_tx_complete: tx_size=4096 irq/21-fe530000-76 [000] 51.781301: bprint: serial8250_tx_dma: tx_size=4096 fifo_len=4294963197 irq/21-fe530000-76 [000] 51.793131: bprint: __dma_tx_complete: tx_size=4096 irq/21-fe530000-76 [000] 51.793135: bprint: serial8250_tx_dma: tx_size=4096 fifo_len=4294959101 irq/21-fe530000-76 [000] 51.804949: bprint: __dma_tx_complete: tx_size=4096 Since the port lock is held in when the kfifo is reset in uart_flush_buffer() and in __dma_tx_complete(), adding a flush_buffer hook to adjust the outstanding DMA byte count is sufficient to avoid the kfifo underflow. Fixes: 9ee4b83e51f74 ("serial: 8250: Add support for dmaengine") Cc: stable Signed-off-by: John Keeping Link: https://lore.kernel.org/r/20250208124148.1189191-1-jkeeping@inmusicbrands.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250.h | 2 ++ drivers/tty/serial/8250/8250_dma.c | 16 ++++++++++++++++ drivers/tty/serial/8250/8250_port.c | 9 +++++++++ 3 files changed, 27 insertions(+) diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h index 11e05aa014e54..b861585ca02ac 100644 --- a/drivers/tty/serial/8250/8250.h +++ b/drivers/tty/serial/8250/8250.h @@ -374,6 +374,7 @@ static inline int is_omap1510_8250(struct uart_8250_port *pt) #ifdef CONFIG_SERIAL_8250_DMA extern int serial8250_tx_dma(struct uart_8250_port *); +extern void serial8250_tx_dma_flush(struct uart_8250_port *); extern int serial8250_rx_dma(struct uart_8250_port *); extern void serial8250_rx_dma_flush(struct uart_8250_port *); extern int serial8250_request_dma(struct uart_8250_port *); @@ -406,6 +407,7 @@ static inline int serial8250_tx_dma(struct uart_8250_port *p) { return -1; } +static inline void serial8250_tx_dma_flush(struct uart_8250_port *p) { } static inline int serial8250_rx_dma(struct uart_8250_port *p) { return -1; diff --git a/drivers/tty/serial/8250/8250_dma.c b/drivers/tty/serial/8250/8250_dma.c index d215c494ee24c..f245a84f4a508 100644 --- a/drivers/tty/serial/8250/8250_dma.c +++ b/drivers/tty/serial/8250/8250_dma.c @@ -149,6 +149,22 @@ int serial8250_tx_dma(struct uart_8250_port *p) return ret; } +void serial8250_tx_dma_flush(struct uart_8250_port *p) +{ + struct uart_8250_dma *dma = p->dma; + + if (!dma->tx_running) + return; + + /* + * kfifo_reset() has been called by the serial core, avoid + * advancing and underflowing in __dma_tx_complete(). 
+ */ + dma->tx_size = 0; + + dmaengine_terminate_async(dma->rxchan); +} + int serial8250_rx_dma(struct uart_8250_port *p) { struct uart_8250_dma *dma = p->dma; diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index d7976a21cca9c..442967a6cd52d 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -2555,6 +2555,14 @@ static void serial8250_shutdown(struct uart_port *port) serial8250_do_shutdown(port); } +static void serial8250_flush_buffer(struct uart_port *port) +{ + struct uart_8250_port *up = up_to_u8250p(port); + + if (up->dma) + serial8250_tx_dma_flush(up); +} + static unsigned int serial8250_do_get_divisor(struct uart_port *port, unsigned int baud, unsigned int *frac) @@ -3244,6 +3252,7 @@ static const struct uart_ops serial8250_pops = { .break_ctl = serial8250_break_ctl, .startup = serial8250_startup, .shutdown = serial8250_shutdown, + .flush_buffer = serial8250_flush_buffer, .set_termios = serial8250_set_termios, .set_ldisc = serial8250_set_ldisc, .pm = serial8250_pm, -- GitLab From 0c67c37e1710b2a8f61c8a02db95a51fe577e2c1 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 11 Feb 2025 13:47:50 -0800 Subject: [PATCH 609/989] fuse: revert back to __readahead_folio() for readahead MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit 3eab9d7bc2f4 ("fuse: convert readahead to use folios"), the logic was converted to using the new folio readahead code, which drops the reference on the folio once it is locked, using an inferred reference on the folio. Previously we held a reference on the folio for the entire duration of the readpages call. This is fine, however for the case for splice pipe responses where we will remove the old folio and splice in the new folio (see fuse_try_move_page()), we assume that there is a reference held on the folio for ap->folios, which is no longer the case. To fix this, revert back to __readahead_folio() which allows us to hold the reference on the folio for the duration of readpages until either we drop the reference ourselves in fuse_readpages_end() or the reference is dropped after it's replaced in the page cache in the splice case. This will fix the UAF bug that was reported. Link: https://lore.kernel.org/linux-fsdevel/2f681f48-00f5-4e09-8431-2b3dbfaa881e@heusel.eu/ Fixes: 3eab9d7bc2f4 ("fuse: convert readahead to use folios") Reported-by: Christian Heusel Closes: https://lore.kernel.org/all/2f681f48-00f5-4e09-8431-2b3dbfaa881e@heusel.eu/ Closes: https://gitlab.archlinux.org/archlinux/packaging/packages/linux/-/issues/110 Reported-by: Mantas Mikulėnas Closes: https://lore.kernel.org/all/34feb867-09e2-46e4-aa31-d9660a806d1a@gmail.com/ Closes: https://bugzilla.opensuse.org/show_bug.cgi?id=1236660 Cc: # v6.13 Signed-off-by: Joanne Koong Reviewed-by: Jeff Layton Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 6 ++++++ fs/fuse/file.c | 13 +++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 5b5f789b37eb6..2b2d1b7555444 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -838,6 +838,12 @@ static int fuse_check_folio(struct folio *folio) return 0; } +/* + * Attempt to steal a page from the splice() pipe and move it into the + * pagecache. If successful, the pointer in @pagep will be updated. The + * folio that was originally in @pagep will lose a reference and the new + * folio returned in @pagep will carry a reference. 
+ */ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) { int err; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 7d92a54799985..d63e56fd3dd20 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -955,8 +955,10 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, fuse_invalidate_atime(inode); } - for (i = 0; i < ap->num_folios; i++) + for (i = 0; i < ap->num_folios; i++) { folio_end_read(ap->folios[i], !err); + folio_put(ap->folios[i]); + } if (ia->ff) fuse_file_put(ia->ff, false); @@ -1048,7 +1050,14 @@ static void fuse_readahead(struct readahead_control *rac) ap = &ia->ap; while (ap->num_folios < cur_pages) { - folio = readahead_folio(rac); + /* + * This returns a folio with a ref held on it. + * The ref needs to be held until the request is + * completed, since the splice case (see + * fuse_try_move_page()) drops the ref after it's + * replaced in the page cache. + */ + folio = __readahead_folio(rac); ap->folios[ap->num_folios] = folio; ap->descs[ap->num_folios].length = folio_size(folio); ap->num_folios++; -- GitLab From 8789b4296aa796f658a19cac7d27365012893de1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Draszik?= Date: Thu, 5 Dec 2024 10:22:00 +0000 Subject: [PATCH 610/989] phy: exynos5-usbdrd: gs101: ensure power is gated to SS phy in phy_exit() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We currently don't gate the power to the SS phy in phy_exit(). Shuffle the code slightly to ensure the power is gated to the SS phy as well. Fixes: 32267c29bc7d ("phy: exynos5-usbdrd: support Exynos USBDRD 3.1 combo phy (HS & SS)") CC: stable@vger.kernel.org # 6.11+ Reviewed-by: Krzysztof Kozlowski Reviewed-by: Peter Griffin Signed-off-by: André Draszik Link: https://lore.kernel.org/r/20241205-gs101-usb-phy-fix-v4-1-0278809fb810@linaro.org Signed-off-by: Vinod Koul --- drivers/phy/samsung/phy-exynos5-usbdrd.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/phy/samsung/phy-exynos5-usbdrd.c b/drivers/phy/samsung/phy-exynos5-usbdrd.c index 4a108fdab118c..46b8f6987c62c 100644 --- a/drivers/phy/samsung/phy-exynos5-usbdrd.c +++ b/drivers/phy/samsung/phy-exynos5-usbdrd.c @@ -1296,14 +1296,17 @@ static int exynos5_usbdrd_gs101_phy_exit(struct phy *phy) struct exynos5_usbdrd_phy *phy_drd = to_usbdrd_phy(inst); int ret; + if (inst->phy_cfg->id == EXYNOS5_DRDPHY_UTMI) { + ret = exynos850_usbdrd_phy_exit(phy); + if (ret) + return ret; + } + + exynos5_usbdrd_phy_isol(inst, true); + if (inst->phy_cfg->id != EXYNOS5_DRDPHY_UTMI) return 0; - ret = exynos850_usbdrd_phy_exit(phy); - if (ret) - return ret; - - exynos5_usbdrd_phy_isol(inst, true); return regulator_bulk_disable(phy_drd->drv_data->n_regulators, phy_drd->regulators); } -- GitLab From 5ab90f40121a9f6a9b368274cd92d0f435dc7cfa Mon Sep 17 00:00:00 2001 From: Andrew Davis Date: Thu, 23 Jan 2025 12:22:34 -0600 Subject: [PATCH 611/989] phy: ti: gmii-sel: Do not use syscon helper to build regmap The syscon helper device_node_to_regmap() is used to fetch a regmap registered to a device node. It also currently creates this regmap if the node did not already have a regmap associated with it. This should only be used on "syscon" nodes. This driver is not such a device and instead uses device_node_to_regmap() on its own node as a hacky way to create a regmap for itself. 
This will not work going forward and so we should create our regmap the normal way by defining our regmap_config, fetching our memory resource, then using the normal regmap_init_mmio() function. Signed-off-by: Andrew Davis Tested-by: Nishanth Menon Link: https://lore.kernel.org/r/20250123182234.597665-1-afd@ti.com Signed-off-by: Vinod Koul --- drivers/phy/ti/phy-gmii-sel.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/phy/ti/phy-gmii-sel.c b/drivers/phy/ti/phy-gmii-sel.c index e0ca59ae31531..ff5d5e29629fa 100644 --- a/drivers/phy/ti/phy-gmii-sel.c +++ b/drivers/phy/ti/phy-gmii-sel.c @@ -424,6 +424,12 @@ static int phy_gmii_sel_init_ports(struct phy_gmii_sel_priv *priv) return 0; } +static const struct regmap_config phy_gmii_sel_regmap_cfg = { + .reg_bits = 32, + .val_bits = 32, + .reg_stride = 4, +}; + static int phy_gmii_sel_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -468,7 +474,14 @@ static int phy_gmii_sel_probe(struct platform_device *pdev) priv->regmap = syscon_node_to_regmap(node->parent); if (IS_ERR(priv->regmap)) { - priv->regmap = device_node_to_regmap(node); + void __iomem *base; + + base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(base)) + return dev_err_probe(dev, PTR_ERR(base), + "failed to get base memory resource\n"); + + priv->regmap = regmap_init_mmio(dev, base, &phy_gmii_sel_regmap_cfg); if (IS_ERR(priv->regmap)) return dev_err_probe(dev, PTR_ERR(priv->regmap), "Failed to get syscon\n"); -- GitLab From 55f1a5f7c97c3c92ba469e16991a09274410ceb7 Mon Sep 17 00:00:00 2001 From: BH Hsieh Date: Wed, 22 Jan 2025 18:59:43 +0800 Subject: [PATCH 612/989] phy: tegra: xusb: reset VBUS & ID OVERRIDE Observed VBUS_OVERRIDE & ID_OVERRIDE might be programmed with unexpected value prior to XUSB PADCTL driver, this could also occur in virtualization scenario. For example, UEFI firmware programs ID_OVERRIDE=GROUNDED to set a type-c port to host mode and keeps the value to kernel. If the type-c port is connected a usb host, below errors can be observed right after usb host mode driver gets probed. The errors would keep until usb role class driver detects the type-c port as device mode and notifies usb device mode driver to set both ID_OVERRIDE and VBUS_OVERRIDE to correct value by XUSB PADCTL driver. [ 173.765814] usb usb3-port2: Cannot enable. Maybe the USB cable is bad? [ 173.765837] usb usb3-port2: config error Taking virtualization into account, asserting XUSB PADCTL reset would break XUSB functions used by other guest OS, hence only reset VBUS & ID OVERRIDE of the port in utmi_phy_init. 
Fixes: bbf711682cd5 ("phy: tegra: xusb: Add Tegra186 support") Cc: stable@vger.kernel.org Change-Id: Ic63058d4d49b4a1f8f9ab313196e20ad131cc591 Signed-off-by: BH Hsieh Signed-off-by: Henry Lin Link: https://lore.kernel.org/r/20250122105943.8057-1-henryl@nvidia.com Signed-off-by: Vinod Koul --- drivers/phy/tegra/xusb-tegra186.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/phy/tegra/xusb-tegra186.c b/drivers/phy/tegra/xusb-tegra186.c index 0f60d5d1c1678..fae6242aa730e 100644 --- a/drivers/phy/tegra/xusb-tegra186.c +++ b/drivers/phy/tegra/xusb-tegra186.c @@ -928,6 +928,7 @@ static int tegra186_utmi_phy_init(struct phy *phy) unsigned int index = lane->index; struct device *dev = padctl->dev; int err; + u32 reg; port = tegra_xusb_find_usb2_port(padctl, index); if (!port) { @@ -935,6 +936,16 @@ static int tegra186_utmi_phy_init(struct phy *phy) return -ENODEV; } + if (port->mode == USB_DR_MODE_OTG || + port->mode == USB_DR_MODE_PERIPHERAL) { + /* reset VBUS&ID OVERRIDE */ + reg = padctl_readl(padctl, USB2_VBUS_ID); + reg &= ~VBUS_OVERRIDE; + reg &= ~ID_OVERRIDE(~0); + reg |= ID_OVERRIDE_FLOATING; + padctl_writel(padctl, reg, USB2_VBUS_ID); + } + if (port->supply && port->mode == USB_DR_MODE_HOST) { err = regulator_enable(port->supply); if (err) { -- GitLab From 5755eb0a8168493fddf63d72e0133de54b3a17dd Mon Sep 17 00:00:00 2001 From: Andrew Jeffery Date: Mon, 3 Feb 2025 13:26:31 +1030 Subject: [PATCH 613/989] MAINTAINERS: Mark Andrew as M: for ASPEED MACHINE SUPPORT From discussion in [1] and in-person with Joel, flip my entry from R: to M:. Link: https://lore.kernel.org/all/CACPK8Xe8yZLXzEQPp=1D2f0TsKA7hBZG=pHHW6U51FMpp_BiRQ@mail.gmail.com/ [1] Cc: Joel Stanley Cc: Arnd Bergmann Cc: soc@lists.linux.dev Cc: linux-aspeed@lists.ozlabs.org Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Andrew Jeffery Acked-by: Joel Stanley Signed-off-by: Arnd Bergmann --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 25c86f47353de..e3920abb7794f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2284,7 +2284,7 @@ F: drivers/irqchip/irq-aspeed-i2c-ic.c ARM/ASPEED MACHINE SUPPORT M: Joel Stanley -R: Andrew Jeffery +M: Andrew Jeffery L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-aspeed@lists.ozlabs.org (moderated for non-subscribers) S: Supported -- GitLab From f0570fdc8f72fb1a22bfd2bbf030b34a06da21ff Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Sat, 8 Feb 2025 10:22:01 +0100 Subject: [PATCH 614/989] MAINTAINERS: arm: apple: Add Janne as maintainer Sven and I have agreed to share the maintainership for the ARM/APPLE platform after Marcan's step down. I'm handling the downstream Asahi Linux tree since April 2024 and worked on or wrote several drivers for the platform. 
Signed-off-by: Janne Grunau Acked-by: Sven Peter Acked-by: Hector Martin Acked-by: Neal Gompa Link: https://lore.kernel.org/r/20250208-maint-soc-apple-v1-1-a7f7337baec0@jannau.net Signed-off-by: Arnd Bergmann --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index e3920abb7794f..04c90ecf616a2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2210,6 +2210,7 @@ F: sound/soc/codecs/ssm3515.c ARM/APPLE MACHINE SUPPORT M: Sven Peter +M: Janne Grunau R: Alyssa Rosenzweig L: asahi@lists.linux.dev L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -- GitLab From be6686b823b30a69b1f71bde228ce042c78a1941 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 5 Feb 2025 15:41:43 +0100 Subject: [PATCH 615/989] firmware: imx: IMX_SCMI_MISC_DRV should depend on ARCH_MXC The i.MX System Controller Management Interface firmware is only present on Freescale i.MX SoCs. Hence add a dependency on ARCH_MXC, to prevent asking the user about this driver when configuring a kernel without Freescale i.MX platform support. Fixes: 514b2262ade48a05 ("firmware: arm_scmi: Fix i.MX build dependency") Signed-off-by: Geert Uytterhoeven Reviewed-by: Fabio Estevam Signed-off-by: Arnd Bergmann --- drivers/firmware/imx/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/firmware/imx/Kconfig b/drivers/firmware/imx/Kconfig index 907cd149c40a8..c964f4924359f 100644 --- a/drivers/firmware/imx/Kconfig +++ b/drivers/firmware/imx/Kconfig @@ -25,6 +25,7 @@ config IMX_SCU config IMX_SCMI_MISC_DRV tristate "IMX SCMI MISC Protocol driver" + depends on ARCH_MXC || COMPILE_TEST default y if ARCH_MXC help The System Controller Management Interface firmware (SCMI FW) is -- GitLab From dd0f05b98925111f4530d7dab774398cdb32e9e3 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 14 Feb 2025 09:31:29 +0100 Subject: [PATCH 616/989] platform: cznic: CZNIC_PLATFORMS should depend on ARCH_MVEBU CZ.NIC's Turris devices are based on Marvell EBU SoCs. Hence add a dependency on ARCH_MVEBU, to prevent asking the user about these drivers when configuring a kernel that cannot run on an affected CZ.NIC Turris system. Fixes: 992f1a3d4e88498d ("platform: cznic: Add preliminary support for Turris Omnia MCU") Signed-off-by: Geert Uytterhoeven Signed-off-by: Arnd Bergmann --- drivers/platform/cznic/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/cznic/Kconfig b/drivers/platform/cznic/Kconfig index 49c383eb67854..13e37b49d9d01 100644 --- a/drivers/platform/cznic/Kconfig +++ b/drivers/platform/cznic/Kconfig @@ -6,6 +6,7 @@ menuconfig CZNIC_PLATFORMS bool "Platform support for CZ.NIC's Turris hardware" + depends on ARCH_MVEBU || COMPILE_TEST help Say Y here to be able to choose driver support for CZ.NIC's Turris devices. This option alone does not add any kernel code. -- GitLab From 70b0d6b0a199c5a3ee6c72f5e61681ed6f759612 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Tue, 4 Feb 2025 13:04:18 +0530 Subject: [PATCH 617/989] tee: optee: Fix supplicant wait loop OP-TEE supplicant is a user-space daemon and it's possible for it be hung or crashed or killed in the middle of processing an OP-TEE RPC call. It becomes more complicated when there is incorrect shutdown ordering of the supplicant process vs the OP-TEE client application which can eventually lead to system hang-up waiting for the closure of the client application. 
Allow the client process waiting in kernel for supplicant response to be killed rather than indefinitely waiting in an unkillable state. Also, a normal uninterruptible wait should not have resulted in the hung-task watchdog getting triggered, but the endless loop would. This fixes issues observed during system reboot/shutdown when supplicant got hung for some reason or gets crashed/killed which lead to client getting hung in an unkillable state. It in turn lead to system being in hung up state requiring hard power off/on to recover. Fixes: 4fb0a5eb364d ("tee: add OP-TEE driver") Suggested-by: Arnd Bergmann Cc: stable@vger.kernel.org Signed-off-by: Sumit Garg Reviewed-by: Arnd Bergmann Reviewed-by: Jens Wiklander Signed-off-by: Arnd Bergmann --- drivers/tee/optee/supp.c | 35 ++++++++--------------------------- 1 file changed, 8 insertions(+), 27 deletions(-) diff --git a/drivers/tee/optee/supp.c b/drivers/tee/optee/supp.c index 322a543b8c278..d0f397c902420 100644 --- a/drivers/tee/optee/supp.c +++ b/drivers/tee/optee/supp.c @@ -80,7 +80,6 @@ u32 optee_supp_thrd_req(struct tee_context *ctx, u32 func, size_t num_params, struct optee *optee = tee_get_drvdata(ctx->teedev); struct optee_supp *supp = &optee->supp; struct optee_supp_req *req; - bool interruptable; u32 ret; /* @@ -111,36 +110,18 @@ u32 optee_supp_thrd_req(struct tee_context *ctx, u32 func, size_t num_params, /* * Wait for supplicant to process and return result, once we've * returned from wait_for_completion(&req->c) successfully we have - * exclusive access again. + * exclusive access again. Allow the wait to be killable such that + * the wait doesn't turn into an indefinite state if the supplicant + * gets hung for some reason. */ - while (wait_for_completion_interruptible(&req->c)) { + if (wait_for_completion_killable(&req->c)) { mutex_lock(&supp->mutex); - interruptable = !supp->ctx; - if (interruptable) { - /* - * There's no supplicant available and since the - * supp->mutex currently is held none can - * become available until the mutex released - * again. - * - * Interrupting an RPC to supplicant is only - * allowed as a way of slightly improving the user - * experience in case the supplicant hasn't been - * started yet. During normal operation the supplicant - * will serve all requests in a timely manner and - * interrupting then wouldn't make sense. - */ - if (req->in_queue) { - list_del(&req->link); - req->in_queue = false; - } + if (req->in_queue) { + list_del(&req->link); + req->in_queue = false; } mutex_unlock(&supp->mutex); - - if (interruptable) { - req->ret = TEEC_ERROR_COMMUNICATION; - break; - } + req->ret = TEEC_ERROR_COMMUNICATION; } ret = req->ret; -- GitLab From b3fefbb30a1691533cb905006b69b2a474660744 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 24 Jan 2025 19:15:23 +0100 Subject: [PATCH 618/989] nouveau/svm: fix missing folio unlock + put after make_device_exclusive_range() In case we have to retry the loop, we are missing to unlock+put the folio. In that case, we will keep failing make_device_exclusive_range() because we cannot grab the folio lock, and even return from the function with the folio locked and referenced, effectively never succeeding the make_device_exclusive_range(). While at it, convert the other unlock+put to use a folio as well. This was found by code inspection. 
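A minimal sketch of the locking rule the fix restores is shown below; the names get_exclusive_page() and need_retry() are placeholders for the actual make_device_exclusive_range() call and mmu_interval_read_retry() check, not real helpers:

    retry:
        page = get_exclusive_page();        /* hypothetical: returns the page locked + referenced */
        folio = page_folio(page);
        if (need_retry()) {
            /*
             * Drop the folio lock and reference before looping, otherwise the
             * next attempt can never take the folio lock again and the loop
             * keeps failing.
             */
            folio_unlock(folio);
            folio_put(folio);
            goto retry;
        }
        /* use the page, then release it exactly once */
        folio_unlock(folio);
        folio_put(folio);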
Fixes: 8f187163eb89 ("nouveau/svm: implement atomic SVM access") Signed-off-by: David Hildenbrand Reviewed-by: Alistair Popple Tested-by: Alistair Popple Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20250124181524.3584236-2-david@redhat.com --- drivers/gpu/drm/nouveau/nouveau_svm.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index b4da82ddbb6b2..8ea98f06d39af 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -590,6 +590,7 @@ static int nouveau_atomic_range_fault(struct nouveau_svmm *svmm, unsigned long timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); struct mm_struct *mm = svmm->notifier.mm; + struct folio *folio; struct page *page; unsigned long start = args->p.addr; unsigned long notifier_seq; @@ -616,12 +617,16 @@ static int nouveau_atomic_range_fault(struct nouveau_svmm *svmm, ret = -EINVAL; goto out; } + folio = page_folio(page); mutex_lock(&svmm->mutex); if (!mmu_interval_read_retry(¬ifier->notifier, notifier_seq)) break; mutex_unlock(&svmm->mutex); + + folio_unlock(folio); + folio_put(folio); } /* Map the page on the GPU. */ @@ -637,8 +642,8 @@ static int nouveau_atomic_range_fault(struct nouveau_svmm *svmm, ret = nvif_object_ioctl(&svmm->vmm->vmm.object, args, size, NULL); mutex_unlock(&svmm->mutex); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); out: mmu_interval_notifier_remove(¬ifier->notifier); -- GitLab From 80e648042e512d5a767da251d44132553fe04ae0 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 14 Feb 2025 02:39:50 +0100 Subject: [PATCH 619/989] partitions: mac: fix handling of bogus partition table Fix several issues in partition probing: - The bailout for a bad partoffset must use put_dev_sector(), since the preceding read_part_sector() succeeded. - If the partition table claims a silly sector size like 0xfff bytes (which results in partition table entries straddling sector boundaries), bail out instead of accessing out-of-bounds memory. - We must not assume that the partition table contains proper NUL termination - use strnlen() and strncmp() instead of strlen() and strcmp(). Cc: stable@vger.kernel.org Signed-off-by: Jann Horn Link: https://lore.kernel.org/r/20250214-partition-mac-v1-1-c1c626dffbd5@google.com Signed-off-by: Jens Axboe --- block/partitions/mac.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/block/partitions/mac.c b/block/partitions/mac.c index c80183156d680..b02530d986297 100644 --- a/block/partitions/mac.c +++ b/block/partitions/mac.c @@ -53,13 +53,25 @@ int mac_partition(struct parsed_partitions *state) } secsize = be16_to_cpu(md->block_size); put_dev_sector(sect); + + /* + * If the "block size" is not a power of 2, things get weird - we might + * end up with a partition straddling a sector boundary, so we wouldn't + * be able to read a partition entry with read_part_sector(). + * Real block sizes are probably (?) powers of two, so just require + * that. 
+ */ + if (!is_power_of_2(secsize)) + return -1; datasize = round_down(secsize, 512); data = read_part_sector(state, datasize / 512, §); if (!data) return -1; partoffset = secsize % 512; - if (partoffset + sizeof(*part) > datasize) + if (partoffset + sizeof(*part) > datasize) { + put_dev_sector(sect); return -1; + } part = (struct mac_partition *) (data + partoffset); if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) { put_dev_sector(sect); @@ -112,8 +124,8 @@ int mac_partition(struct parsed_partitions *state) int i, l; goodness++; - l = strlen(part->name); - if (strcmp(part->name, "/") == 0) + l = strnlen(part->name, sizeof(part->name)); + if (strncmp(part->name, "/", sizeof(part->name)) == 0) goodness++; for (i = 0; i <= l - 4; ++i) { if (strncasecmp(part->name + i, "root", -- GitLab From 8221fd1a73044adef712a5c9346a23c2447f629c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 14 Feb 2025 16:43:49 +0000 Subject: [PATCH 620/989] workqueue: Log additional details when rejecting work Syzbot regularly runs into the following warning on arm64: | WARNING: CPU: 1 PID: 6023 at kernel/workqueue.c:2257 current_wq_worker kernel/workqueue_internal.h:69 [inline] | WARNING: CPU: 1 PID: 6023 at kernel/workqueue.c:2257 is_chained_work kernel/workqueue.c:2199 [inline] | WARNING: CPU: 1 PID: 6023 at kernel/workqueue.c:2257 __queue_work+0xe50/0x1308 kernel/workqueue.c:2256 | Modules linked in: | CPU: 1 UID: 0 PID: 6023 Comm: klogd Not tainted 6.13.0-rc2-syzkaller-g2e7aff49b5da #0 | Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 | pstate: 404000c5 (nZcv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--) | pc : __queue_work+0xe50/0x1308 kernel/workqueue_internal.h:69 | lr : current_wq_worker kernel/workqueue_internal.h:69 [inline] | lr : is_chained_work kernel/workqueue.c:2199 [inline] | lr : __queue_work+0xe50/0x1308 kernel/workqueue.c:2256 [...] | __queue_work+0xe50/0x1308 kernel/workqueue.c:2256 (L) | delayed_work_timer_fn+0x74/0x90 kernel/workqueue.c:2485 | call_timer_fn+0x1b4/0x8b8 kernel/time/timer.c:1793 | expire_timers kernel/time/timer.c:1839 [inline] | __run_timers kernel/time/timer.c:2418 [inline] | __run_timer_base+0x59c/0x7b4 kernel/time/timer.c:2430 | run_timer_base kernel/time/timer.c:2439 [inline] | run_timer_softirq+0xcc/0x194 kernel/time/timer.c:2449 The warning is probably because we are trying to queue work into a destroyed workqueue, but the softirq context makes it hard to pinpoint the problematic caller. Extend the warning diagnostics to print both the function we are trying to queue as well as the name of the workqueue. Cc: Tejun Heo Cc: Catalin Marinas Cc: Marc Zyngier Cc: Lai Jiangshan Link: https://syzkaller.appspot.com/bug?extid=e13e654d315d4da1277c Signed-off-by: Will Deacon Signed-off-by: Tejun Heo --- kernel/workqueue.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ccad33001c58c..902df3253598c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2254,8 +2254,10 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, * queues a new work item to a wq after destroy_workqueue(wq). 
*/ if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) && - WARN_ON_ONCE(!is_chained_work(wq)))) + WARN_ONCE(!is_chained_work(wq), "workqueue: cannot queue %ps on wq %s\n", + work->func, wq->name))) { return; + } rcu_read_lock(); retry: /* pwq which will be used unless @work is executing elsewhere */ -- GitLab From 9ba0e1755a40f9920ad0f4168031291b3eb58d7b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 13 Feb 2025 13:19:57 -0500 Subject: [PATCH 621/989] ring-buffer: Unlock resize on mmap error Memory mapping the tracing ring buffer will disable resizing the buffer. But if there's an error in the memory mapping like an invalid parameter, the function exits out without re-enabling the resizing of the ring buffer, preventing the ring buffer from being resized after that. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Vincent Donnefort Link: https://lore.kernel.org/20250213131957.530ec3c5@gandalf.local.home Fixes: 117c39200d9d7 ("ring-buffer: Introducing ring-buffer mapping functions") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b8e0ae15ca5b6..07b421115692c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -7126,6 +7126,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, kfree(cpu_buffer->subbuf_ids); cpu_buffer->subbuf_ids = NULL; rb_free_meta_page(cpu_buffer); + atomic_dec(&cpu_buffer->resize_disabled); } unlock: -- GitLab From 60b8f711143de7cd9c0f55be0fe7eb94b19eb5c7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 13 Feb 2025 13:41:32 -0500 Subject: [PATCH 622/989] tracing: Have the error of __tracing_resize_ring_buffer() passed to user Currently if __tracing_resize_ring_buffer() returns an error, the tracing_resize_ringbuffer() returns -ENOMEM. But it may not be a memory issue that caused the function to fail. If the ring buffer is memory mapped, then the resizing of the ring buffer will be disabled. But if the user tries to resize the buffer, it will get an -ENOMEM returned, which is confusing because there is plenty of memory. The actual error returned was -EBUSY, which would make much more sense to the user. 
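The change amounts to propagating the callee's error code instead of squashing it; a generic sketch of the two patterns (resize_buffer() is an illustrative name, not the exact tracing code):

    /* before: every failure is reported as -ENOMEM, hiding the real cause */
    ret = resize_buffer(tr, size, cpu_id);
    if (ret < 0)
        ret = -ENOMEM;
    return ret;

    /* after: the callee's error (e.g. -EBUSY while the buffer is mmapped)
     * reaches user space unchanged */
    return resize_buffer(tr, size, cpu_id);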
Cc: stable@vger.kernel.org Cc: Mathieu Desnoyers Cc: Vincent Donnefort Link: https://lore.kernel.org/20250213134132.7e4505d7@gandalf.local.home Fixes: 117c39200d9d7 ("ring-buffer: Introducing ring-buffer mapping functions") Signed-off-by: Steven Rostedt (Google) Reviewed-by: Masami Hiramatsu (Google) --- kernel/trace/trace.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1496a5ac33ae1..25ff37aab00f1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5977,8 +5977,6 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, ssize_t tracing_resize_ring_buffer(struct trace_array *tr, unsigned long size, int cpu_id) { - int ret; - guard(mutex)(&trace_types_lock); if (cpu_id != RING_BUFFER_ALL_CPUS) { @@ -5987,11 +5985,7 @@ ssize_t tracing_resize_ring_buffer(struct trace_array *tr, return -EINVAL; } - ret = __tracing_resize_ring_buffer(tr, size, cpu_id); - if (ret < 0) - ret = -ENOMEM; - - return ret; + return __tracing_resize_ring_buffer(tr, size, cpu_id); } static void update_last_data(struct trace_array *tr) -- GitLab From f5b95f1fa2ef3a03f49eeec658ba97e721412b32 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Feb 2025 10:28:20 -0500 Subject: [PATCH 623/989] ring-buffer: Validate the persistent meta data subbuf array The meta data for a mapped ring buffer contains an array of indexes of all the subbuffers. The first entry is the reader page, and the rest of the entries lay out the order of the subbuffers in how the ring buffer link list is to be created. The validator currently makes sure that all the entries are within the range of 0 and nr_subbufs. But it does not check if there are any duplicates. While working on the ring buffer, I corrupted this array, where I added duplicates. The validator did not catch it and created the ring buffer link list on top of it. Luckily, the corruption was only that the reader page was also in the writer path and only presented corrupted data but did not crash the kernel. But if there were duplicates in the writer side, then it could corrupt the ring buffer link list and cause a crash. Create a bitmask array with the size of the number of subbuffers. Then clear it. When walking through the subbuf array checking to see if the entries are within the range, test if its bit is already set in the subbuf_mask. If it is, then there is duplicates and fail the validation. If not, set the corresponding bit and continue. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Vincent Donnefort Link: https://lore.kernel.org/20250214102820.7509ddea@gandalf.local.home Fixes: c76883f18e59b ("ring-buffer: Add test if range of boot buffer is valid") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 07b421115692c..0419d41a20604 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1672,7 +1672,8 @@ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx) * must be the same. 
*/ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, - struct trace_buffer *buffer, int nr_pages) + struct trace_buffer *buffer, int nr_pages, + unsigned long *subbuf_mask) { int subbuf_size = PAGE_SIZE; struct buffer_data_page *subbuf; @@ -1680,6 +1681,9 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, unsigned long buffers_end; int i; + if (!subbuf_mask) + return false; + /* Check the meta magic and meta struct size */ if (meta->magic != RING_BUFFER_META_MAGIC || meta->struct_size != sizeof(*meta)) { @@ -1712,6 +1716,8 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, subbuf = rb_subbufs_from_meta(meta); + bitmap_clear(subbuf_mask, 0, meta->nr_subbufs); + /* Is the meta buffers and the subbufs themselves have correct data? */ for (i = 0; i < meta->nr_subbufs; i++) { if (meta->buffers[i] < 0 || @@ -1725,6 +1731,12 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, return false; } + if (test_bit(meta->buffers[i], subbuf_mask)) { + pr_info("Ring buffer boot meta [%d] array has duplicates\n", cpu); + return false; + } + + set_bit(meta->buffers[i], subbuf_mask); subbuf = (void *)subbuf + subbuf_size; } @@ -1889,17 +1901,22 @@ static void rb_meta_init_text_addr(struct ring_buffer_meta *meta) static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) { struct ring_buffer_meta *meta; + unsigned long *subbuf_mask; unsigned long delta; void *subbuf; int cpu; int i; + /* Create a mask to test the subbuf array */ + subbuf_mask = bitmap_alloc(nr_pages + 1, GFP_KERNEL); + /* If subbuf_mask fails to allocate, then rb_meta_valid() will return false */ + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { void *next_meta; meta = rb_range_meta(buffer, nr_pages, cpu); - if (rb_meta_valid(meta, cpu, buffer, nr_pages)) { + if (rb_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) { /* Make the mappings match the current address */ subbuf = rb_subbufs_from_meta(meta); delta = (unsigned long)subbuf - meta->first_buffer; @@ -1943,6 +1960,7 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) subbuf += meta->subbuf_size; } } + bitmap_free(subbuf_mask); } static void *rbm_start(struct seq_file *m, loff_t *pos) -- GitLab From 41758630dd7ea9dce3eb152168c979534b415ab0 Mon Sep 17 00:00:00 2001 From: Niravkumar L Rabara Date: Wed, 12 Feb 2025 19:25:34 +0800 Subject: [PATCH 624/989] dt-bindings: mtd: cadence: document required clock-names The clock-names property is required because the driver requests the clock by name and not the index. Update the example to use &clk instead of &nf_clk for the clocks property to avoid confusion with the clock-names property "nf_clk". 
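The requirement follows from how the driver looks the clock up; a minimal consumer-side sketch, assuming the usual devm_clk_get() by-name pattern rather than the driver's exact code:

    /* A by-name lookup only works if the DT node carries a matching
     * clock-names entry; an index-based lookup would not need it. */
    nfc->clk = devm_clk_get(&pdev->dev, "nf_clk");
    if (IS_ERR(nfc->clk))
        return PTR_ERR(nfc->clk);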
Fixes: 1f05f823a16c (dt-bindings: mtd: cadence: convert cadence-nand-controller.txt to yaml) Signed-off-by: Niravkumar L Rabara Reviewed-by: Krzysztof Kozlowski Signed-off-by: Miquel Raynal --- Documentation/devicetree/bindings/mtd/cdns,hp-nfc.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/mtd/cdns,hp-nfc.yaml b/Documentation/devicetree/bindings/mtd/cdns,hp-nfc.yaml index 0bed37a994c38..e1f4d7c35a885 100644 --- a/Documentation/devicetree/bindings/mtd/cdns,hp-nfc.yaml +++ b/Documentation/devicetree/bindings/mtd/cdns,hp-nfc.yaml @@ -33,6 +33,10 @@ properties: clocks: maxItems: 1 + clock-names: + items: + - const: nf_clk + dmas: maxItems: 1 @@ -51,6 +55,7 @@ required: - reg-names - interrupts - clocks + - clock-names unevaluatedProperties: false @@ -66,7 +71,8 @@ examples: #address-cells = <1>; #size-cells = <0>; interrupts = ; - clocks = <&nf_clk>; + clocks = <&clk>; + clock-names = "nf_clk"; cdns,board-delay-ps = <4830>; nand@0 { -- GitLab From 77b823fa619f97d16409ca37ad4f7936e28c5f83 Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Tue, 4 Feb 2025 23:35:22 +0100 Subject: [PATCH 625/989] alpha: replace hardcoded stack offsets with autogenerated ones This allows the assembly in entry.S to automatically keep in sync with changes in the stack layout (struct pt_regs and struct switch_stack). Cc: stable@vger.kernel.org Tested-by: Maciej W. Rozycki Tested-by: Matt Turner Reviewed-by: Maciej W. Rozycki Signed-off-by: Ivan Kokshaysky Signed-off-by: Matt Turner --- arch/alpha/kernel/asm-offsets.c | 4 ++++ arch/alpha/kernel/entry.S | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/alpha/kernel/asm-offsets.c b/arch/alpha/kernel/asm-offsets.c index 4cfeae42c79ac..e9dad60b147f3 100644 --- a/arch/alpha/kernel/asm-offsets.c +++ b/arch/alpha/kernel/asm-offsets.c @@ -19,9 +19,13 @@ static void __used foo(void) DEFINE(TI_STATUS, offsetof(struct thread_info, status)); BLANK(); + DEFINE(SP_OFF, offsetof(struct pt_regs, ps)); DEFINE(SIZEOF_PT_REGS, sizeof(struct pt_regs)); BLANK(); + DEFINE(SWITCH_STACK_SIZE, sizeof(struct switch_stack)); + BLANK(); + DEFINE(HAE_CACHE, offsetof(struct alpha_machine_vector, hae_cache)); DEFINE(HAE_REG, offsetof(struct alpha_machine_vector, hae_register)); } diff --git a/arch/alpha/kernel/entry.S b/arch/alpha/kernel/entry.S index dd26062d75b3c..6fb38365539d4 100644 --- a/arch/alpha/kernel/entry.S +++ b/arch/alpha/kernel/entry.S @@ -15,10 +15,6 @@ .set noat .cfi_sections .debug_frame -/* Stack offsets. */ -#define SP_OFF 184 -#define SWITCH_STACK_SIZE 64 - .macro CFI_START_OSF_FRAME func .align 4 .globl \func -- GitLab From 0a0f7362b0367634a2d5cb7c96226afc116f19c9 Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Tue, 4 Feb 2025 23:35:23 +0100 Subject: [PATCH 626/989] alpha: make stack 16-byte aligned (most cases) The problem is that GCC expects 16-byte alignment of the incoming stack since early 2004, as Maciej found out [1]: Having actually dug speculatively I can see that the psABI was changed in GCC 3.5 with commit e5e10fb4a350 ("re PR target/14539 (128-bit long double improperly aligned)") back in Mar 2004, when the stack pointer alignment was increased from 8 bytes to 16 bytes, and arch/alpha/kernel/entry.S has various suspicious stack pointer adjustments, starting with SP_OFF which is not a whole multiple of 16. 
Also, as Magnus noted, "ALPHA Calling Standard" [2] required the same: D.3.1 Stack Alignment This standard requires that stacks be octaword aligned at the time a new procedure is invoked. However: - the "normal" kernel stack is always misaligned by 8 bytes, thanks to the odd number of 64-bit words in 'struct pt_regs', which is the very first thing pushed onto the kernel thread stack; - syscall, fault, interrupt etc. handlers may, or may not, receive aligned stack depending on numerous factors. Somehow we got away with it until recently, when we ended up with a stack corruption in kernel/smp.c:smp_call_function_single() due to its use of 32-byte aligned local data and the compiler doing clever things allocating it on the stack. This adds padding between the PAL-saved and kernel-saved registers so that 'struct pt_regs' have an even number of 64-bit words. This makes the stack properly aligned for most of the kernel code, except two handlers which need special threatment. Note: struct pt_regs doesn't belong in uapi/asm; this should be fixed, but let's put this off until later. Link: https://lore.kernel.org/rcu/alpine.DEB.2.21.2501130248010.18889@angie.orcam.me.uk/ [1] Link: https://bitsavers.org/pdf/dec/alpha/Alpha_Calling_Standard_Rev_2.0_19900427.pdf [2] Cc: stable@vger.kernel.org Tested-by: Maciej W. Rozycki Tested-by: Magnus Lindholm Tested-by: Matt Turner Reviewed-by: Maciej W. Rozycki Signed-off-by: Ivan Kokshaysky Signed-off-by: Matt Turner --- arch/alpha/include/uapi/asm/ptrace.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/alpha/include/uapi/asm/ptrace.h b/arch/alpha/include/uapi/asm/ptrace.h index 5ca45934fcbb8..72ed913a910f2 100644 --- a/arch/alpha/include/uapi/asm/ptrace.h +++ b/arch/alpha/include/uapi/asm/ptrace.h @@ -42,6 +42,8 @@ struct pt_regs { unsigned long trap_a0; unsigned long trap_a1; unsigned long trap_a2; +/* This makes the stack 16-byte aligned as GCC expects */ + unsigned long __pad0; /* These are saved by PAL-code: */ unsigned long ps; unsigned long pc; -- GitLab From 3b35a171060f846b08b48646b38c30b5d57d17ff Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Tue, 4 Feb 2025 23:35:24 +0100 Subject: [PATCH 627/989] alpha: align stack for page fault and user unaligned trap handlers do_page_fault() and do_entUna() are special because they use non-standard stack frame layout. Fix them manually. Cc: stable@vger.kernel.org Tested-by: Maciej W. Rozycki Tested-by: Magnus Lindholm Tested-by: Matt Turner Reviewed-by: Maciej W. Rozycki Suggested-by: Maciej W. Rozycki Signed-off-by: Ivan Kokshaysky Signed-off-by: Matt Turner --- arch/alpha/kernel/entry.S | 20 ++++++++++---------- arch/alpha/kernel/traps.c | 2 +- arch/alpha/mm/fault.c | 4 ++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/alpha/kernel/entry.S b/arch/alpha/kernel/entry.S index 6fb38365539d4..f4d41b4538c2e 100644 --- a/arch/alpha/kernel/entry.S +++ b/arch/alpha/kernel/entry.S @@ -194,8 +194,8 @@ CFI_END_OSF_FRAME entArith CFI_START_OSF_FRAME entMM SAVE_ALL /* save $9 - $15 so the inline exception code can manipulate them. 
*/ - subq $sp, 56, $sp - .cfi_adjust_cfa_offset 56 + subq $sp, 64, $sp + .cfi_adjust_cfa_offset 64 stq $9, 0($sp) stq $10, 8($sp) stq $11, 16($sp) @@ -210,7 +210,7 @@ CFI_START_OSF_FRAME entMM .cfi_rel_offset $13, 32 .cfi_rel_offset $14, 40 .cfi_rel_offset $15, 48 - addq $sp, 56, $19 + addq $sp, 64, $19 /* handle the fault */ lda $8, 0x3fff bic $sp, $8, $8 @@ -223,7 +223,7 @@ CFI_START_OSF_FRAME entMM ldq $13, 32($sp) ldq $14, 40($sp) ldq $15, 48($sp) - addq $sp, 56, $sp + addq $sp, 64, $sp .cfi_restore $9 .cfi_restore $10 .cfi_restore $11 @@ -231,7 +231,7 @@ CFI_START_OSF_FRAME entMM .cfi_restore $13 .cfi_restore $14 .cfi_restore $15 - .cfi_adjust_cfa_offset -56 + .cfi_adjust_cfa_offset -64 /* finish up the syscall as normal. */ br ret_from_sys_call CFI_END_OSF_FRAME entMM @@ -378,8 +378,8 @@ entUnaUser: .cfi_restore $0 .cfi_adjust_cfa_offset -256 SAVE_ALL /* setup normal kernel stack */ - lda $sp, -56($sp) - .cfi_adjust_cfa_offset 56 + lda $sp, -64($sp) + .cfi_adjust_cfa_offset 64 stq $9, 0($sp) stq $10, 8($sp) stq $11, 16($sp) @@ -395,7 +395,7 @@ entUnaUser: .cfi_rel_offset $14, 40 .cfi_rel_offset $15, 48 lda $8, 0x3fff - addq $sp, 56, $19 + addq $sp, 64, $19 bic $sp, $8, $8 jsr $26, do_entUnaUser ldq $9, 0($sp) @@ -405,7 +405,7 @@ entUnaUser: ldq $13, 32($sp) ldq $14, 40($sp) ldq $15, 48($sp) - lda $sp, 56($sp) + lda $sp, 64($sp) .cfi_restore $9 .cfi_restore $10 .cfi_restore $11 @@ -413,7 +413,7 @@ entUnaUser: .cfi_restore $13 .cfi_restore $14 .cfi_restore $15 - .cfi_adjust_cfa_offset -56 + .cfi_adjust_cfa_offset -64 br ret_from_sys_call CFI_END_OSF_FRAME entUna diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index a9a38c80c4a7a..7004397937cfd 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -649,7 +649,7 @@ s_reg_to_mem (unsigned long s_reg) static int unauser_reg_offsets[32] = { R(r0), R(r1), R(r2), R(r3), R(r4), R(r5), R(r6), R(r7), R(r8), /* r9 ... r15 are stored in front of regs. */ - -56, -48, -40, -32, -24, -16, -8, + -64, -56, -48, -40, -32, -24, -16, /* padding at -8 */ R(r16), R(r17), R(r18), R(r19), R(r20), R(r21), R(r22), R(r23), R(r24), R(r25), R(r26), R(r27), R(r28), R(gp), diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 8c9850437e674..a9816bbc9f34d 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -78,8 +78,8 @@ __load_new_mm_context(struct mm_struct *next_mm) /* Macro for exception fixup code to access integer registers. */ #define dpf_reg(r) \ - (((unsigned long *)regs)[(r) <= 8 ? (r) : (r) <= 15 ? (r)-16 : \ - (r) <= 18 ? (r)+10 : (r)-10]) + (((unsigned long *)regs)[(r) <= 8 ? (r) : (r) <= 15 ? (r)-17 : \ + (r) <= 18 ? (r)+11 : (r)-10]) asmlinkage void do_page_fault(unsigned long address, unsigned long mmcsr, -- GitLab From 757f051a506198186d796dff4ba696adb7bda54c Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 7 Jan 2025 11:43:42 +0100 Subject: [PATCH 628/989] alpha: Replace one-element array with flexible array member Replace the deprecated one-element array with a modern flexible array member in the struct crb_struct. 
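As a generic illustration of the pattern (the structure and helper below are made up, not the hwrpb code), a trailing flexible array member is sized at allocation time with struct_size(), which removes the old one-element-array accounting:

  #include <linux/overflow.h>
  #include <linux/slab.h>

  struct example_table {
  	unsigned long nr_entries;
  	unsigned long entries[];	/* flexible array member */
  };

  static struct example_table *example_table_alloc(unsigned long n)
  {
  	/* struct_size() sizes the header plus n trailing elements. */
  	struct example_table *t = kzalloc(struct_size(t, entries, n), GFP_KERNEL);

  	if (t)
  		t->nr_entries = n;
  	return t;
  }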
Reviewed-by: Kees Cook Signed-off-by: Thorsten Blum Signed-off-by: Matt Turner --- arch/alpha/include/asm/hwrpb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/alpha/include/asm/hwrpb.h b/arch/alpha/include/asm/hwrpb.h index fc76f36265ad1..db831cf8de108 100644 --- a/arch/alpha/include/asm/hwrpb.h +++ b/arch/alpha/include/asm/hwrpb.h @@ -135,7 +135,7 @@ struct crb_struct { /* virtual->physical map */ unsigned long map_entries; unsigned long map_pages; - struct vf_map_struct map[1]; + struct vf_map_struct map[]; }; struct memclust_struct { -- GitLab From 1523226edda566057bdd3264ceb56631ddf5f6f7 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 12 Feb 2025 12:14:47 +0100 Subject: [PATCH 629/989] alpha: Use str_yes_no() helper in pci_dac_dma_supported() Remove hard-coded strings by using the str_yes_no() helper function. Reviewed-by: Geert Uytterhoeven Signed-off-by: Thorsten Blum Signed-off-by: Matt Turner --- arch/alpha/kernel/pci_iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index 681f56089d9ce..dc91de50f906d 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -212,7 +213,7 @@ static int pci_dac_dma_supported(struct pci_dev *dev, u64 mask) /* If both conditions above are met, we are fine. */ DBGA("pci_dac_dma_supported %s from %ps\n", - ok ? "yes" : "no", __builtin_return_address(0)); + str_yes_no(ok), __builtin_return_address(0)); return ok; } -- GitLab From c158647c107358bf1be579f98e4bb705c1953292 Mon Sep 17 00:00:00 2001 From: Komal Bajaj Date: Tue, 19 Nov 2024 12:16:08 +0530 Subject: [PATCH 630/989] EDAC/qcom: Correct interrupt enable register configuration The previous implementation incorrectly configured the cmn_interrupt_2_enable register for interrupt handling. Using cmn_interrupt_2_enable to configure Tag, Data RAM ECC interrupts would lead to issues like double handling of the interrupts (EL1 and EL3) as cmn_interrupt_2_enable is meant to be configured for interrupts which needs to be handled by EL3. EL1 LLCC EDAC driver needs to use cmn_interrupt_0_enable register to configure Tag, Data RAM ECC interrupts instead of cmn_interrupt_2_enable. 
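As a hedged, generic sketch of the mechanism (the register offset and bit below are made up, not the real LLCC register layout), enabling an interrupt source through regmap is a masked read-modify-write, so only the intended enable bit is touched:

  #include <linux/bits.h>
  #include <linux/regmap.h>

  #define EXAMPLE_IRQ_ENABLE_OFFSET	0x1c	/* illustrative offset */
  #define EXAMPLE_TRP_IRQ_ENABLE	BIT(0)	/* illustrative bit */

  static int example_enable_trp_irq(struct regmap *map)
  {
  	/* Set the enable bit; all other bits in the register are preserved. */
  	return regmap_update_bits(map, EXAMPLE_IRQ_ENABLE_OFFSET,
  				  EXAMPLE_TRP_IRQ_ENABLE, EXAMPLE_TRP_IRQ_ENABLE);
  }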
Fixes: 27450653f1db ("drivers: edac: Add EDAC driver support for QCOM SoCs") Signed-off-by: Komal Bajaj Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Manivannan Sadhasivam Cc: Link: https://lore.kernel.org/r/20241119064608.12326-1-quic_kbajaj@quicinc.com --- drivers/edac/qcom_edac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/edac/qcom_edac.c b/drivers/edac/qcom_edac.c index 04c42c83a2bad..f3da9385ca0d8 100644 --- a/drivers/edac/qcom_edac.c +++ b/drivers/edac/qcom_edac.c @@ -95,7 +95,7 @@ static int qcom_llcc_core_setup(struct llcc_drv_data *drv, struct regmap *llcc_b * Configure interrupt enable registers such that Tag, Data RAM related * interrupts are propagated to interrupt controller for servicing */ - ret = regmap_update_bits(llcc_bcast_regmap, drv->edac_reg_offset->cmn_interrupt_2_enable, + ret = regmap_update_bits(llcc_bcast_regmap, drv->edac_reg_offset->cmn_interrupt_0_enable, TRP0_INTERRUPT_ENABLE, TRP0_INTERRUPT_ENABLE); if (ret) @@ -113,7 +113,7 @@ static int qcom_llcc_core_setup(struct llcc_drv_data *drv, struct regmap *llcc_b if (ret) return ret; - ret = regmap_update_bits(llcc_bcast_regmap, drv->edac_reg_offset->cmn_interrupt_2_enable, + ret = regmap_update_bits(llcc_bcast_regmap, drv->edac_reg_offset->cmn_interrupt_0_enable, DRP0_INTERRUPT_ENABLE, DRP0_INTERRUPT_ENABLE); if (ret) -- GitLab From fb8179ce2996bffaa36a04e2b6262843b01b7565 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Mon, 4 Nov 2024 13:03:13 -0600 Subject: [PATCH 631/989] riscv: cacheinfo: Use of_property_present() for non-boolean properties MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The use of of_property_read_bool() for non-boolean properties is deprecated in favor of of_property_present() when testing for property presence. 
Signed-off-by: Rob Herring (Arm) Reviewed-by: Clément Léger Cc: stable@vger.kernel.org Fixes: 76d2a0493a17 ("RISC-V: Init and Halt Code") Link: https://lore.kernel.org/r/20241104190314.270095-1-robh@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cacheinfo.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/riscv/kernel/cacheinfo.c b/arch/riscv/kernel/cacheinfo.c index 2d40736fc37ce..26b085dbdd073 100644 --- a/arch/riscv/kernel/cacheinfo.c +++ b/arch/riscv/kernel/cacheinfo.c @@ -108,11 +108,11 @@ int populate_cache_leaves(unsigned int cpu) if (!np) return -ENOENT; - if (of_property_read_bool(np, "cache-size")) + if (of_property_present(np, "cache-size")) ci_leaf_init(this_leaf++, CACHE_TYPE_UNIFIED, level); - if (of_property_read_bool(np, "i-cache-size")) + if (of_property_present(np, "i-cache-size")) ci_leaf_init(this_leaf++, CACHE_TYPE_INST, level); - if (of_property_read_bool(np, "d-cache-size")) + if (of_property_present(np, "d-cache-size")) ci_leaf_init(this_leaf++, CACHE_TYPE_DATA, level); prev = np; @@ -125,11 +125,11 @@ int populate_cache_leaves(unsigned int cpu) break; if (level <= levels) break; - if (of_property_read_bool(np, "cache-size")) + if (of_property_present(np, "cache-size")) ci_leaf_init(this_leaf++, CACHE_TYPE_UNIFIED, level); - if (of_property_read_bool(np, "i-cache-size")) + if (of_property_present(np, "i-cache-size")) ci_leaf_init(this_leaf++, CACHE_TYPE_INST, level); - if (of_property_read_bool(np, "d-cache-size")) + if (of_property_present(np, "d-cache-size")) ci_leaf_init(this_leaf++, CACHE_TYPE_DATA, level); levels = level; } -- GitLab From c6ec1e1b078d8e2ecd075e46db6197a14930a3fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= Date: Mon, 10 Feb 2025 16:56:14 +0100 Subject: [PATCH 632/989] riscv: cpufeature: use bitmap_equal() instead of memcmp() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Comparison of bitmaps should be done using bitmap_equal(), not memcmp(), use the former one to compare isa bitmaps. Signed-off-by: Clément Léger Fixes: 625034abd52a8c ("riscv: add ISA extensions validation callback") Reviewed-by: Alexandre Ghiti Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250210155615.1545738-1-cleger@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cpufeature.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index c6ba750536c32..40ac72e407b68 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -479,7 +479,7 @@ static void __init riscv_resolve_isa(unsigned long *source_isa, if (bit < RISCV_ISA_EXT_BASE) *this_hwcap |= isa2hwcap[bit]; } - } while (loop && memcmp(prev_resolved_isa, resolved_isa, sizeof(prev_resolved_isa))); + } while (loop && !bitmap_equal(prev_resolved_isa, resolved_isa, RISCV_ISA_EXT_MAX)); } static void __init match_isa_ext(const char *name, const char *name_end, unsigned long *bitmap) -- GitLab From 1898300abf3508bca152e65b36cce5bf93d7e63e Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Thu, 30 Jan 2025 10:25:38 +0100 Subject: [PATCH 633/989] riscv/atomic: Do proper sign extension also for unsigned in arch_cmpxchg Sign extend also an unsigned compare value to match what lr.w is doing. Otherwise try_cmpxchg may spuriously return true when used on a u32 value that has the sign bit set, as it happens often in inode_set_ctime_current. Do this in three conversion steps. 
The first conversion to long is needed to avoid a -Wpointer-to-int-cast warning when arch_cmpxchg is used with a pointer type. Then convert to int and back to long to always sign extend the 32-bit value to 64-bit. Fixes: 6c58f25e6938 ("riscv/atomic: Fix sign extension for RV64I") Signed-off-by: Andreas Schwab Reviewed-by: Alexandre Ghiti Reviewed-by: Andrew Jones Tested-by: Xi Ruoyao Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/mvmed0k4prh.fsf@suse.de Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cmpxchg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h index 4cadc56220fea..427c41dde6431 100644 --- a/arch/riscv/include/asm/cmpxchg.h +++ b/arch/riscv/include/asm/cmpxchg.h @@ -231,7 +231,7 @@ __arch_cmpxchg(".w", ".w" sc_sfx, ".w" cas_sfx, \ sc_prepend, sc_append, \ cas_prepend, cas_append, \ - __ret, __ptr, (long), __old, __new); \ + __ret, __ptr, (long)(int)(long), __old, __new); \ break; \ case 8: \ __arch_cmpxchg(".d", ".d" sc_sfx, ".d" cas_sfx, \ -- GitLab From 599c44cd21f4967774e0acf58f734009be4aea9a Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Mon, 3 Feb 2025 11:06:00 +0100 Subject: [PATCH 634/989] riscv/futex: sign extend compare value in atomic cmpxchg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sure the compare value in the lr/sc loop is sign extended to match what lr.w does. Fortunately, due to the compiler keeping the register contents sign extended anyway the lack of the explicit extension didn't result in wrong code so far, but this cannot be relied upon. Fixes: b90edb33010b ("RISC-V: Add futex support.") Signed-off-by: Andreas Schwab Reviewed-by: Alexandre Ghiti Reviewed-by: Björn Töpel Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/mvmfrkv2vhz.fsf@suse.de Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/futex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/futex.h b/arch/riscv/include/asm/futex.h index 72be100afa236..90c86b115e008 100644 --- a/arch/riscv/include/asm/futex.h +++ b/arch/riscv/include/asm/futex.h @@ -93,7 +93,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %[r]) \ _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %[r]) \ : [r] "+r" (ret), [v] "=&r" (val), [u] "+m" (*uaddr), [t] "=&r" (tmp) - : [ov] "Jr" (oldval), [nv] "Jr" (newval) + : [ov] "Jr" ((long)(int)oldval), [nv] "Jr" (newval) : "memory"); __disable_user_access(); -- GitLab From 713e788c0e07e185fd44dd581f74855ef149722f Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Tue, 14 Jan 2025 17:07:21 +0000 Subject: [PATCH 635/989] rseq/selftests: Fix riscv rseq_offset_deref_addv inline asm When working on OpenRISC support for restartable sequences I noticed and fixed these two issues with the riscv support bits. 1 The 'inc' argument to RSEQ_ASM_OP_R_DEREF_ADDV was being implicitly passed to the macro. Fix this by adding 'inc' to the list of macro arguments. 2 The inline asm input constraints for 'inc' and 'off' use "er", The riscv gcc port does not have an "e" constraint, this looks to be copied from the x86 port. Fix this by just using an "r" constraint. I have compile tested this only for riscv. However, the same fixes I use in the OpenRISC rseq selftests and everything passes with no issues. 
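For readers unfamiliar with gcc constraint strings, a minimal riscv inline-asm sketch (purely illustrative, unrelated to the rseq macros themselves): register operands use the "r" constraint, and riscv has no x86-style "e" (sign-extended 32-bit immediate) constraint, which is why "er" is not valid there:

  static inline long example_add(long a, long b)
  {
  	long res;

  	/* Plain register constraints: inputs and output all live in registers. */
  	asm("add %0, %1, %2"
  	    : "=r" (res)
  	    : "r" (a), "r" (b));
  	return res;
  }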
Fixes: 171586a6ab66 ("selftests/rseq: riscv: Template memory ordering and percpu access mode") Signed-off-by: Stafford Horne Tested-by: Charlie Jenkins Reviewed-by: Charlie Jenkins Reviewed-by: Mathieu Desnoyers Acked-by: Shuah Khan Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250114170721.3613280-1-shorne@gmail.com Signed-off-by: Palmer Dabbelt --- tools/testing/selftests/rseq/rseq-riscv-bits.h | 6 +++--- tools/testing/selftests/rseq/rseq-riscv.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/rseq/rseq-riscv-bits.h b/tools/testing/selftests/rseq/rseq-riscv-bits.h index de31a0143139b..f02f411d550d1 100644 --- a/tools/testing/selftests/rseq/rseq-riscv-bits.h +++ b/tools/testing/selftests/rseq/rseq-riscv-bits.h @@ -243,7 +243,7 @@ int RSEQ_TEMPLATE_IDENTIFIER(rseq_offset_deref_addv)(intptr_t *ptr, off_t off, i #ifdef RSEQ_COMPARE_TWICE RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, "%l[error1]") #endif - RSEQ_ASM_OP_R_DEREF_ADDV(ptr, off, 3) + RSEQ_ASM_OP_R_DEREF_ADDV(ptr, off, inc, 3) RSEQ_INJECT_ASM(4) RSEQ_ASM_DEFINE_ABORT(4, abort) : /* gcc asm goto does not allow outputs */ @@ -251,8 +251,8 @@ int RSEQ_TEMPLATE_IDENTIFIER(rseq_offset_deref_addv)(intptr_t *ptr, off_t off, i [current_cpu_id] "m" (rseq_get_abi()->RSEQ_TEMPLATE_CPU_ID_FIELD), [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr), [ptr] "r" (ptr), - [off] "er" (off), - [inc] "er" (inc) + [off] "r" (off), + [inc] "r" (inc) RSEQ_INJECT_INPUT : "memory", RSEQ_ASM_TMP_REG_1 RSEQ_INJECT_CLOBBER diff --git a/tools/testing/selftests/rseq/rseq-riscv.h b/tools/testing/selftests/rseq/rseq-riscv.h index 37e598d0a365e..67d544aaa9a3b 100644 --- a/tools/testing/selftests/rseq/rseq-riscv.h +++ b/tools/testing/selftests/rseq/rseq-riscv.h @@ -158,7 +158,7 @@ do { \ "bnez " RSEQ_ASM_TMP_REG_1 ", 222b\n" \ "333:\n" -#define RSEQ_ASM_OP_R_DEREF_ADDV(ptr, off, post_commit_label) \ +#define RSEQ_ASM_OP_R_DEREF_ADDV(ptr, off, inc, post_commit_label) \ "mv " RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(ptr) "]\n" \ RSEQ_ASM_OP_R_ADD(off) \ REG_L RSEQ_ASM_TMP_REG_1 ", 0(" RSEQ_ASM_TMP_REG_1 ")\n" \ -- GitLab From aa49bc2ca8524186ceb0811c23a7f00c3dea6987 Mon Sep 17 00:00:00 2001 From: Yong-Xuan Wang Date: Fri, 20 Dec 2024 16:39:23 +0800 Subject: [PATCH 636/989] riscv: signal: fix signal frame size The signal context of certain RISC-V extensions will be appended after struct __riscv_extra_ext_header, which already includes an empty context header. Therefore, there is no need to preserve a separate hdr for the END of signal context. 
Fixes: 8ee0b41898fa ("riscv: signal: Add sigcontext save/restore for vector") Signed-off-by: Yong-Xuan Wang Reviewed-by: Zong Li Reviewed-by: Andy Chiu Reviewed-by: Alexandre Ghiti Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20241220083926.19453-2-yongxuan.wang@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/signal.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index 94e905eea1dee..08378fea3a111 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -215,12 +215,6 @@ static size_t get_rt_frame_size(bool cal_all) if (cal_all || riscv_v_vstate_query(task_pt_regs(current))) total_context_size += riscv_v_sc_size; } - /* - * Preserved a __riscv_ctx_hdr for END signal context header if an - * extension uses __riscv_extra_ext_header - */ - if (total_context_size) - total_context_size += sizeof(struct __riscv_ctx_hdr); frame_size += total_context_size; -- GitLab From 564fc8eb6f78e01292ff10801f318feae6153fdd Mon Sep 17 00:00:00 2001 From: Yong-Xuan Wang Date: Fri, 20 Dec 2024 16:39:24 +0800 Subject: [PATCH 637/989] riscv: signal: fix signal_minsigstksz The init_rt_signal_env() funciton is called before the alternative patch is applied, so using the alternative-related API to check the availability of an extension within this function doesn't have the intended effect. This patch reorders the init_rt_signal_env() and apply_boot_alternatives() to get the correct signal_minsigstksz. Fixes: e92f469b0771 ("riscv: signal: Report signal frame size to userspace via auxv") Signed-off-by: Yong-Xuan Wang Reviewed-by: Zong Li Reviewed-by: Andy Chiu Reviewed-by: Alexandre Ghiti Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20241220083926.19453-3-yongxuan.wang@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index f1793630fc518..4fe45daa6281e 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -322,8 +322,8 @@ void __init setup_arch(char **cmdline_p) riscv_init_cbo_blocksizes(); riscv_fill_hwcap(); - init_rt_signal_env(); apply_boot_alternatives(); + init_rt_signal_env(); if (IS_ENABLED(CONFIG_RISCV_ISA_ZICBOM) && riscv_isa_extension_available(NULL, ZICBOM)) -- GitLab From 245aece3750d3692ae7a44516c1096936bded7ab Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Thu, 12 Dec 2024 14:11:34 +0100 Subject: [PATCH 638/989] MAINTAINERS: Add myself as a riscv reviewer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The goal is for me to get a kernel.org account and then send pull requests in order to relieve some pressure from Palmer and make our workflow smoother. 
Signed-off-by: Alexandre Ghiti Enthusiastically-Supported-by: Björn Töpel Acked-by: Conor Dooley Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20241212131134.288819-1-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 896a307fa0654..ebc3e39dbcaab 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20280,6 +20280,7 @@ RISC-V ARCHITECTURE M: Paul Walmsley M: Palmer Dabbelt M: Albert Ou +R: Alexandre Ghiti L: linux-riscv@lists.infradead.org S: Supported Q: https://patchwork.kernel.org/project/linux-riscv/list/ -- GitLab From 054e61bb1de4fa02d148344152007facbcb28583 Mon Sep 17 00:00:00 2001 From: Jeroen de Borst Date: Thu, 13 Feb 2025 10:45:23 -0800 Subject: [PATCH 639/989] gve: Update MAINTAINERS Updating MAINTAINERS to include active contributors. Signed-off-by: Jeroen de Borst Link: https://patch.msgid.link/20250213184523.2002582-1-jeroendb@google.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 10893c91b1c10..988b0ff94fda9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9834,8 +9834,8 @@ F: drivers/input/touchscreen/goodix* GOOGLE ETHERNET DRIVERS M: Jeroen de Borst -M: Praveen Kaligineedi -R: Shailend Chand +M: Joshua Washington +M: Harshitha Ramamurthy L: netdev@vger.kernel.org S: Maintained F: Documentation/networking/device_drivers/ethernet/google/gve.rst -- GitLab From 0d1fac6d26aff5df21bb4ec980d9b7a11c410b96 Mon Sep 17 00:00:00 2001 From: Stephan Gerhold Date: Wed, 12 Feb 2025 12:15:35 +0100 Subject: [PATCH 640/989] net: wwan: mhi_wwan_mbim: Silence sequence number glitch errors When using the Qualcomm X55 modem on the ThinkPad X13s, the kernel log is constantly being filled with errors related to a "sequence number glitch", e.g.: [ 1903.284538] sequence number glitch prev=16 curr=0 [ 1913.812205] sequence number glitch prev=50 curr=0 [ 1923.698219] sequence number glitch prev=142 curr=0 [ 2029.248276] sequence number glitch prev=1555 curr=0 [ 2046.333059] sequence number glitch prev=70 curr=0 [ 2076.520067] sequence number glitch prev=272 curr=0 [ 2158.704202] sequence number glitch prev=2655 curr=0 [ 2218.530776] sequence number glitch prev=2349 curr=0 [ 2225.579092] sequence number glitch prev=6 curr=0 Internet connectivity is working fine, so this error seems harmless. It looks like the modem does not preserve the sequence number when entering a low power state; the number of errors depends on how actively the modem is being used. A similar issue has also been seen on USB-based MBIM modems [1]. However, in cdc_ncm.c the "sequence number glitch" message is a debug message instead of an error. Apply the same to the mhi_wwan_mbim.c driver to silence these errors when using the modem.
[1]: https://lists.freedesktop.org/archives/libmbim-devel/2016-November/000781.html Signed-off-by: Stephan Gerhold Reviewed-by: Loic Poulain Acked-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20250212-mhi-wwan-mbim-sequence-glitch-v1-1-503735977cbd@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/wwan/mhi_wwan_mbim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wwan/mhi_wwan_mbim.c b/drivers/net/wwan/mhi_wwan_mbim.c index d5a9360323d29..8755c5e6a65b3 100644 --- a/drivers/net/wwan/mhi_wwan_mbim.c +++ b/drivers/net/wwan/mhi_wwan_mbim.c @@ -220,7 +220,7 @@ static int mbim_rx_verify_nth16(struct mhi_mbim_context *mbim, struct sk_buff *s if (mbim->rx_seq + 1 != le16_to_cpu(nth16->wSequence) && (mbim->rx_seq || le16_to_cpu(nth16->wSequence)) && !(mbim->rx_seq == 0xffff && !le16_to_cpu(nth16->wSequence))) { - net_err_ratelimited("sequence number glitch prev=%d curr=%d\n", + net_dbg_ratelimited("sequence number glitch prev=%d curr=%d\n", mbim->rx_seq, le16_to_cpu(nth16->wSequence)); } mbim->rx_seq = le16_to_cpu(nth16->wSequence); -- GitLab From 13918315c5dc5a515926c8799042ea6885c2b734 Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Sat, 8 Feb 2025 13:42:13 -0700 Subject: [PATCH 641/989] io-wq: backoff when retrying worker creation When io_uring submission goes async for the first time on a given task, we'll try to create a worker thread to handle the submission. Creating this worker thread can fail due to various transient conditions, such as an outstanding signal in the forking thread, so we have retry logic with a limit of 3 retries. However, this retry logic appears to be too aggressive/fast - we've observed a thread blowing through the retry limit while having the same outstanding signal the whole time. Here's an excerpt of some tracing that demonstrates the issue: First, signal 26 is generated for the process. It ends up getting routed to thread 92942. 0) cbd-92284 /* signal_generate: sig=26 errno=0 code=-2 comm=psblkdASD pid=92934 grp=1 res=0 */ This causes create_io_thread in the signalled thread to fail with ERESTARTNOINTR, and thus a retry is queued. 13) task_th-92942 /* io_uring_queue_async_work: ring 000000007325c9ae, request 0000000080c96d8e, user_data 0x0, opcode URING_CMD, flags 0x8240001, normal queue, work 000000006e96dd3f */ 13) task_th-92942 io_wq_enqueue() { 13) task_th-92942 _raw_spin_lock(); 13) task_th-92942 io_wq_activate_free_worker(); 13) task_th-92942 _raw_spin_lock(); 13) task_th-92942 create_io_worker() { 13) task_th-92942 __kmalloc_cache_noprof(); 13) task_th-92942 __init_swait_queue_head(); 13) task_th-92942 kprobe_ftrace_handler() { 13) task_th-92942 get_kprobe(); 13) task_th-92942 aggr_pre_handler() { 13) task_th-92942 pre_handler_kretprobe(); 13) task_th-92942 /* create_enter: (create_io_thread+0x0/0x50) fn=0xffffffff8172c0e0 arg=0xffff888996bb69c0 node=-1 */ 13) task_th-92942 } /* aggr_pre_handler */ ... 13) task_th-92942 } /* copy_process */ 13) task_th-92942 } /* create_io_thread */ 13) task_th-92942 kretprobe_rethook_handler() { 13) task_th-92942 /* create_exit: (create_io_worker+0x8a/0x1a0 <- create_io_thread) arg1=0xfffffffffffffdff */ 13) task_th-92942 } /* kretprobe_rethook_handler */ 13) task_th-92942 queue_work_on() { ... 
The CPU is then handed to a kworker to process the queued retry: ------------------------------------------ 13) task_th-92942 => kworker-54154 ------------------------------------------ 13) kworker-54154 io_workqueue_create() { 13) kworker-54154 io_queue_worker_create() { 13) kworker-54154 task_work_add() { 13) kworker-54154 wake_up_state() { 13) kworker-54154 try_to_wake_up() { 13) kworker-54154 _raw_spin_lock_irqsave(); 13) kworker-54154 _raw_spin_unlock_irqrestore(); 13) kworker-54154 } /* try_to_wake_up */ 13) kworker-54154 } /* wake_up_state */ 13) kworker-54154 kick_process(); 13) kworker-54154 } /* task_work_add */ 13) kworker-54154 } /* io_queue_worker_create */ 13) kworker-54154 } /* io_workqueue_create */ And then we immediately switch back to the original task to try creating a worker again. This fails, because the original task still hasn't handled its signal. ----------------------------------------- 13) kworker-54154 => task_th-92942 ------------------------------------------ 13) task_th-92942 create_worker_cont() { 13) task_th-92942 kprobe_ftrace_handler() { 13) task_th-92942 get_kprobe(); 13) task_th-92942 aggr_pre_handler() { 13) task_th-92942 pre_handler_kretprobe(); 13) task_th-92942 /* create_enter: (create_io_thread+0x0/0x50) fn=0xffffffff8172c0e0 arg=0xffff888996bb69c0 node=-1 */ 13) task_th-92942 } /* aggr_pre_handler */ 13) task_th-92942 } /* kprobe_ftrace_handler */ 13) task_th-92942 create_io_thread() { 13) task_th-92942 copy_process() { 13) task_th-92942 task_active_pid_ns(); 13) task_th-92942 _raw_spin_lock_irq(); 13) task_th-92942 recalc_sigpending(); 13) task_th-92942 _raw_spin_lock_irq(); 13) task_th-92942 } /* copy_process */ 13) task_th-92942 } /* create_io_thread */ 13) task_th-92942 kretprobe_rethook_handler() { 13) task_th-92942 /* create_exit: (create_worker_cont+0x35/0x1b0 <- create_io_thread) arg1=0xfffffffffffffdff */ 13) task_th-92942 } /* kretprobe_rethook_handler */ 13) task_th-92942 io_worker_release(); 13) task_th-92942 queue_work_on() { 13) task_th-92942 clear_pending_if_disabled(); 13) task_th-92942 __queue_work() { 13) task_th-92942 } /* __queue_work */ 13) task_th-92942 } /* queue_work_on */ 13) task_th-92942 } /* create_worker_cont */ The pattern repeats another couple times until we blow through the retry counter, at which point we give up. All outstanding work is canceled, and the io_uring command which triggered all this is failed with ECANCELED: 13) task_th-92942 io_acct_cancel_pending_work() { ... 13) task_th-92942 /* io_uring_complete: ring 000000007325c9ae, req 0000000080c96d8e, user_data 0x0, result -125, cflags 0x0 extra1 0 extra2 0 */ Finally, the task gets around to processing its outstanding signal 26, but it's too late. 13) task_th-92942 /* signal_deliver: sig=26 errno=0 code=-2 sa_handler=59566a0 sa_flags=14000000 */ Try to address this issue by adding a small scaling delay when retrying worker creation. This should give the forking thread time to handle its signal in the above case. This isn't a particularly satisfying solution, as sufficiently paradoxical scheduling would still have us hitting the same issue, and I'm open to suggestions for something better. But this is likely to prevent this (already rare) issue from hitting in practice. 
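As a generic illustration of the retry pattern used here (the names below are hypothetical, this is not the io-wq code itself), a delayed work item can reschedule itself with a small scaling backoff:

  #include <linux/jiffies.h>
  #include <linux/workqueue.h>

  struct retry_ctx {
  	struct delayed_work dwork;	/* set up with INIT_DELAYED_WORK(&ctx->dwork, example_retry_fn) */
  	unsigned int attempts;
  };

  static bool example_try_operation(struct retry_ctx *ctx)
  {
  	return false;	/* hypothetical: stands in for the real attempt */
  }

  static void example_retry_fn(struct work_struct *work)
  {
  	struct retry_ctx *ctx = container_of(work, struct retry_ctx, dwork.work);

  	if (example_try_operation(ctx))
  		return;

  	/* Give the blocking condition a little more time on each attempt. */
  	if (++ctx->attempts < 3)
  		schedule_delayed_work(&ctx->dwork,
  				      msecs_to_jiffies(ctx->attempts * 5));
  }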
Signed-off-by: Uday Shankar Link: https://lore.kernel.org/r/20250208-wq_retry-v2-1-4f6f5041d303@purestorage.com Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 5d0928f37471e..91019b4d03088 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -64,7 +64,7 @@ struct io_worker { union { struct rcu_head rcu; - struct work_struct work; + struct delayed_work work; }; }; @@ -770,6 +770,18 @@ static inline bool io_should_retry_thread(struct io_worker *worker, long err) } } +static void queue_create_worker_retry(struct io_worker *worker) +{ + /* + * We only bother retrying because there's a chance that the + * failure to create a worker is due to some temporary condition + * in the forking task (e.g. outstanding signal); give the task + * some time to clear that condition. + */ + schedule_delayed_work(&worker->work, + msecs_to_jiffies(worker->init_retries * 5)); +} + static void create_worker_cont(struct callback_head *cb) { struct io_worker *worker; @@ -809,12 +821,13 @@ static void create_worker_cont(struct callback_head *cb) /* re-create attempts grab a new worker ref, drop the existing one */ io_worker_release(worker); - schedule_work(&worker->work); + queue_create_worker_retry(worker); } static void io_workqueue_create(struct work_struct *work) { - struct io_worker *worker = container_of(work, struct io_worker, work); + struct io_worker *worker = container_of(work, struct io_worker, + work.work); struct io_wq_acct *acct = io_wq_get_acct(worker); if (!io_queue_worker_create(worker, acct, create_worker_cont)) @@ -855,8 +868,8 @@ static bool create_io_worker(struct io_wq *wq, int index) kfree(worker); goto fail; } else { - INIT_WORK(&worker->work, io_workqueue_create); - schedule_work(&worker->work); + INIT_DELAYED_WORK(&worker->work, io_workqueue_create); + queue_create_worker_retry(worker); } return true; -- GitLab From 43c70b104093c324b1a000762ce943d16ce788f9 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 14 Feb 2025 12:36:36 -0700 Subject: [PATCH 642/989] block/merge: remove unnecessary min() with UINT_MAX In bvec_split_segs(), max_bytes is an unsigned, so it must be less than or equal to UINT_MAX. Remove the unnecessary min(). Prior to commit 67927d220150 ("block/merge: count bytes instead of sectors"), the min() was with UINT_MAX >> 9, so it did have an effect. Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250214193637.234702-1-csander@purestorage.com Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 15cd231d560cb..39b738c0e4c9a 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -270,7 +270,7 @@ static bool bvec_split_segs(const struct queue_limits *lim, const struct bio_vec *bv, unsigned *nsegs, unsigned *bytes, unsigned max_segs, unsigned max_bytes) { - unsigned max_len = min(max_bytes, UINT_MAX) - *bytes; + unsigned max_len = max_bytes - *bytes; unsigned len = min(bv->bv_len, max_len); unsigned total_len = 0; unsigned seg_size = 0; -- GitLab From 435b344a7042e91fb4719d589f18310e8919e39f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 10 Feb 2025 22:53:47 +0000 Subject: [PATCH 643/989] crypto: ccp: Add external API interface for PSP module initialization KVM is dependent on the PSP SEV driver and PSP SEV driver needs to be loaded before KVM module. 
In case of module loading any dependent modules are automatically loaded but in case of built-in modules there is no inherent mechanism available to specify dependencies between modules and ensure that any dependent modules are loaded implicitly. Add a new external API interface for PSP module initialization which allows PSP SEV driver to be loaded explicitly if KVM is built-in. Signed-off-by: Sean Christopherson Co-developed-by: Ashish Kalra Signed-off-by: Ashish Kalra Reviewed-by: Tom Lendacky Message-ID: <15279ca0cad56a07cf12834ec544310f85ff5edc.1739226950.git.ashish.kalra@amd.com> Signed-off-by: Paolo Bonzini --- drivers/crypto/ccp/sp-dev.c | 14 ++++++++++++++ include/linux/psp-sev.h | 9 +++++++++ 2 files changed, 23 insertions(+) diff --git a/drivers/crypto/ccp/sp-dev.c b/drivers/crypto/ccp/sp-dev.c index 7eb3e46682860..3467f6db4f505 100644 --- a/drivers/crypto/ccp/sp-dev.c +++ b/drivers/crypto/ccp/sp-dev.c @@ -19,6 +19,7 @@ #include #include +#include "sev-dev.h" #include "ccp-dev.h" #include "sp-dev.h" @@ -253,8 +254,12 @@ struct sp_device *sp_get_psp_master_device(void) static int __init sp_mod_init(void) { #ifdef CONFIG_X86 + static bool initialized; int ret; + if (initialized) + return 0; + ret = sp_pci_init(); if (ret) return ret; @@ -263,6 +268,8 @@ static int __init sp_mod_init(void) psp_pci_init(); #endif + initialized = true; + return 0; #endif @@ -279,6 +286,13 @@ static int __init sp_mod_init(void) return -ENODEV; } +#if IS_BUILTIN(CONFIG_KVM_AMD) && IS_ENABLED(CONFIG_KVM_AMD_SEV) +int __init sev_module_init(void) +{ + return sp_mod_init(); +} +#endif + static void __exit sp_mod_exit(void) { #ifdef CONFIG_X86 diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 903ddfea85850..f3cad182d4ef6 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -814,6 +814,15 @@ struct sev_data_snp_commit { #ifdef CONFIG_CRYPTO_DEV_SP_PSP +/** + * sev_module_init - perform PSP SEV module initialization + * + * Returns: + * 0 if the PSP module is successfully initialized + * negative value if the PSP module initialization fails + */ +int sev_module_init(void); + /** * sev_platform_init - perform SEV INIT command * -- GitLab From 44e70718df4fc2fadf1665eb9374df71aeda1f03 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 10 Feb 2025 22:54:02 +0000 Subject: [PATCH 644/989] KVM: SVM: Ensure PSP module is initialized if KVM module is built-in The kernel's initcall infrastructure lacks the ability to express dependencies between initcalls, whereas the modules infrastructure automatically handles dependencies via symbol loading. Ensure the PSP SEV driver is initialized before proceeding in sev_hardware_setup() if KVM is built-in as the dependency isn't handled by the initcall infrastructure. Signed-off-by: Sean Christopherson Reviewed-by: Tom Lendacky Signed-off-by: Ashish Kalra Message-ID: Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index a2a794c320503..0dbb25442ec14 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2972,6 +2972,16 @@ void __init sev_hardware_setup(void) WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID))) goto out; + /* + * The kernel's initcall infrastructure lacks the ability to express + * dependencies between initcalls, whereas the modules infrastructure + * automatically handles dependencies via symbol loading. 
Ensure the + * PSP SEV driver is initialized before proceeding if KVM is built-in, + * as the dependency isn't handled by the initcall infrastructure. + */ + if (IS_BUILTIN(CONFIG_KVM_AMD) && sev_module_init()) + goto out; + /* Retrieve SEV CPUID information */ cpuid(0x8000001f, &eax, &ebx, &ecx, &edx); -- GitLab From 409f45387c937145adeeeebc6d6032c2ec232b35 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Mon, 10 Feb 2025 22:54:18 +0000 Subject: [PATCH 645/989] x86/sev: Fix broken SNP support with KVM module built-in Fix issues with enabling SNP host support and effectively SNP support which is broken with respect to the KVM module being built-in. SNP host support is enabled in snp_rmptable_init() which is invoked as device_initcall(). SNP check on IOMMU is done during IOMMU PCI init (IOMMU_PCI_INIT stage). And for that reason snp_rmptable_init() is currently invoked via device_initcall() and cannot be invoked via subsys_initcall() as core IOMMU subsystem gets initialized via subsys_initcall(). Now, if kvm_amd module is built-in, it gets initialized before SNP host support is enabled in snp_rmptable_init() : [ 10.131811] kvm_amd: TSC scaling supported [ 10.136384] kvm_amd: Nested Virtualization enabled [ 10.141734] kvm_amd: Nested Paging enabled [ 10.146304] kvm_amd: LBR virtualization supported [ 10.151557] kvm_amd: SEV enabled (ASIDs 100 - 509) [ 10.156905] kvm_amd: SEV-ES enabled (ASIDs 1 - 99) [ 10.162256] kvm_amd: SEV-SNP enabled (ASIDs 1 - 99) [ 10.171508] kvm_amd: Virtual VMLOAD VMSAVE supported [ 10.177052] kvm_amd: Virtual GIF supported ... ... [ 10.201648] kvm_amd: in svm_enable_virtualization_cpu And then svm_x86_ops->enable_virtualization_cpu() (svm_enable_virtualization_cpu) programs MSR_VM_HSAVE_PA as following: wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa); So VM_HSAVE_PA is non-zero before SNP support is enabled on all CPUs. snp_rmptable_init() gets invoked after svm_enable_virtualization_cpu() as following : ... [ 11.256138] kvm_amd: in svm_enable_virtualization_cpu ... [ 11.264918] SEV-SNP: in snp_rmptable_init This triggers a #GP exception in snp_rmptable_init() when snp_enable() is invoked to set SNP_EN in SYSCFG MSR: [ 11.294289] unchecked MSR access error: WRMSR to 0xc0010010 (tried to write 0x0000000003fc0000) at rIP: 0xffffffffaf5d5c28 (native_write_msr+0x8/0x30) ... [ 11.294404] Call Trace: [ 11.294482] [ 11.294513] ? show_stack_regs+0x26/0x30 [ 11.294522] ? ex_handler_msr+0x10f/0x180 [ 11.294529] ? search_extable+0x2b/0x40 [ 11.294538] ? fixup_exception+0x2dd/0x340 [ 11.294542] ? exc_general_protection+0x14f/0x440 [ 11.294550] ? asm_exc_general_protection+0x2b/0x30 [ 11.294557] ? __pfx_snp_enable+0x10/0x10 [ 11.294567] ? native_write_msr+0x8/0x30 [ 11.294570] ? __snp_enable+0x5d/0x70 [ 11.294575] snp_enable+0x19/0x20 [ 11.294578] __flush_smp_call_function_queue+0x9c/0x3a0 [ 11.294586] generic_smp_call_function_single_interrupt+0x17/0x20 [ 11.294589] __sysvec_call_function+0x20/0x90 [ 11.294596] sysvec_call_function+0x80/0xb0 [ 11.294601] [ 11.294603] [ 11.294605] asm_sysvec_call_function+0x1f/0x30 ... [ 11.294631] arch_cpu_idle+0xd/0x20 [ 11.294633] default_idle_call+0x34/0xd0 [ 11.294636] do_idle+0x1f1/0x230 [ 11.294643] ? 
complete+0x71/0x80 [ 11.294649] cpu_startup_entry+0x30/0x40 [ 11.294652] start_secondary+0x12d/0x160 [ 11.294655] common_startup_64+0x13e/0x141 [ 11.294662] This #GP exception is getting triggered due to the following errata for AMD family 19h Models 10h-1Fh Processors: Processor may generate spurious #GP(0) Exception on WRMSR instruction: Description: The Processor will generate a spurious #GP(0) Exception on a WRMSR instruction if the following conditions are all met: - the target of the WRMSR is a SYSCFG register. - the write changes the value of SYSCFG.SNPEn from 0 to 1. - One of the threads that share the physical core has a non-zero value in the VM_HSAVE_PA MSR. The document being referred to above: https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/revision-guides/57095-PUB_1_01.pdf To summarize, with kvm_amd module being built-in, KVM/SVM initialization happens before host SNP is enabled and this SVM initialization sets VM_HSAVE_PA to non-zero, which then triggers a #GP when SYSCFG.SNPEn is being set and this will subsequently cause SNP_INIT(_EX) to fail with INVALID_CONFIG error as SYSCFG[SnpEn] is not set on all CPUs. Essentially SNP host enabling code should be invoked before KVM initialization, which is currently not the case when KVM is built-in. Add fix to call snp_rmptable_init() early from iommu_snp_enable() directly and not invoked via device_initcall() which enables SNP host support before KVM initialization with kvm_amd module built-in. Add additional handling for `iommu=off` or `amd_iommu=off` options. Note that IOMMUs need to be enabled for SNP initialization, therefore, if host SNP support is enabled but late IOMMU initialization fails then that will cause PSP driver's SNP_INIT to fail as IOMMU SNP sanity checks in SNP firmware will fail with invalid configuration error as below: [ 9.723114] ccp 0000:23:00.1: sev enabled [ 9.727602] ccp 0000:23:00.1: psp enabled [ 9.732527] ccp 0000:a2:00.1: enabling device (0000 -> 0002) [ 9.739098] ccp 0000:a2:00.1: no command queues available [ 9.745167] ccp 0000:a2:00.1: psp enabled [ 9.805337] ccp 0000:23:00.1: SEV-SNP: failed to INIT rc -5, error 0x3 [ 9.866426] ccp 0000:23:00.1: SEV API:1.53 build:5 Fixes: c3b86e61b756 ("x86/cpufeatures: Enable/unmask SEV-SNP CPU feature") Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Co-developed-by: Vasant Hegde Signed-off-by: Vasant Hegde Cc: Signed-off-by: Ashish Kalra Acked-by: Joerg Roedel Message-ID: <138b520fb83964782303b43ade4369cd181fdd9c.1739226950.git.ashish.kalra@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/sev.h | 2 ++ arch/x86/virt/svm/sev.c | 23 +++++++---------------- drivers/iommu/amd/init.c | 34 ++++++++++++++++++++++++++++++---- 3 files changed, 39 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 5d9685f92e5c3..1581246491b54 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -531,6 +531,7 @@ static inline void __init snp_secure_tsc_init(void) { } #ifdef CONFIG_KVM_AMD_SEV bool snp_probe_rmptable_info(void); +int snp_rmptable_init(void); int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level); void snp_dump_hva_rmpentry(unsigned long address); int psmash(u64 pfn); @@ -541,6 +542,7 @@ void kdump_sev_callback(void); void snp_fixup_e820_tables(void); #else static inline bool snp_probe_rmptable_info(void) { return false; } +static inline int snp_rmptable_init(void) { return -ENOSYS; } static inline int snp_lookup_rmpentry(u64 pfn, 
bool *assigned, int *level) { return -ENODEV; } static inline void snp_dump_hva_rmpentry(unsigned long address) {} static inline int psmash(u64 pfn) { return -ENODEV; } diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c index 1dcc027ec77e7..42e74a5a7d786 100644 --- a/arch/x86/virt/svm/sev.c +++ b/arch/x86/virt/svm/sev.c @@ -505,19 +505,19 @@ static bool __init setup_rmptable(void) * described in the SNP_INIT_EX firmware command description in the SNP * firmware ABI spec. */ -static int __init snp_rmptable_init(void) +int __init snp_rmptable_init(void) { unsigned int i; u64 val; - if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) - return 0; + if (WARN_ON_ONCE(!cc_platform_has(CC_ATTR_HOST_SEV_SNP))) + return -ENOSYS; - if (!amd_iommu_snp_en) - goto nosnp; + if (WARN_ON_ONCE(!amd_iommu_snp_en)) + return -ENOSYS; if (!setup_rmptable()) - goto nosnp; + return -ENOSYS; /* * Check if SEV-SNP is already enabled, this can happen in case of @@ -530,7 +530,7 @@ static int __init snp_rmptable_init(void) /* Zero out the RMP bookkeeping area */ if (!clear_rmptable_bookkeeping()) { free_rmp_segment_table(); - goto nosnp; + return -ENOSYS; } /* Zero out the RMP entries */ @@ -562,17 +562,8 @@ static int __init snp_rmptable_init(void) crash_kexec_post_notifiers = true; return 0; - -nosnp: - cc_platform_clear(CC_ATTR_HOST_SEV_SNP); - return -ENOSYS; } -/* - * This must be called after the IOMMU has been initialized. - */ -device_initcall(snp_rmptable_init); - static void set_rmp_segment_info(unsigned int segment_shift) { rmp_segment_shift = segment_shift; diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index c5cd92edada06..2fecfed75e543 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -3194,7 +3194,7 @@ static bool __init detect_ivrs(void) return true; } -static void iommu_snp_enable(void) +static __init void iommu_snp_enable(void) { #ifdef CONFIG_KVM_AMD_SEV if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) @@ -3219,6 +3219,14 @@ static void iommu_snp_enable(void) goto disable_snp; } + /* + * Enable host SNP support once SNP support is checked on IOMMU. + */ + if (snp_rmptable_init()) { + pr_warn("SNP: RMP initialization failed, SNP cannot be supported.\n"); + goto disable_snp; + } + pr_info("IOMMU SNP support enabled.\n"); return; @@ -3318,6 +3326,19 @@ static int __init iommu_go_to_state(enum iommu_init_state state) ret = state_next(); } + /* + * SNP platform initilazation requires IOMMUs to be fully configured. + * If the SNP support on IOMMUs has NOT been checked, simply mark SNP + * as unsupported. If the SNP support on IOMMUs has been checked and + * host SNP support enabled but RMP enforcement has not been enabled + * in IOMMUs, then the system is in a half-baked state, but can limp + * along as all memory should be Hypervisor-Owned in the RMP. WARN, + * but leave SNP as "supported" to avoid confusing the kernel. 
+ */ + if (ret && cc_platform_has(CC_ATTR_HOST_SEV_SNP) && + !WARN_ON_ONCE(amd_iommu_snp_en)) + cc_platform_clear(CC_ATTR_HOST_SEV_SNP); + return ret; } @@ -3426,18 +3447,23 @@ void __init amd_iommu_detect(void) int ret; if (no_iommu || (iommu_detected && !gart_iommu_aperture)) - return; + goto disable_snp; if (!amd_iommu_sme_check()) - return; + goto disable_snp; ret = iommu_go_to_state(IOMMU_IVRS_DETECTED); if (ret) - return; + goto disable_snp; amd_iommu_detected = true; iommu_detected = 1; x86_init.iommu.iommu_init = amd_iommu_init; + return; + +disable_snp: + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + cc_platform_clear(CC_ATTR_HOST_SEV_SNP); } /**************************************************************************** -- GitLab From 55eff109e76a14e5ed10c8c3c3978d20a35e2a4d Mon Sep 17 00:00:00 2001 From: Junnan Wu Date: Fri, 14 Feb 2025 09:22:00 +0800 Subject: [PATCH 646/989] vsock/virtio: fix variables initialization during resuming When executing suspend to ram twice in a row, the `rx_buf_nr` and `rx_buf_max_nr` increase to three times vq->num_free. Then after virtqueue_get_buf and `rx_buf_nr` decreased in function virtio_transport_rx_work, the condition to fill rx buffer (rx_buf_nr < rx_buf_max_nr / 2) will never be met. It is because that `rx_buf_nr` and `rx_buf_max_nr` are initialized only in virtio_vsock_probe(), but they should be reset whenever virtqueues are recreated, like after a suspend/resume. Move the `rx_buf_nr` and `rx_buf_max_nr` initialization in virtio_vsock_vqs_init(), so we are sure that they are properly initialized, every time we initialize the virtqueues, either when we load the driver or after a suspend/resume. To prevent erroneous atomic load operations on the `queued_replies` in the virtio_transport_send_pkt_work() function which may disrupt the scheduling of vsock->rx_work when transmitting reply-required socket packets, this atomic variable must undergo synchronized initialization alongside the preceding two variables after a suspend/resume. Fixes: bd50c5dc182b ("vsock/virtio: add support for device suspend/resume") Link: https://lore.kernel.org/virtualization/20250207052033.2222629-1-junnan01.wu@samsung.com/ Co-developed-by: Ying Gao Signed-off-by: Ying Gao Signed-off-by: Junnan Wu Reviewed-by: Luigi Leonardi Acked-by: Michael S. 
Tsirkin Reviewed-by: Stefano Garzarella Link: https://patch.msgid.link/20250214012200.1883896-1-junnan01.wu@samsung.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/virtio_transport.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index b58c3818f284f..f0e48e6911fc4 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -670,6 +670,13 @@ static int virtio_vsock_vqs_init(struct virtio_vsock *vsock) }; int ret; + mutex_lock(&vsock->rx_lock); + vsock->rx_buf_nr = 0; + vsock->rx_buf_max_nr = 0; + mutex_unlock(&vsock->rx_lock); + + atomic_set(&vsock->queued_replies, 0); + ret = virtio_find_vqs(vdev, VSOCK_VQ_MAX, vsock->vqs, vqs_info, NULL); if (ret < 0) return ret; @@ -779,9 +786,6 @@ static int virtio_vsock_probe(struct virtio_device *vdev) vsock->vdev = vdev; - vsock->rx_buf_nr = 0; - vsock->rx_buf_max_nr = 0; - atomic_set(&vsock->queued_replies, 0); mutex_init(&vsock->tx_lock); mutex_init(&vsock->rx_lock); -- GitLab From 9593172d93b9f91c362baec4643003dc29802929 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 13 Feb 2025 13:33:54 +0900 Subject: [PATCH 647/989] geneve: Fix use-after-free in geneve_find_dev(). syzkaller reported a use-after-free in geneve_find_dev() [0] without repro. geneve_configure() links struct geneve_dev.next to net_generic(net, geneve_net_id)->geneve_list. The net here could differ from dev_net(dev) if IFLA_NET_NS_PID, IFLA_NET_NS_FD, or IFLA_TARGET_NETNSID is set. When dev_net(dev) is dismantled, geneve_exit_batch_rtnl() finally calls unregister_netdevice_queue() for each dev in the netns, and later the dev is freed. However, its geneve_dev.next is still linked to the backend UDP socket netns. Then, use-after-free will occur when another geneve dev is created in the netns. Let's call geneve_dellink() instead in geneve_destroy_tunnels(). 
[0]: BUG: KASAN: slab-use-after-free in geneve_find_dev drivers/net/geneve.c:1295 [inline] BUG: KASAN: slab-use-after-free in geneve_configure+0x234/0x858 drivers/net/geneve.c:1343 Read of size 2 at addr ffff000054d6ee24 by task syz.1.4029/13441 CPU: 1 UID: 0 PID: 13441 Comm: syz.1.4029 Not tainted 6.13.0-g0ad9617c78ac #24 dc35ca22c79fb82e8e7bc5c9c9adafea898b1e3d Hardware name: linux,dummy-virt (DT) Call trace: show_stack+0x38/0x50 arch/arm64/kernel/stacktrace.c:466 (C) __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0xbc/0x108 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0x16c/0x6f0 mm/kasan/report.c:489 kasan_report+0xc0/0x120 mm/kasan/report.c:602 __asan_report_load2_noabort+0x20/0x30 mm/kasan/report_generic.c:379 geneve_find_dev drivers/net/geneve.c:1295 [inline] geneve_configure+0x234/0x858 drivers/net/geneve.c:1343 geneve_newlink+0xb8/0x128 drivers/net/geneve.c:1634 rtnl_newlink_create+0x23c/0x868 net/core/rtnetlink.c:3795 __rtnl_newlink net/core/rtnetlink.c:3906 [inline] rtnl_newlink+0x1054/0x1630 net/core/rtnetlink.c:4021 rtnetlink_rcv_msg+0x61c/0x918 net/core/rtnetlink.c:6911 netlink_rcv_skb+0x1dc/0x398 net/netlink/af_netlink.c:2543 rtnetlink_rcv+0x34/0x50 net/core/rtnetlink.c:6938 netlink_unicast_kernel net/netlink/af_netlink.c:1322 [inline] netlink_unicast+0x618/0x838 net/netlink/af_netlink.c:1348 netlink_sendmsg+0x5fc/0x8b0 net/netlink/af_netlink.c:1892 sock_sendmsg_nosec net/socket.c:713 [inline] __sock_sendmsg net/socket.c:728 [inline] ____sys_sendmsg+0x410/0x6f8 net/socket.c:2568 ___sys_sendmsg+0x178/0x1d8 net/socket.c:2622 __sys_sendmsg net/socket.c:2654 [inline] __do_sys_sendmsg net/socket.c:2659 [inline] __se_sys_sendmsg net/socket.c:2657 [inline] __arm64_sys_sendmsg+0x12c/0x1c8 net/socket.c:2657 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall+0x90/0x278 arch/arm64/kernel/syscall.c:49 el0_svc_common+0x13c/0x250 arch/arm64/kernel/syscall.c:132 do_el0_svc+0x54/0x70 arch/arm64/kernel/syscall.c:151 el0_svc+0x4c/0xa8 arch/arm64/kernel/entry-common.c:744 el0t_64_sync_handler+0x78/0x108 arch/arm64/kernel/entry-common.c:762 el0t_64_sync+0x198/0x1a0 arch/arm64/kernel/entry.S:600 Allocated by task 13247: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x30/0x68 mm/kasan/common.c:68 kasan_save_alloc_info+0x44/0x58 mm/kasan/generic.c:568 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x84/0xa0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __do_kmalloc_node mm/slub.c:4298 [inline] __kmalloc_node_noprof+0x2a0/0x560 mm/slub.c:4304 __kvmalloc_node_noprof+0x9c/0x230 mm/util.c:645 alloc_netdev_mqs+0xb8/0x11a0 net/core/dev.c:11470 rtnl_create_link+0x2b8/0xb50 net/core/rtnetlink.c:3604 rtnl_newlink_create+0x19c/0x868 net/core/rtnetlink.c:3780 __rtnl_newlink net/core/rtnetlink.c:3906 [inline] rtnl_newlink+0x1054/0x1630 net/core/rtnetlink.c:4021 rtnetlink_rcv_msg+0x61c/0x918 net/core/rtnetlink.c:6911 netlink_rcv_skb+0x1dc/0x398 net/netlink/af_netlink.c:2543 rtnetlink_rcv+0x34/0x50 net/core/rtnetlink.c:6938 netlink_unicast_kernel net/netlink/af_netlink.c:1322 [inline] netlink_unicast+0x618/0x838 net/netlink/af_netlink.c:1348 netlink_sendmsg+0x5fc/0x8b0 net/netlink/af_netlink.c:1892 sock_sendmsg_nosec net/socket.c:713 [inline] __sock_sendmsg net/socket.c:728 [inline] ____sys_sendmsg+0x410/0x6f8 net/socket.c:2568 ___sys_sendmsg+0x178/0x1d8 net/socket.c:2622 __sys_sendmsg net/socket.c:2654 [inline] __do_sys_sendmsg net/socket.c:2659 [inline] 
__se_sys_sendmsg net/socket.c:2657 [inline] __arm64_sys_sendmsg+0x12c/0x1c8 net/socket.c:2657 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall+0x90/0x278 arch/arm64/kernel/syscall.c:49 el0_svc_common+0x13c/0x250 arch/arm64/kernel/syscall.c:132 do_el0_svc+0x54/0x70 arch/arm64/kernel/syscall.c:151 el0_svc+0x4c/0xa8 arch/arm64/kernel/entry-common.c:744 el0t_64_sync_handler+0x78/0x108 arch/arm64/kernel/entry-common.c:762 el0t_64_sync+0x198/0x1a0 arch/arm64/kernel/entry.S:600 Freed by task 45: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x30/0x68 mm/kasan/common.c:68 kasan_save_free_info+0x58/0x70 mm/kasan/generic.c:582 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x48/0x68 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2353 [inline] slab_free mm/slub.c:4613 [inline] kfree+0x140/0x420 mm/slub.c:4761 kvfree+0x4c/0x68 mm/util.c:688 netdev_release+0x94/0xc8 net/core/net-sysfs.c:2065 device_release+0x98/0x1c0 kobject_cleanup lib/kobject.c:689 [inline] kobject_release lib/kobject.c:720 [inline] kref_put include/linux/kref.h:65 [inline] kobject_put+0x2b0/0x438 lib/kobject.c:737 netdev_run_todo+0xe5c/0xfc8 net/core/dev.c:11185 rtnl_unlock+0x20/0x38 net/core/rtnetlink.c:151 cleanup_net+0x4fc/0x8c0 net/core/net_namespace.c:648 process_one_work+0x700/0x1398 kernel/workqueue.c:3236 process_scheduled_works kernel/workqueue.c:3317 [inline] worker_thread+0x8c4/0xe10 kernel/workqueue.c:3398 kthread+0x4bc/0x608 kernel/kthread.c:464 ret_from_fork+0x10/0x20 arch/arm64/kernel/entry.S:862 The buggy address belongs to the object at ffff000054d6e000 which belongs to the cache kmalloc-cg-4k of size 4096 The buggy address is located 3620 bytes inside of freed 4096-byte region [ffff000054d6e000, ffff000054d6f000) The buggy address belongs to the physical page: page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x94d68 head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 memcg:ffff000016276181 flags: 0x3fffe0000000040(head|node=0|zone=0|lastcpupid=0x1ffff) page_type: f5(slab) raw: 03fffe0000000040 ffff0000c000f500 dead000000000122 0000000000000000 raw: 0000000000000000 0000000000040004 00000001f5000000 ffff000016276181 head: 03fffe0000000040 ffff0000c000f500 dead000000000122 0000000000000000 head: 0000000000000000 0000000000040004 00000001f5000000 ffff000016276181 head: 03fffe0000000003 fffffdffc1535a01 ffffffffffffffff 0000000000000000 head: 0000000000000008 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff000054d6ed00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff000054d6ed80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff000054d6ee00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff000054d6ee80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff000054d6ef00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: 2d07dc79fe04 ("geneve: add initial netdev driver for GENEVE tunnels") Reported-by: syzkaller Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250213043354.91368-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/geneve.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 642155cb83157..a1f674539965d 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1907,16 +1907,11 @@ static void geneve_destroy_tunnels(struct net *net, struct 
list_head *head) /* gather any geneve devices that were moved into this ns */ for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &geneve_link_ops) - unregister_netdevice_queue(dev, head); + geneve_dellink(dev, head); /* now gather any other geneve devices that were created in this ns */ - list_for_each_entry_safe(geneve, next, &gn->geneve_list, next) { - /* If geneve->dev is in the same netns, it was already added - * to the list by the previous loop. - */ - if (!net_eq(dev_net(geneve->dev), net)) - unregister_netdevice_queue(geneve->dev, head); - } + list_for_each_entry_safe(geneve, next, &gn->geneve_list, next) + geneve_dellink(geneve->dev, head); } static void __net_exit geneve_exit_batch_rtnl(struct list_head *net_list, -- GitLab From d1d0963121769d8d16150b913fe886e48efefa51 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 11 Feb 2025 09:29:06 +0900 Subject: [PATCH 648/989] tools: fix annoying "mkdir -p ..." logs when building tools in parallel When CONFIG_OBJTOOL=y or CONFIG_DEBUG_INFO_BTF=y, parallel builds show awkward "mkdir -p ..." logs. $ make -j16 [ snip ] mkdir -p /home/masahiro/ref/linux/tools/objtool && make O=/home/masahiro/ref/linux subdir=tools/objtool --no-print-directory -C objtool mkdir -p /home/masahiro/ref/linux/tools/bpf/resolve_btfids && make O=/home/masahiro/ref/linux subdir=tools/bpf/resolve_btfids --no-print-directory -C bpf/resolve_btfids Defining MAKEFLAGS= on the command line wipes out command line switches from the resultant MAKEFLAGS definition, even though the command line switches are active. [1] MAKEFLAGS puts all single-letter options into the first word, and that word will be empty if no single-letter options were given. [2] However, this breaks if MAKEFLAGS= is given on the command line. The tools/ and tools/% targets set MAKEFLAGS= on the command line, which breaks the following code in tools/scripts/Makefile.include: short-opts := $(firstword -$(MAKEFLAGS)) If MAKEFLAGS really needs modification, it should be done through the environment variable, as follows: MAKEFLAGS= $(MAKE) ... That said, I question whether modifying MAKEFLAGS is necessary here. The only flag we might want to exclude is --no-print-directory, as the tools build system changes the working directory. However, people might find the "Entering/Leaving directory" logs annoying. I simply removed the offending MAKEFLAGS=. 
[1]: https://savannah.gnu.org/bugs/?62469 [2]: https://www.gnu.org/software/make/manual/make.html#Testing-Flags Fixes: ea01fa9f63ae ("tools: Connect to the kernel build system") Fixes: a50e43332756 ("perf tools: Honor parallel jobs") Signed-off-by: Masahiro Yamada Tested-by: Daniel Xu --- Makefile | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 89628e354ca7b..52207bcb1a9df 100644 --- a/Makefile +++ b/Makefile @@ -1421,18 +1421,13 @@ ifneq ($(wildcard $(resolve_btfids_O)),) $(Q)$(MAKE) -sC $(srctree)/tools/bpf/resolve_btfids O=$(resolve_btfids_O) clean endif -# Clear a bunch of variables before executing the submake -ifeq ($(quiet),silent_) -tools_silent=s -endif - tools/: FORCE $(Q)mkdir -p $(objtree)/tools - $(Q)$(MAKE) LDFLAGS= MAKEFLAGS="$(tools_silent) $(filter --j% -j,$(MAKEFLAGS))" O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/ + $(Q)$(MAKE) LDFLAGS= O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/ tools/%: FORCE $(Q)mkdir -p $(objtree)/tools - $(Q)$(MAKE) LDFLAGS= MAKEFLAGS="$(tools_silent) $(filter --j% -j,$(MAKEFLAGS))" O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/ $* + $(Q)$(MAKE) LDFLAGS= O=$(abspath $(objtree)) subdir=tools -C $(srctree)/tools/ $* # --------------------------------------------------------------------------- # Kernel selftest -- GitLab From 140332b6ed727a4ec2e5722a1ccda28b52d45771 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 13 Feb 2025 15:26:44 +0900 Subject: [PATCH 649/989] kbuild: fix linux-headers package build when $(CC) cannot link userspace Since commit 5f73e7d0386d ("kbuild: refactor cross-compiling linux-headers package"), the linux-headers Debian package fails to build when $(CC) cannot build userspace applications, for example, when using toolchains installed by the 0day bot. The host programs in the linux-headers package should be rebuilt using the distro's cross-compiler, ${DEB_HOST_GNU_TYPE}-gcc instead of $(CC). Hence, the variable 'CC' must be expanded in this shell script instead of in the top-level Makefile. Commit f354fc88a72a ("kbuild: install-extmod-build: add missing quotation marks for CC variable") was not a correct fix because CC="ccache gcc" should be unrelated when rebuilding userspace tools. Fixes: 5f73e7d0386d ("kbuild: refactor cross-compiling linux-headers package") Reported-by: Jeff Johnson Closes: https://lore.kernel.org/linux-kbuild/CAK7LNARb3xO3ptBWOMpwKcyf3=zkfhMey5H2KnB1dOmUwM79dA@mail.gmail.com/T/#t Signed-off-by: Masahiro Yamada Tested-by: Jeff Johnson --- scripts/package/install-extmod-build | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/package/install-extmod-build b/scripts/package/install-extmod-build index b724626ea0ca0..2966473b46609 100755 --- a/scripts/package/install-extmod-build +++ b/scripts/package/install-extmod-build @@ -62,8 +62,8 @@ if [ "${CC}" != "${HOSTCC}" ]; then # # Clear VPATH and srcroot because the source files reside in the output # directory. - # shellcheck disable=SC2016 # $(MAKE), $(CC), and $(build) will be expanded by Make - "${MAKE}" run-command KBUILD_RUN_COMMAND='+$(MAKE) HOSTCC="$(CC)" VPATH= srcroot=. $(build)='"${destdir}"/scripts + # shellcheck disable=SC2016 # $(MAKE) and $(build) will be expanded by Make + "${MAKE}" run-command KBUILD_RUN_COMMAND='+$(MAKE) HOSTCC='"${CC}"' VPATH= srcroot=.
$(build)='"${destdir}"/scripts rm -f "${destdir}/scripts/Kbuild" fi -- GitLab From 1e988c3fe1264708f4f92109203ac5b1d65de50b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 14 Feb 2025 22:48:15 +0000 Subject: [PATCH 650/989] io_uring: prevent opcode speculation sqe->opcode is used for different tables, make sure we santitise it against speculations. Cc: stable@vger.kernel.org Fixes: d3656344fea03 ("io_uring: add lookup table for various opcode needs") Signed-off-by: Pavel Begunkov Reviewed-by: Li Zetao Link: https://lore.kernel.org/r/7eddbf31c8ca0a3947f8ed98271acc2b4349c016.1739568408.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 263e504be4a8b..29a42365a4816 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2045,6 +2045,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->opcode = 0; return io_init_fail_req(req, -EINVAL); } + opcode = array_index_nospec(opcode, IORING_OP_LAST); + def = &io_issue_defs[opcode]; if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { /* enforce forwards compatibility on users */ -- GitLab From 071ed42cff4fcdd89025d966d48eabef59913bf2 Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Thu, 13 Feb 2025 23:36:10 +0100 Subject: [PATCH 651/989] net/sched: cls_api: fix error handling causing NULL dereference tcf_exts_miss_cookie_base_alloc() calls xa_alloc_cyclic() which can return 1 if the allocation succeeded after wrapping. This was treated as an error, with value 1 returned to caller tcf_exts_init_ex() which sets exts->actions to NULL and returns 1 to caller fl_change(). fl_change() treats err == 1 as success, calling tcf_exts_validate_ex() which calls tcf_action_init() with exts->actions as argument, where it is dereferenced. Example trace: BUG: kernel NULL pointer dereference, address: 0000000000000000 CPU: 114 PID: 16151 Comm: handler114 Kdump: loaded Not tainted 5.14.0-503.16.1.el9_5.x86_64 #1 RIP: 0010:tcf_action_init+0x1f8/0x2c0 Call Trace: tcf_action_init+0x1f8/0x2c0 tcf_exts_validate_ex+0x175/0x190 fl_change+0x537/0x1120 [cls_flower] Fixes: 80cd22c35c90 ("net/sched: cls_api: Support hardware miss to tc action") Signed-off-by: Pierre Riteau Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20250213223610.320278-1-pierre@stackhpc.com Signed-off-by: Jakub Kicinski --- net/sched/cls_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 8e47e5355be61..4f648af8cfaaf 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -97,7 +97,7 @@ tcf_exts_miss_cookie_base_alloc(struct tcf_exts *exts, struct tcf_proto *tp, err = xa_alloc_cyclic(&tcf_exts_miss_cookies_xa, &n->miss_cookie_base, n, xa_limit_32b, &next, GFP_KERNEL); - if (err) + if (err < 0) goto err_xa_alloc; exts->miss_cookie_node = n; -- GitLab From ef24989a62eefa6293a6c1c59dbb8b7e646e76dd Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Thu, 13 Feb 2025 17:44:43 +0100 Subject: [PATCH 652/989] drm/msm/a6xx: Only print the GMU firmware version once We only fetch it once from userland, so let's also only notify the user once and not on every runtime resume. As you can notice by the tags chain, more than one user found this annoying. 
Reported-by: Jens Glathe Suggested-by: Abel Vesa Suggested-by: Rob Clark Signed-off-by: Konrad Dybcio Reviewed-by: Neil Armstrong Patchwork: https://patchwork.freedesktop.org/patch/637062/ Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index 65d38b25c0707..699b0dd34b18f 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -813,10 +813,10 @@ static int a6xx_gmu_fw_load(struct a6xx_gmu *gmu) } ver = gmu_read(gmu, REG_A6XX_GMU_CORE_FW_VERSION); - DRM_INFO("Loaded GMU firmware v%u.%u.%u\n", - FIELD_GET(A6XX_GMU_CORE_FW_VERSION_MAJOR__MASK, ver), - FIELD_GET(A6XX_GMU_CORE_FW_VERSION_MINOR__MASK, ver), - FIELD_GET(A6XX_GMU_CORE_FW_VERSION_STEP__MASK, ver)); + DRM_INFO_ONCE("Loaded GMU firmware v%u.%u.%u\n", + FIELD_GET(A6XX_GMU_CORE_FW_VERSION_MAJOR__MASK, ver), + FIELD_GET(A6XX_GMU_CORE_FW_VERSION_MINOR__MASK, ver), + FIELD_GET(A6XX_GMU_CORE_FW_VERSION_STEP__MASK, ver)); return 0; } -- GitLab From 669c285620231786fffe9d87ab432e08a6ed922b Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 13 Jan 2025 07:48:41 -0800 Subject: [PATCH 653/989] drm/msm: Avoid rounding up to one jiffy If userspace is trying to achieve a timeout of zero, let 'em have it. Only round up if the timeout is greater than zero. Fixes: 4969bccd5f4e ("drm/msm: Avoid rounding down to zero jiffies") Signed-off-by: Rob Clark Reviewed-by: Akhil P Oommen Patchwork: https://patchwork.freedesktop.org/patch/632264/ --- drivers/gpu/drm/msm/msm_drv.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index fee31680a6d54..a650778552017 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -537,15 +537,12 @@ static inline int align_pitch(int width, int bpp) static inline unsigned long timeout_to_jiffies(const ktime_t *timeout) { ktime_t now = ktime_get(); - s64 remaining_jiffies; - if (ktime_compare(*timeout, now) < 0) { - remaining_jiffies = 0; - } else { - ktime_t rem = ktime_sub(*timeout, now); - remaining_jiffies = ktime_divns(rem, NSEC_PER_SEC / HZ); - } + if (ktime_compare(*timeout, now) <= 0) + return 0; + ktime_t rem = ktime_sub(*timeout, now); + s64 remaining_jiffies = ktime_divns(rem, NSEC_PER_SEC / HZ); return clamp(remaining_jiffies, 1LL, (s64)INT_MAX); } -- GitLab From d440148418f4816b4973ec6723bf63821793a0a7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 15 Feb 2025 09:28:55 -0800 Subject: [PATCH 654/989] tegra210-adma: fix 32-bit x86 build The Tegra210 Audio DMA controller driver did a plain divide: page_no = (res_page->start - res_base->start) / cdata->ch_base_offset; which causes problems on 32-bit x86 configurations that have 64-bit resource sizes: x86_64-linux-ld: drivers/dma/tegra210-adma.o: in function `tegra_adma_probe': tegra210-adma.c:(.text+0x1322): undefined reference to `__udivdi3' because gcc doesn't generate the trivial code for a 64-by-32 divide, turning it into a function call to do a full 64-by-64 divide. And the kernel intentionally doesn't provide that helper function, because 99% of the time all you want is the narrower version. Of course, tegra210 is a 64-bit architecture and the 32-bit x86 build is purely for build testing, so this really is just about build coverage failure. But build coverage is good. 
Side note: div_u64() would be suboptimal if you actually have a 32-bit resource_t, so our "helper" for divides are admittedly making it harder than it should be to generate good code for all the possible cases. At some point, I'll consider 32-bit x86 so entirely legacy that I can't find it in myself to care any more, and we'll just add the __udivdi3 library function. But for now, the right thing to do is to use "div_u64()" to show that you know that you are doing the simpler divide with a 32-bit number. And the build error enforces that. While fixing the build issue, also check for division-by-zero, and for overflow. Which hopefully cannot happen on real production hardware, but the value of 'ch_base_offset' can definitely be zero in other places. Reported-by: Guenter Roeck Signed-off-by: Linus Torvalds --- drivers/dma/tegra210-adma.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/dma/tegra210-adma.c b/drivers/dma/tegra210-adma.c index 6896da8ac7ef6..5c6a5b3589873 100644 --- a/drivers/dma/tegra210-adma.c +++ b/drivers/dma/tegra210-adma.c @@ -887,7 +887,7 @@ static int tegra_adma_probe(struct platform_device *pdev) const struct tegra_adma_chip_data *cdata; struct tegra_adma *tdma; struct resource *res_page, *res_base; - int ret, i, page_no; + int ret, i; cdata = of_device_get_match_data(&pdev->dev); if (!cdata) { @@ -914,9 +914,20 @@ static int tegra_adma_probe(struct platform_device *pdev) res_base = platform_get_resource_byname(pdev, IORESOURCE_MEM, "global"); if (res_base) { - page_no = (res_page->start - res_base->start) / cdata->ch_base_offset; - if (page_no <= 0) + resource_size_t page_offset, page_no; + unsigned int ch_base_offset; + + if (res_page->start < res_base->start) + return -EINVAL; + page_offset = res_page->start - res_base->start; + ch_base_offset = cdata->ch_base_offset; + if (!ch_base_offset) return -EINVAL; + + page_no = div_u64(page_offset, ch_base_offset); + if (!page_no || page_no > INT_MAX) + return -EINVAL; + tdma->ch_page_no = page_no - 1; tdma->base_addr = devm_ioremap_resource(&pdev->dev, res_base); if (IS_ERR(tdma->base_addr)) -- GitLab From 1b71c2fb04e7a713abc6edde4a412416ff3158f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 13 Feb 2025 15:55:17 +0100 Subject: [PATCH 655/989] kbuild: userprogs: fix bitsize and target detection on clang MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scripts/Makefile.clang was changed in the linked commit to move --target from KBUILD_CFLAGS to KBUILD_CPPFLAGS, as that generally has a broader scope. However that variable is not inspected by the userprogs logic, breaking cross compilation on clang. Use both variables to detect bitsize and target arguments for userprogs. 
Fixes: feb843a469fb ("kbuild: add $(CLANG_FLAGS) to KBUILD_CPPFLAGS") Cc: stable@vger.kernel.org Signed-off-by: Thomas Weißschuh Reviewed-by: Nathan Chancellor Signed-off-by: Masahiro Yamada --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 52207bcb1a9df..272db408be5ca 100644 --- a/Makefile +++ b/Makefile @@ -1120,8 +1120,8 @@ LDFLAGS_vmlinux += --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL) endif # Align the bit size of userspace programs with the kernel -KBUILD_USERCFLAGS += $(filter -m32 -m64 --target=%, $(KBUILD_CFLAGS)) -KBUILD_USERLDFLAGS += $(filter -m32 -m64 --target=%, $(KBUILD_CFLAGS)) +KBUILD_USERCFLAGS += $(filter -m32 -m64 --target=%, $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS)) +KBUILD_USERLDFLAGS += $(filter -m32 -m64 --target=%, $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS)) # make the checker run with the right architecture CHECKFLAGS += --arch=$(ARCH) -- GitLab From b28fb1f2ef45eeef1cd2c23149b50d184d545a3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 13 Feb 2025 17:04:29 +0100 Subject: [PATCH 656/989] modpost: Fix a few typos in a comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Namely: s/becasue/because/ and s/wiht/with/ plus an added article. Signed-off-by: Uwe Kleine-König Signed-off-by: Masahiro Yamada --- scripts/mod/modpost.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 36b28987a2f07..c35d22607978b 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -190,8 +190,8 @@ static struct module *new_module(const char *name, size_t namelen) /* * Set mod->is_gpl_compatible to true by default. If MODULE_LICENSE() - * is missing, do not check the use for EXPORT_SYMBOL_GPL() becasue - * modpost will exit wiht error anyway. + * is missing, do not check the use for EXPORT_SYMBOL_GPL() because + * modpost will exit with an error anyway. 
*/ mod->is_gpl_compatible = true; -- GitLab From 129fe718819cc5e24ea2f489db9ccd4371f0c6f6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Feb 2025 11:55:47 -0500 Subject: [PATCH 657/989] tracing: Do not allow mmap() of persistent ring buffer When trying to mmap a trace instance buffer that is attached to reserve_mem, it would crash: BUG: unable to handle page fault for address: ffffe97bd00025c8 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 2862f3067 P4D 2862f3067 PUD 0 Oops: Oops: 0000 [#1] PREEMPT_RT SMP PTI CPU: 4 UID: 0 PID: 981 Comm: mmap-rb Not tainted 6.14.0-rc2-test-00003-g7f1a5e3fbf9e-dirty #233 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 RIP: 0010:validate_page_before_insert+0x5/0xb0 Code: e2 01 89 d0 c3 cc cc cc cc 66 66 2e 0f 1f 84 00 00 00 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 <48> 8b 46 08 a8 01 75 67 66 90 48 89 f0 8b 50 34 85 d2 74 76 48 89 RSP: 0018:ffffb148c2f3f968 EFLAGS: 00010246 RAX: ffff9fa5d3322000 RBX: ffff9fa5ccff9c08 RCX: 00000000b879ed29 RDX: ffffe97bd00025c0 RSI: ffffe97bd00025c0 RDI: ffff9fa5ccff9c08 RBP: ffffb148c2f3f9f0 R08: 0000000000000004 R09: 0000000000000004 R10: 0000000000000000 R11: 0000000000000200 R12: 0000000000000000 R13: 00007f16a18d5000 R14: ffff9fa5c48db6a8 R15: 0000000000000000 FS: 00007f16a1b54740(0000) GS:ffff9fa73df00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffe97bd00025c8 CR3: 00000001048c6006 CR4: 0000000000172ef0 Call Trace: ? __die_body.cold+0x19/0x1f ? __die+0x2e/0x40 ? page_fault_oops+0x157/0x2b0 ? search_module_extables+0x53/0x80 ? validate_page_before_insert+0x5/0xb0 ? kernelmode_fixup_or_oops.isra.0+0x5f/0x70 ? __bad_area_nosemaphore+0x16e/0x1b0 ? bad_area_nosemaphore+0x16/0x20 ? do_kern_addr_fault+0x77/0x90 ? exc_page_fault+0x22b/0x230 ? asm_exc_page_fault+0x2b/0x30 ? validate_page_before_insert+0x5/0xb0 ? vm_insert_pages+0x151/0x400 __rb_map_vma+0x21f/0x3f0 ring_buffer_map+0x21b/0x2f0 tracing_buffers_mmap+0x70/0xd0 __mmap_region+0x6f0/0xbd0 mmap_region+0x7f/0x130 do_mmap+0x475/0x610 vm_mmap_pgoff+0xf2/0x1d0 ksys_mmap_pgoff+0x166/0x200 __x64_sys_mmap+0x37/0x50 x64_sys_call+0x1670/0x1d70 do_syscall_64+0xbb/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f The reason was that the code that maps the ring buffer pages to user space has: page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); And uses that in: vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); But virt_to_page() does not work with vmap()'d memory which is what the persistent ring buffer has. It is rather trivial to allow this, but for now just disable mmap() of instances that have their ring buffer from the reserve_mem option. If an mmap() is performed on a persistent buffer it will return -ENODEV just like it would if the .mmap field wasn't defined in the file_operations structure. 
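From user space this looks the same as running on a kernel whose file_operations has no .mmap handler, so a reader can probe once and fall back to read(2). A minimal sketch of such a probe, assuming a hypothetical reserve_mem-backed instance named boot_map (the path and single-page length are illustrative only; the full mapping protocol is described in Documentation/trace/ring-buffer-map.rst):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	const char *path =
		"/sys/kernel/tracing/instances/boot_map/per_cpu/cpu0/trace_pipe_raw";
	int fd = open(path, O_RDONLY | O_NONBLOCK);
	void *map;

	if (fd < 0)
		return 1;

	/* Try to map the first page; a persistent buffer now fails cleanly. */
	map = mmap(NULL, getpagesize(), PROT_READ, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		if (errno == ENODEV)
			fprintf(stderr, "mmap not supported, falling back to read()\n");
		close(fd);
		return 0;
	}

	munmap(map, getpagesize());
	close(fd);
	return 0;
}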
Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Vincent Donnefort Link: https://lore.kernel.org/20250214115547.0d7287d3@gandalf.local.home Fixes: 9b7bdf6f6ece6 ("tracing: Have trace_printk not use binary prints if boot buffer") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 25ff37aab00f1..0e6d517e74e0f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8279,6 +8279,10 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) struct trace_iterator *iter = &info->iter; int ret = 0; + /* Currently the boot mapped buffer is not supported for mmap */ + if (iter->tr->flags & TRACE_ARRAY_FL_BOOT) + return -ENODEV; + ret = get_snapshot_map(iter->tr); if (ret) return ret; -- GitLab From 97937834ae876f29565415ab15f1284666dc6be3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Feb 2025 12:35:12 -0500 Subject: [PATCH 658/989] ring-buffer: Update pages_touched to reflect persistent buffer content The pages_touched field represents the number of subbuffers in the ring buffer that have content that can be read. This is used in accounting of "dirty_pages" and "buffer_percent" to allow the user to wait for the buffer to be filled to a certain amount before it reads the buffer in blocking mode. The persistent buffer never updated this value so it was set to zero, and this accounting would take it as it had no content. This would cause user space to wait for content even though there's enough content in the ring buffer that satisfies the buffer_percent. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Vincent Donnefort Link: https://lore.kernel.org/20250214123512.0631436e@gandalf.local.home Fixes: 5f3b6e839f3ce ("ring-buffer: Validate boot range memory events") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 0419d41a20604..bb6089c2951e5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1850,6 +1850,11 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->cpu); goto invalid; } + + /* If the buffer has content, update pages_touched */ + if (ret) + local_inc(&cpu_buffer->pages_touched); + entries += ret; entry_bytes += local_read(&head_page->page->commit); local_set(&cpu_buffer->head_page->entries, ret); -- GitLab From 2f69e54584475ac85ea0e3407c9198ac7c6ea8ad Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sat, 14 Dec 2024 00:14:17 +0200 Subject: [PATCH 659/989] drm/msm/dpu: skip watchdog timer programming through TOP on >= SM8450 The SM8450 and later chips have DPU_MDP_PERIPH_0_REMOVED feature bit set, which means that those platforms have dropped some of the registers, including the WD TIMER-related ones. Stop providing the callback to program WD timer on those platforms. 
Fixes: 100d7ef6995d ("drm/msm/dpu: add support for SM8450") Signed-off-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/628874/ Link: https://lore.kernel.org/r/20241214-dpu-drop-features-v1-1-988f0662cb7e@linaro.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/dpu_hw_top.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_top.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_top.c index ad19330de61ab..562a3f4c5238a 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_top.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_top.c @@ -272,7 +272,7 @@ static void _setup_mdp_ops(struct dpu_hw_mdp_ops *ops, if (cap & BIT(DPU_MDP_VSYNC_SEL)) ops->setup_vsync_source = dpu_hw_setup_vsync_sel; - else + else if (!(cap & BIT(DPU_MDP_PERIPH_0_REMOVED))) ops->setup_vsync_source = dpu_hw_setup_wd_timer; ops->get_safe_status = dpu_hw_get_safe_status; -- GitLab From af0a4a2090cce732c70ad6c5f4145b43f39e3fe9 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sat, 14 Dec 2024 00:14:18 +0200 Subject: [PATCH 660/989] drm/msm/dpu: enable DPU_WB_INPUT_CTRL for DPU 5.x Several DPU 5.x platforms are supposed to be using DPU_WB_INPUT_CTRL, to bind WB and PINGPONG blocks, but they do not. Change those platforms to use WB_SM8250_MASK, which includes that bit. Fixes: 1f5bcc4316b3 ("drm/msm/dpu: enable writeback on SC8108X") Fixes: ab2b03d73a66 ("drm/msm/dpu: enable writeback on SM6125") Fixes: 47cebb740a83 ("drm/msm/dpu: enable writeback on SM8150") Signed-off-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/628876/ Link: https://lore.kernel.org/r/20241214-dpu-drop-features-v1-2-988f0662cb7e@linaro.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_0_sm8150.h | 2 +- drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_1_sc8180x.h | 2 +- drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_4_sm6125.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_0_sm8150.h b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_0_sm8150.h index 421afacb72480..36cc9dbc00b5c 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_0_sm8150.h +++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_0_sm8150.h @@ -297,7 +297,7 @@ static const struct dpu_wb_cfg sm8150_wb[] = { { .name = "wb_2", .id = WB_2, .base = 0x65000, .len = 0x2c8, - .features = WB_SDM845_MASK, + .features = WB_SM8250_MASK, .format_list = wb2_formats_rgb, .num_formats = ARRAY_SIZE(wb2_formats_rgb), .clk_ctrl = DPU_CLK_CTRL_WB2, diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_1_sc8180x.h b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_1_sc8180x.h index 641023b102bf5..e8eacdb47967a 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_1_sc8180x.h +++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_1_sc8180x.h @@ -304,7 +304,7 @@ static const struct dpu_wb_cfg sc8180x_wb[] = { { .name = "wb_2", .id = WB_2, .base = 0x65000, .len = 0x2c8, - .features = WB_SDM845_MASK, + .features = WB_SM8250_MASK, .format_list = wb2_formats_rgb, .num_formats = ARRAY_SIZE(wb2_formats_rgb), .clk_ctrl = DPU_CLK_CTRL_WB2, diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_4_sm6125.h b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_4_sm6125.h index d039b96beb97c..76f60a2df7a89 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_4_sm6125.h +++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_4_sm6125.h @@ -144,7 +144,7 @@ static const struct dpu_wb_cfg sm6125_wb[] = { { 
.name = "wb_2", .id = WB_2, .base = 0x65000, .len = 0x2c8, - .features = WB_SDM845_MASK, + .features = WB_SM8250_MASK, .format_list = wb2_formats_rgb, .num_formats = ARRAY_SIZE(wb2_formats_rgb), .clk_ctrl = DPU_CLK_CTRL_WB2, -- GitLab From 24b50b7340ab7e7b004ee6db43d625caa68498b0 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 17 Dec 2024 14:35:40 +0200 Subject: [PATCH 661/989] drm/msm/dpu: correct LM pairing for SM6150 The SM6150 platform doesn't have 3DMux (MERGE_3D) block, so it can not split the screen between two LMs. Drop lm_pair fields as they don't make sense for this platform. Suggested-by: Abhinav Kumar Fixes: cb2f9144693b ("drm/msm/dpu: Add SM6150 support") Signed-off-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/629377/ Link: https://lore.kernel.org/r/20241217-dpu-fix-sm6150-v2-1-9acc8f5addf3@linaro.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_3_sm6150.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_3_sm6150.h b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_3_sm6150.h index 621a2140f675f..d761ed705bac3 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_3_sm6150.h +++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_3_sm6150.h @@ -116,14 +116,12 @@ static const struct dpu_lm_cfg sm6150_lm[] = { .sblk = &sdm845_lm_sblk, .pingpong = PINGPONG_0, .dspp = DSPP_0, - .lm_pair = LM_1, }, { .name = "lm_1", .id = LM_1, .base = 0x45000, .len = 0x320, .features = MIXER_QCM2290_MASK, .sblk = &sdm845_lm_sblk, .pingpong = PINGPONG_1, - .lm_pair = LM_0, }, { .name = "lm_2", .id = LM_2, .base = 0x46000, .len = 0x320, -- GitLab From df9cf852ca3099feb8fed781bdd5d3863af001c8 Mon Sep 17 00:00:00 2001 From: Abhinav Kumar Date: Thu, 6 Feb 2025 11:46:36 -0800 Subject: [PATCH 662/989] drm/msm/dp: account for widebus and yuv420 during mode validation Widebus allows the DP controller to operate in 2 pixel per clock mode. The mode validation logic validates the mode->clock against the max DP pixel clock. However the max DP pixel clock limit assumes widebus is already enabled. Adjust the mode validation logic to only compare the adjusted pixel clock which accounts for widebus against the max DP pixel clock. Also fix the mode validation logic for YUV420 modes as in that case as well, only half the pixel clock is needed. 
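A worked example of the adjusted comparison (the numbers below are illustrative assumptions, not values taken from the driver):

/*
 * Assume the controller limit is 675000 kHz and userspace asks for a
 * 4K@120 mode with a nominal pixel clock of 1188000 kHz:
 *
 *   before: mode->clock = 1188000 > 675000         -> MODE_CLOCK_HIGH
 *   after:  mode_pclk_khz = 1188000 / 2 = 594000
 *           594000 <= 675000                       -> mode accepted
 *
 * With widebus the controller pushes two pixels per clock cycle, so the
 * clock it actually has to sustain is only half the mode's pixel clock;
 * the same halving applies to YUV420-only modes.
 */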
Cc: stable@vger.kernel.org Fixes: 757a2f36ab09 ("drm/msm/dp: enable widebus feature for display port") Fixes: 6db6e5606576 ("drm/msm/dp: change clock related programming for YUV420 over DP") Reviewed-by: Dmitry Baryshkov Tested-by: Dale Whinham Patchwork: https://patchwork.freedesktop.org/patch/635789/ Link: https://lore.kernel.org/r/20250206-dp-widebus-fix-v2-1-cb89a0313286@quicinc.com Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/dp/dp_display.c | 11 ++++++----- drivers/gpu/drm/msm/dp/dp_drm.c | 5 ++++- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/msm/dp/dp_display.c b/drivers/gpu/drm/msm/dp/dp_display.c index d852e7a853348..a129e26c3ddb3 100644 --- a/drivers/gpu/drm/msm/dp/dp_display.c +++ b/drivers/gpu/drm/msm/dp/dp_display.c @@ -930,16 +930,17 @@ enum drm_mode_status msm_dp_bridge_mode_valid(struct drm_bridge *bridge, return -EINVAL; } - if (mode->clock > DP_MAX_PIXEL_CLK_KHZ) - return MODE_CLOCK_HIGH; - msm_dp_display = container_of(dp, struct msm_dp_display_private, msm_dp_display); link_info = &msm_dp_display->panel->link_info; - if (drm_mode_is_420_only(&dp->connector->display_info, mode) && - msm_dp_display->panel->vsc_sdp_supported) + if ((drm_mode_is_420_only(&dp->connector->display_info, mode) && + msm_dp_display->panel->vsc_sdp_supported) || + msm_dp_wide_bus_available(dp)) mode_pclk_khz /= 2; + if (mode_pclk_khz > DP_MAX_PIXEL_CLK_KHZ) + return MODE_CLOCK_HIGH; + mode_bpp = dp->connector->display_info.bpc * num_components; if (!mode_bpp) mode_bpp = default_bpp; diff --git a/drivers/gpu/drm/msm/dp/dp_drm.c b/drivers/gpu/drm/msm/dp/dp_drm.c index d3e241ea69416..16b7913d1eefa 100644 --- a/drivers/gpu/drm/msm/dp/dp_drm.c +++ b/drivers/gpu/drm/msm/dp/dp_drm.c @@ -257,7 +257,10 @@ static enum drm_mode_status msm_edp_bridge_mode_valid(struct drm_bridge *bridge, return -EINVAL; } - if (mode->clock > DP_MAX_PIXEL_CLK_KHZ) + if (msm_dp_wide_bus_available(dp)) + mode_pclk_khz /= 2; + + if (mode_pclk_khz > DP_MAX_PIXEL_CLK_KHZ) return MODE_CLOCK_HIGH; /* -- GitLab From 978ca99d6bd87b84ff7788eea4d2c328a70530f6 Mon Sep 17 00:00:00 2001 From: Ethan Carter Edwards Date: Sun, 9 Feb 2025 22:51:54 -0500 Subject: [PATCH 663/989] drm/msm/dpu: Fix uninitialized variable There is a possibility for an uninitialized *ret* variable to be returned in some code paths. Fix this by initializing *ret* to 0. 
Addresses-Coverity-ID: 1642546 ("Uninitialized scalar variable") Fixes: 774bcfb73176 ("drm/msm/dpu: add support for virtual planes") Signed-off-by: Ethan Carter Edwards Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/636201/ Link: https://lore.kernel.org/r/20250209-dpu-v2-1-114dfd4ebefd@ethancedwards.com Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c index 098abc2c0003c..af3e541f60c30 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c @@ -1164,7 +1164,6 @@ int dpu_assign_plane_resources(struct dpu_global_state *global_state, unsigned int num_planes) { unsigned int i; - int ret; for (i = 0; i < num_planes; i++) { struct drm_plane_state *plane_state = states[i]; @@ -1173,13 +1172,13 @@ int dpu_assign_plane_resources(struct dpu_global_state *global_state, !plane_state->visible) continue; - ret = dpu_plane_virtual_assign_resources(crtc, global_state, + int ret = dpu_plane_virtual_assign_resources(crtc, global_state, state, plane_state); if (ret) - break; + return ret; } - return ret; + return 0; } static void dpu_plane_flush_csc(struct dpu_plane *pdpu, struct dpu_sw_pipe *pipe) -- GitLab From f063ac6b55df03ed25996bdc84d9e1c50147cfa1 Mon Sep 17 00:00:00 2001 From: Jessica Zhang Date: Tue, 11 Feb 2025 19:59:19 -0800 Subject: [PATCH 664/989] drm/msm/dpu: Disable dither in phys encoder cleanup Disable pingpong dither in dpu_encoder_helper_phys_cleanup(). This avoids the issue where an encoder unknowingly uses dither after reserving a pingpong block that was previously bound to an encoder that had enabled dither. Cc: stable@vger.kernel.org Reported-by: Dmitry Baryshkov Closes: https://lore.kernel.org/all/jr7zbj5w7iq4apg3gofuvcwf4r2swzqjk7sshwcdjll4mn6ctt@l2n3qfpujg3q/ Signed-off-by: Jessica Zhang Reviewed-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Fixes: 3c128638a07d ("drm/msm/dpu: add support for dither block in display") Patchwork: https://patchwork.freedesktop.org/patch/636517/ Link: https://lore.kernel.org/r/20250211-dither-disable-v1-1-ac2cb455f6b9@quicinc.com Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c index 5172ab4dea995..48e6e8d74c855 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c @@ -2281,6 +2281,9 @@ void dpu_encoder_helper_phys_cleanup(struct dpu_encoder_phys *phys_enc) } } + if (phys_enc->hw_pp && phys_enc->hw_pp->ops.setup_dither) + phys_enc->hw_pp->ops.setup_dither(phys_enc->hw_pp, NULL); + /* reset the merge 3D HW block */ if (phys_enc->hw_pp && phys_enc->hw_pp->merge_3d) { phys_enc->hw_pp->merge_3d->ops.setup_3d_mode(phys_enc->hw_pp->merge_3d, -- GitLab From 144429831f447223253a0e4376489f84ff37d1a7 Mon Sep 17 00:00:00 2001 From: Marijn Suijten Date: Tue, 11 Feb 2025 00:19:32 +0100 Subject: [PATCH 665/989] drm/msm/dpu: Don't leak bits_per_component into random DSC_ENC fields What used to be the input_10_bits boolean - feeding into the lowest bit of DSC_ENC - on MSM downstream turned into an accidental OR with the full bits_per_component number when it was ported to the upstream kernel. 
On typical bpc=8 setups we don't notice this because line_buf_depth is always an odd value (it contains bpc+1) and will also set the 4th bit after left-shifting by 3 (hence this |= bits_per_component is a no-op). Now that guards are being removed to allow more bits_per_component values besides 8 (possible since commit 49fd30a7153b ("drm/msm/dsi: use DRM DSC helpers for DSC setup")), a bpc of 10 will instead clash with the 5th bit which is convert_rgb. This is "fortunately" also always set to true by MSM's dsi_populate_dsc_params() already, but once a bpc of 12 starts being used it'll write into simple_422 which is normally false. To solve all these overlaps, simply replicate downstream code and only set this lowest bit if bits_per_component is equal to 10. It is unclear why DSC requires this only for bpc=10 but not bpc=12, and also notice that this lowest bit wasn't set previously despite having a panel and patch on the list using it without any mentioned issues. Fixes: c110cfd1753e ("drm/msm/disp/dpu1: Add support for DSC") Signed-off-by: Marijn Suijten Reviewed-by: Abhinav Kumar Reviewed-by: Konrad Dybcio Reviewed-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/636311/ Link: https://lore.kernel.org/r/20250211-dsc-10-bit-v1-1-1c85a9430d9a@somainline.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.c index 657200401f576..cec6d4e8baec4 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.c @@ -52,6 +52,7 @@ static void dpu_hw_dsc_config(struct dpu_hw_dsc *hw_dsc, u32 slice_last_group_size; u32 det_thresh_flatness; bool is_cmd_mode = !(mode & DSC_MODE_VIDEO); + bool input_10_bits = dsc->bits_per_component == 10; DPU_REG_WRITE(c, DSC_COMMON_MODE, mode); @@ -68,7 +69,7 @@ static void dpu_hw_dsc_config(struct dpu_hw_dsc *hw_dsc, data |= (dsc->line_buf_depth << 3); data |= (dsc->simple_422 << 2); data |= (dsc->convert_rgb << 1); - data |= dsc->bits_per_component; + data |= input_10_bits; DPU_REG_WRITE(c, DSC_ENC, data); -- GitLab From 5e192eefebaab5bdcf716add8910d7f8a2e30e3c Mon Sep 17 00:00:00 2001 From: Jessica Zhang Date: Wed, 8 Jan 2025 14:40:48 -0800 Subject: [PATCH 666/989] drm/msm/dpu: Drop extraneous return in dpu_crtc_reassign_planes() Drop extra return at the end of dpu_crtc_reassign_planes() Fixes: 774bcfb73176 ("drm/msm/dpu: add support for virtual planes") Signed-off-by: Jessica Zhang Reviewed-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/631565/ Link: https://lore.kernel.org/r/20250108-virtual-planes-fixes-v1-2-420cb36df94a@quicinc.com Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c index 7191b1a6d41b3..e5dcd41a361f4 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c @@ -1228,8 +1228,6 @@ static int dpu_crtc_reassign_planes(struct drm_crtc *crtc, struct drm_crtc_state done: kfree(states); return ret; - - return 0; } static int dpu_crtc_atomic_check(struct drm_crtc *crtc, -- GitLab From 588257897058a0b1aa47912db4fe93c6ff5e3887 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 14 Feb 2025 16:08:41 +0100 Subject: [PATCH 667/989] drm/msm/dsi/phy: Protect PHY_CMN_CLK_CFG0 updated from 
driver side PHY_CMN_CLK_CFG0 register is updated by the PHY driver and by two divider clocks from Common Clock Framework: devm_clk_hw_register_divider_parent_hw(). Concurrent access by the clocks side is protected with spinlock, however driver's side in restoring state is not. Restoring state is called from msm_dsi_phy_enable(), so there could be a path leading to concurrent and conflicting updates with clock framework. Add missing lock usage on the PHY driver side, encapsulated in its own function so the code will be still readable. While shuffling the code, define and use PHY_CMN_CLK_CFG0 bitfields to make the code more readable and obvious. Fixes: 1ef7c99d145c ("drm/msm/dsi: add support for 7nm DSI PHY/PLL") Reviewed-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Signed-off-by: Krzysztof Kozlowski Patchwork: https://patchwork.freedesktop.org/patch/637376/ Link: https://lore.kernel.org/r/20250214-drm-msm-phy-pll-cfg-reg-v3-1-0943b850722c@linaro.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c | 14 ++++++++++++-- .../gpu/drm/msm/registers/display/dsi_phy_7nm.xml | 5 ++++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c b/drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c index 031446c87daec..25ca649de717e 100644 --- a/drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c +++ b/drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c @@ -372,6 +372,15 @@ static void dsi_pll_enable_pll_bias(struct dsi_pll_7nm *pll) ndelay(250); } +static void dsi_pll_cmn_clk_cfg0_write(struct dsi_pll_7nm *pll, u32 val) +{ + unsigned long flags; + + spin_lock_irqsave(&pll->postdiv_lock, flags); + writel(val, pll->phy->base + REG_DSI_7nm_PHY_CMN_CLK_CFG0); + spin_unlock_irqrestore(&pll->postdiv_lock, flags); +} + static void dsi_pll_disable_global_clk(struct dsi_pll_7nm *pll) { u32 data; @@ -574,8 +583,9 @@ static int dsi_7nm_pll_restore_state(struct msm_dsi_phy *phy) val |= cached->pll_out_div; writel(val, pll_7nm->phy->pll_base + REG_DSI_7nm_PHY_PLL_PLL_OUTDIV_RATE); - writel(cached->bit_clk_div | (cached->pix_clk_div << 4), - phy_base + REG_DSI_7nm_PHY_CMN_CLK_CFG0); + dsi_pll_cmn_clk_cfg0_write(pll_7nm, + DSI_7nm_PHY_CMN_CLK_CFG0_DIV_CTRL_3_0(cached->bit_clk_div) | + DSI_7nm_PHY_CMN_CLK_CFG0_DIV_CTRL_7_4(cached->pix_clk_div)); val = readl(phy_base + REG_DSI_7nm_PHY_CMN_CLK_CFG1); val &= ~0x3; diff --git a/drivers/gpu/drm/msm/registers/display/dsi_phy_7nm.xml b/drivers/gpu/drm/msm/registers/display/dsi_phy_7nm.xml index d54b72f924493..e0bf6e016b4ce 100644 --- a/drivers/gpu/drm/msm/registers/display/dsi_phy_7nm.xml +++ b/drivers/gpu/drm/msm/registers/display/dsi_phy_7nm.xml @@ -9,7 +9,10 @@ xsi:schemaLocation="https://gitlab.freedesktop.org/freedreno/ rules-fd.xsd"> - + + + + -- GitLab From 5a97bc924ae0804b8dbf627e357acaa5ef761483 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 14 Feb 2025 16:08:42 +0100 Subject: [PATCH 668/989] drm/msm/dsi/phy: Protect PHY_CMN_CLK_CFG1 against clock driver PHY_CMN_CLK_CFG1 register is updated by the PHY driver and by a mux clock from Common Clock Framework: devm_clk_hw_register_mux_parent_hws(). There could be a path leading to concurrent and conflicting updates between PHY driver and clock framework, e.g. changing the mux and enabling PLL clocks. Add dedicated spinlock to be sure all PHY_CMN_CLK_CFG1 updates are synchronized. While shuffling the code, define and use PHY_CMN_CLK_CFG1 bitfields to make the code more readable and obvious. 
Fixes: 1ef7c99d145c ("drm/msm/dsi: add support for 7nm DSI PHY/PLL") Signed-off-by: Krzysztof Kozlowski Reviewed-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/637378/ Link: https://lore.kernel.org/r/20250214-drm-msm-phy-pll-cfg-reg-v3-2-0943b850722c@linaro.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c | 35 ++++++++++++------- .../drm/msm/registers/display/dsi_phy_7nm.xml | 5 ++- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c b/drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c index 25ca649de717e..388017db45d80 100644 --- a/drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c +++ b/drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c @@ -83,6 +83,9 @@ struct dsi_pll_7nm { /* protects REG_DSI_7nm_PHY_CMN_CLK_CFG0 register */ spinlock_t postdiv_lock; + /* protects REG_DSI_7nm_PHY_CMN_CLK_CFG1 register */ + spinlock_t pclk_mux_lock; + struct pll_7nm_cached_state cached_state; struct dsi_pll_7nm *slave; @@ -381,22 +384,32 @@ static void dsi_pll_cmn_clk_cfg0_write(struct dsi_pll_7nm *pll, u32 val) spin_unlock_irqrestore(&pll->postdiv_lock, flags); } -static void dsi_pll_disable_global_clk(struct dsi_pll_7nm *pll) +static void dsi_pll_cmn_clk_cfg1_update(struct dsi_pll_7nm *pll, u32 mask, + u32 val) { + unsigned long flags; u32 data; + spin_lock_irqsave(&pll->pclk_mux_lock, flags); data = readl(pll->phy->base + REG_DSI_7nm_PHY_CMN_CLK_CFG1); - writel(data & ~BIT(5), pll->phy->base + REG_DSI_7nm_PHY_CMN_CLK_CFG1); + data &= ~mask; + data |= val & mask; + + writel(data, pll->phy->base + REG_DSI_7nm_PHY_CMN_CLK_CFG1); + spin_unlock_irqrestore(&pll->pclk_mux_lock, flags); +} + +static void dsi_pll_disable_global_clk(struct dsi_pll_7nm *pll) +{ + dsi_pll_cmn_clk_cfg1_update(pll, DSI_7nm_PHY_CMN_CLK_CFG1_CLK_EN, 0); } static void dsi_pll_enable_global_clk(struct dsi_pll_7nm *pll) { - u32 data; + u32 cfg_1 = DSI_7nm_PHY_CMN_CLK_CFG1_CLK_EN | DSI_7nm_PHY_CMN_CLK_CFG1_CLK_EN_SEL; writel(0x04, pll->phy->base + REG_DSI_7nm_PHY_CMN_CTRL_3); - - data = readl(pll->phy->base + REG_DSI_7nm_PHY_CMN_CLK_CFG1); - writel(data | BIT(5) | BIT(4), pll->phy->base + REG_DSI_7nm_PHY_CMN_CLK_CFG1); + dsi_pll_cmn_clk_cfg1_update(pll, cfg_1, cfg_1); } static void dsi_pll_phy_dig_reset(struct dsi_pll_7nm *pll) @@ -574,7 +587,6 @@ static int dsi_7nm_pll_restore_state(struct msm_dsi_phy *phy) { struct dsi_pll_7nm *pll_7nm = to_pll_7nm(phy->vco_hw); struct pll_7nm_cached_state *cached = &pll_7nm->cached_state; - void __iomem *phy_base = pll_7nm->phy->base; u32 val; int ret; @@ -586,11 +598,7 @@ static int dsi_7nm_pll_restore_state(struct msm_dsi_phy *phy) dsi_pll_cmn_clk_cfg0_write(pll_7nm, DSI_7nm_PHY_CMN_CLK_CFG0_DIV_CTRL_3_0(cached->bit_clk_div) | DSI_7nm_PHY_CMN_CLK_CFG0_DIV_CTRL_7_4(cached->pix_clk_div)); - - val = readl(phy_base + REG_DSI_7nm_PHY_CMN_CLK_CFG1); - val &= ~0x3; - val |= cached->pll_mux; - writel(val, phy_base + REG_DSI_7nm_PHY_CMN_CLK_CFG1); + dsi_pll_cmn_clk_cfg1_update(pll_7nm, 0x3, cached->pll_mux); ret = dsi_pll_7nm_vco_set_rate(phy->vco_hw, pll_7nm->vco_current_rate, @@ -743,7 +751,7 @@ static int pll_7nm_register(struct dsi_pll_7nm *pll_7nm, struct clk_hw **provide pll_by_2_bit, }), 2, 0, pll_7nm->phy->base + REG_DSI_7nm_PHY_CMN_CLK_CFG1, - 0, 1, 0, NULL); + 0, 1, 0, &pll_7nm->pclk_mux_lock); if (IS_ERR(hw)) { ret = PTR_ERR(hw); goto fail; @@ -788,6 +796,7 @@ static int dsi_pll_7nm_init(struct msm_dsi_phy *phy) pll_7nm_list[phy->id] = pll_7nm; spin_lock_init(&pll_7nm->postdiv_lock); + 
spin_lock_init(&pll_7nm->pclk_mux_lock); pll_7nm->phy = phy; diff --git a/drivers/gpu/drm/msm/registers/display/dsi_phy_7nm.xml b/drivers/gpu/drm/msm/registers/display/dsi_phy_7nm.xml index e0bf6e016b4ce..cfaf78c028b13 100644 --- a/drivers/gpu/drm/msm/registers/display/dsi_phy_7nm.xml +++ b/drivers/gpu/drm/msm/registers/display/dsi_phy_7nm.xml @@ -13,7 +13,10 @@ xsi:schemaLocation="https://gitlab.freedesktop.org/freedreno/ rules-fd.xsd"> - + + + + -- GitLab From 73f69c6be2a9f22c31c775ec03c6c286bfe12cfa Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 14 Feb 2025 16:08:43 +0100 Subject: [PATCH 669/989] drm/msm/dsi/phy: Do not overwite PHY_CMN_CLK_CFG1 when choosing bitclk source PHY_CMN_CLK_CFG1 register has four fields being used in the driver: DSI clock divider, source of bitclk and two for enabling the DSI PHY PLL clocks. dsi_7nm_set_usecase() sets only the source of bitclk, so should leave all other bits untouched. Use newly introduced dsi_pll_cmn_clk_cfg1_update() to update respective bits without overwriting the rest. While shuffling the code, define and use PHY_CMN_CLK_CFG1 bitfields to make the code more readable and obvious. Fixes: 1ef7c99d145c ("drm/msm/dsi: add support for 7nm DSI PHY/PLL") Signed-off-by: Krzysztof Kozlowski Reviewed-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/637380/ Link: https://lore.kernel.org/r/20250214-drm-msm-phy-pll-cfg-reg-v3-3-0943b850722c@linaro.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c | 4 ++-- drivers/gpu/drm/msm/registers/display/dsi_phy_7nm.xml | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c b/drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c index 388017db45d80..798168180c1ab 100644 --- a/drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c +++ b/drivers/gpu/drm/msm/dsi/phy/dsi_phy_7nm.c @@ -617,7 +617,6 @@ static int dsi_7nm_pll_restore_state(struct msm_dsi_phy *phy) static int dsi_7nm_set_usecase(struct msm_dsi_phy *phy) { struct dsi_pll_7nm *pll_7nm = to_pll_7nm(phy->vco_hw); - void __iomem *base = phy->base; u32 data = 0x0; /* internal PLL */ DBG("DSI PLL%d", pll_7nm->phy->id); @@ -636,7 +635,8 @@ static int dsi_7nm_set_usecase(struct msm_dsi_phy *phy) } /* set PLL src */ - writel(data << 2, base + REG_DSI_7nm_PHY_CMN_CLK_CFG1); + dsi_pll_cmn_clk_cfg1_update(pll_7nm, DSI_7nm_PHY_CMN_CLK_CFG1_BITCLK_SEL__MASK, + DSI_7nm_PHY_CMN_CLK_CFG1_BITCLK_SEL(data)); return 0; } diff --git a/drivers/gpu/drm/msm/registers/display/dsi_phy_7nm.xml b/drivers/gpu/drm/msm/registers/display/dsi_phy_7nm.xml index cfaf78c028b13..35f7f40e405b7 100644 --- a/drivers/gpu/drm/msm/registers/display/dsi_phy_7nm.xml +++ b/drivers/gpu/drm/msm/registers/display/dsi_phy_7nm.xml @@ -16,6 +16,7 @@ xsi:schemaLocation="https://gitlab.freedesktop.org/freedreno/ rules-fd.xsd"> + -- GitLab From 0ad2507d5d93f39619fc42372c347d6006b64319 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 16 Feb 2025 14:02:44 -0800 Subject: [PATCH 670/989] Linux 6.14-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 272db408be5ca..96407c1d6be16 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 14 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Baby Opossum Posse # *DOCUMENTATION* -- GitLab From 654292a0b264e9b8c51b98394146218a21612aa1 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Sun, 16 Feb 2025 18:02:47 -0300 Subject: [PATCH 671/989] smb: 
client: fix chmod(2) regression with ATTR_READONLY When the user sets a file or directory as read-only (e.g. ~S_IWUGO), the client will set the ATTR_READONLY attribute by sending an SMB2_SET_INFO request to the server in cifs_setattr_{,nounix}(), but cifsInodeInfo::cifsAttrs will be left unchanged as the client will only update the new file attributes in the next call to {smb311_posix,cifs}_get_inode_info() with the new metadata filled in @data parameter. Commit a18280e7fdea ("smb: cilent: set reparse mount points as automounts") mistakenly removed the @data NULL check when calling is_inode_cache_good(), which broke the above case as the new ATTR_READONLY attribute would end up not being updated on files with a read lease. Fix this by updating the inode whenever we have cached metadata in @data parameter. Reported-by: Horst Reiterer Closes: https://lore.kernel.org/r/85a16504e09147a195ac0aac1c801280@fabasoft.com Fixes: a18280e7fdea ("smb: cilent: set reparse mount points as automounts") Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 9cc31cf6ebd07..3261190e6f903 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1408,7 +1408,7 @@ int cifs_get_inode_info(struct inode **inode, struct cifs_fattr fattr = {}; int rc; - if (is_inode_cache_good(*inode)) { + if (!data && is_inode_cache_good(*inode)) { cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); return 0; } @@ -1507,7 +1507,7 @@ int smb311_posix_get_inode_info(struct inode **inode, struct cifs_fattr fattr = {}; int rc; - if (is_inode_cache_good(*inode)) { + if (!data && is_inode_cache_good(*inode)) { cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); return 0; } -- GitLab From 7fb39882b20c98a9a393c244c86b56ef6933cff8 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Sun, 16 Feb 2025 11:42:09 +0100 Subject: [PATCH 672/989] dm-integrity: Avoid divide by zero in table status in Inline mode In Inline mode, the journal is unused, and journal_sectors is zero. Calculating the journal watermark requires dividing by journal_sectors, which should be done only if the journal is configured. Otherwise, a simple table query (dmsetup table) can cause OOPS. This bug did not show on some systems, perhaps only due to compiler optimization. On my 32-bit testing machine, this reliably crashes with the following: : Oops: divide error: 0000 [#1] PREEMPT SMP : CPU: 0 UID: 0 PID: 2450 Comm: dmsetup Not tainted 6.14.0-rc2+ #959 : EIP: dm_integrity_status+0x2f8/0xab0 [dm_integrity] ... 
Signed-off-by: Milan Broz Signed-off-by: Mikulas Patocka Fixes: fb0987682c62 ("dm-integrity: introduce the Inline mode") Cc: stable@vger.kernel.org # 6.11+ --- drivers/md/dm-integrity.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index ee9f7cecd78e0..555dc06b94228 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -3790,10 +3790,6 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: { - __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100; - - watermark_percentage += ic->journal_entries / 2; - do_div(watermark_percentage, ic->journal_entries); arg_count = 3; arg_count += !!ic->meta_dev; arg_count += ic->sectors_per_block != 1; @@ -3826,6 +3822,10 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors); if (ic->mode == 'J') { + __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100; + + watermark_percentage += ic->journal_entries / 2; + do_div(watermark_percentage, ic->journal_entries); DMEMIT(" journal_watermark:%u", (unsigned int)watermark_percentage); DMEMIT(" commit_time:%u", ic->autocommit_msec); } -- GitLab From c19525b5fb71a355079063bb14adcddae60cf922 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Sun, 16 Feb 2025 11:42:10 +0100 Subject: [PATCH 673/989] dm-integrity: Do not emit journal configuration in DM table for Inline mode The Inline mode does not use a journal; it makes no sense to print journal information in DM table. Print it only if the journal is used. The same applies to interleave_sectors (unused for Inline mode). Also, add comments for arg_count, as the current calculation is quite obscure. 
Signed-off-by: Milan Broz Signed-off-by: Mikulas Patocka --- drivers/md/dm-integrity.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 555dc06b94228..c45464b6576aa 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -3790,16 +3790,18 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: { - arg_count = 3; + arg_count = 1; /* buffer_sectors */ arg_count += !!ic->meta_dev; arg_count += ic->sectors_per_block != 1; arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)); arg_count += ic->reset_recalculate_flag; arg_count += ic->discard; - arg_count += ic->mode == 'J'; - arg_count += ic->mode == 'J'; - arg_count += ic->mode == 'B'; - arg_count += ic->mode == 'B'; + arg_count += ic->mode != 'I'; /* interleave_sectors */ + arg_count += ic->mode == 'J'; /* journal_sectors */ + arg_count += ic->mode == 'J'; /* journal_watermark */ + arg_count += ic->mode == 'J'; /* commit_time */ + arg_count += ic->mode == 'B'; /* sectors_per_bit */ + arg_count += ic->mode == 'B'; /* bitmap_flush_interval */ arg_count += !!ic->internal_hash_alg.alg_string; arg_count += !!ic->journal_crypt_alg.alg_string; arg_count += !!ic->journal_mac_alg.alg_string; @@ -3818,14 +3820,15 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(" reset_recalculate"); if (ic->discard) DMEMIT(" allow_discards"); - DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); - DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); + if (ic->mode != 'I') + DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors); if (ic->mode == 'J') { __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100; watermark_percentage += ic->journal_entries / 2; do_div(watermark_percentage, ic->journal_entries); + DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); DMEMIT(" journal_watermark:%u", (unsigned int)watermark_percentage); DMEMIT(" commit_time:%u", ic->autocommit_msec); } -- GitLab From 9d846b1aebbe488f245f1aa463802ff9c34cc078 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 10 Feb 2025 11:51:55 +0100 Subject: [PATCH 674/989] gpiolib: check the return value of gpio_chip::get_direction() As per the API contract - gpio_chip::get_direction() may fail and return a negative error number. However, we treat it as if it always returned 0 or 1. Check the return value of the callback and propagate the error number up the stack. 
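For context, the driver-side callback is allowed to fail, for example when the direction register sits behind an I2C or SPI bus. A minimal sketch of such a callback (hypothetical driver and register name, not taken from this patch):

static int foo_gpio_get_direction(struct gpio_chip *gc, unsigned int offset)
{
	struct foo_gpio *foo = gpiochip_get_data(gc);
	unsigned int dir;
	int ret;

	/* The bus access itself can fail; that error must reach gpiolib. */
	ret = regmap_read(foo->regmap, FOO_GPIO_DIR_REG, &dir);
	if (ret)
		return ret;

	return (dir & BIT(offset)) ? GPIO_LINE_DIRECTION_OUT :
				     GPIO_LINE_DIRECTION_IN;
}

With the checks in place, a negative return from a callback like this is propagated to the caller instead of being folded into a direction guess.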
Cc: stable@vger.kernel.org Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250210-gpio-sanitize-retvals-v1-1-12ea88506cb2@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 44 ++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index ca2f58a2cd45e..29110dc436f15 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1057,8 +1057,11 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data, desc->gdev = gdev; if (gc->get_direction && gpiochip_line_is_valid(gc, desc_index)) { - assign_bit(FLAG_IS_OUT, - &desc->flags, !gc->get_direction(gc, desc_index)); + ret = gc->get_direction(gc, desc_index); + if (ret < 0) + goto err_cleanup_desc_srcu; + + assign_bit(FLAG_IS_OUT, &desc->flags, !ret); } else { assign_bit(FLAG_IS_OUT, &desc->flags, !gc->direction_input); @@ -2728,13 +2731,18 @@ int gpiod_direction_input_nonotify(struct gpio_desc *desc) if (guard.gc->direction_input) { ret = guard.gc->direction_input(guard.gc, gpio_chip_hwgpio(desc)); - } else if (guard.gc->get_direction && - (guard.gc->get_direction(guard.gc, - gpio_chip_hwgpio(desc)) != 1)) { - gpiod_warn(desc, - "%s: missing direction_input() operation and line is output\n", - __func__); - return -EIO; + } else if (guard.gc->get_direction) { + ret = guard.gc->get_direction(guard.gc, + gpio_chip_hwgpio(desc)); + if (ret < 0) + return ret; + + if (ret != GPIO_LINE_DIRECTION_IN) { + gpiod_warn(desc, + "%s: missing direction_input() operation and line is output\n", + __func__); + return -EIO; + } } if (ret == 0) { clear_bit(FLAG_IS_OUT, &desc->flags); @@ -2771,12 +2779,18 @@ static int gpiod_direction_output_raw_commit(struct gpio_desc *desc, int value) gpio_chip_hwgpio(desc), val); } else { /* Check that we are in output mode if we can */ - if (guard.gc->get_direction && - guard.gc->get_direction(guard.gc, gpio_chip_hwgpio(desc))) { - gpiod_warn(desc, - "%s: missing direction_output() operation\n", - __func__); - return -EIO; + if (guard.gc->get_direction) { + ret = guard.gc->get_direction(guard.gc, + gpio_chip_hwgpio(desc)); + if (ret < 0) + return ret; + + if (ret != GPIO_LINE_DIRECTION_OUT) { + gpiod_warn(desc, + "%s: missing direction_output() operation\n", + __func__); + return -EIO; + } } /* * If we can't actively set the direction, we are some -- GitLab From 4e667a1968099c6deadee2313ecd648f8f0a8956 Mon Sep 17 00:00:00 2001 From: Johan Korsnes Date: Mon, 17 Feb 2025 10:16:43 +0100 Subject: [PATCH 675/989] gpio: vf610: add locking to gpio direction functions Add locking to `vf610_gpio_direction_input|output()` functions. Without this locking, a race condition exists between concurrent calls to these functions, potentially leading to incorrect GPIO direction settings. To verify the correctness of this fix, a `trylock` patch was applied, where after a couple of reboots the race was confirmed. I.e., one user had to wait before acquiring the lock. With this patch the race has not been encountered. It's worth mentioning that any type of debugging (printing, tracing, etc.) would "resolve"/hide the issue. 
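The window being closed is the classic lost-update interleaving on the shared direction register, roughly (illustrative sketch, not an actual trace):

/*
 * CPU A: direction_output(3)          CPU B: direction_input(5)
 *   val = readl(PDDR);  // 0x00         val = readl(PDDR);  // 0x00
 *   val |= BIT(3);      // 0x08
 *                                       val &= ~BIT(5);     // 0x00
 *   writel(val, PDDR);  // 0x08
 *                                       writel(val, PDDR);  // 0x00, BIT(3) lost
 *
 * With port->lock held around each read-modify-write of PDDR the two
 * updates serialize and neither direction bit can be dropped.
 */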
Fixes: 659d8a62311f ("gpio: vf610: add imx7ulp support") Signed-off-by: Johan Korsnes Reviewed-by: Linus Walleij Reviewed-by: Haibo Chen Cc: Bartosz Golaszewski Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250217091643.679644-1-johan.korsnes@remarkable.no Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-vf610.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpio/gpio-vf610.c b/drivers/gpio/gpio-vf610.c index c4f34a347cb6e..c36a9dbccd4dd 100644 --- a/drivers/gpio/gpio-vf610.c +++ b/drivers/gpio/gpio-vf610.c @@ -36,6 +36,7 @@ struct vf610_gpio_port { struct clk *clk_port; struct clk *clk_gpio; int irq; + spinlock_t lock; /* protect gpio direction registers */ }; #define GPIO_PDOR 0x00 @@ -124,6 +125,7 @@ static int vf610_gpio_direction_input(struct gpio_chip *chip, unsigned int gpio) u32 val; if (port->sdata->have_paddr) { + guard(spinlock_irqsave)(&port->lock); val = vf610_gpio_readl(port->gpio_base + GPIO_PDDR); val &= ~mask; vf610_gpio_writel(val, port->gpio_base + GPIO_PDDR); @@ -142,6 +144,7 @@ static int vf610_gpio_direction_output(struct gpio_chip *chip, unsigned int gpio vf610_gpio_set(chip, gpio, value); if (port->sdata->have_paddr) { + guard(spinlock_irqsave)(&port->lock); val = vf610_gpio_readl(port->gpio_base + GPIO_PDDR); val |= mask; vf610_gpio_writel(val, port->gpio_base + GPIO_PDDR); @@ -297,6 +300,7 @@ static int vf610_gpio_probe(struct platform_device *pdev) return -ENOMEM; port->sdata = device_get_match_data(dev); + spin_lock_init(&port->lock); dual_base = port->sdata->have_dual_base; -- GitLab From c7db342e3b4744688be1e27e31254c1d31a35274 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Mon, 17 Feb 2025 09:45:08 +0100 Subject: [PATCH 676/989] riscv: KVM: Fix hart suspend status check "Not stopped" means started or suspended so we need to check for a single state in order to have a chance to check for each state. Also, we need to use target_vcpu when checking for the suspend state. Fixes: 763c8bed8c05 ("RISC-V: KVM: Implement SBI HSM suspend call") Signed-off-by: Andrew Jones Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20250217084506.18763-8-ajones@ventanamicro.com Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_sbi_hsm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/riscv/kvm/vcpu_sbi_hsm.c b/arch/riscv/kvm/vcpu_sbi_hsm.c index dce667f4b6ab0..13a35eb77e8e3 100644 --- a/arch/riscv/kvm/vcpu_sbi_hsm.c +++ b/arch/riscv/kvm/vcpu_sbi_hsm.c @@ -79,12 +79,12 @@ static int kvm_sbi_hsm_vcpu_get_status(struct kvm_vcpu *vcpu) target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, target_vcpuid); if (!target_vcpu) return SBI_ERR_INVALID_PARAM; - if (!kvm_riscv_vcpu_stopped(target_vcpu)) - return SBI_HSM_STATE_STARTED; - else if (vcpu->stat.generic.blocking) + if (kvm_riscv_vcpu_stopped(target_vcpu)) + return SBI_HSM_STATE_STOPPED; + else if (target_vcpu->stat.generic.blocking) return SBI_HSM_STATE_SUSPENDED; else - return SBI_HSM_STATE_STOPPED; + return SBI_HSM_STATE_STARTED; } static int kvm_sbi_ext_hsm_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, -- GitLab From e3219b0c491f2aa0e0b200a39d3352ab05cdda96 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Mon, 17 Feb 2025 09:45:09 +0100 Subject: [PATCH 677/989] riscv: KVM: Fix hart suspend_type use The spec says suspend_type is 32 bits wide and "In case the data is defined as 32bit wide, higher privilege software must ensure that it only uses 32 bit data." Mask off upper bits of suspend_type before using it. 
Fixes: 763c8bed8c05 ("RISC-V: KVM: Implement SBI HSM suspend call") Signed-off-by: Andrew Jones Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20250217084506.18763-9-ajones@ventanamicro.com Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_sbi_hsm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kvm/vcpu_sbi_hsm.c b/arch/riscv/kvm/vcpu_sbi_hsm.c index 13a35eb77e8e3..3070bb31745de 100644 --- a/arch/riscv/kvm/vcpu_sbi_hsm.c +++ b/arch/riscv/kvm/vcpu_sbi_hsm.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -109,7 +110,7 @@ static int kvm_sbi_ext_hsm_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, } return 0; case SBI_EXT_HSM_HART_SUSPEND: - switch (cp->a0) { + switch (lower_32_bits(cp->a0)) { case SBI_HSM_SUSPEND_RET_DEFAULT: kvm_riscv_vcpu_wfi(vcpu); break; -- GitLab From 0611f78f83c93c000029ab01daa28166d03590ed Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Mon, 17 Feb 2025 09:45:10 +0100 Subject: [PATCH 678/989] riscv: KVM: Fix SBI IPI error generation When an invalid function ID of an SBI extension is used we should return not-supported, not invalid-param. Also, when we see that at least one hartid constructed from the base and mask parameters is invalid, then we should return invalid-param. Finally, rather than relying on overflowing a left shift to result in zero and then using that zero in a condition which [correctly] skips sending an IPI (but loops unnecessarily), explicitly check for overflow and exit the loop immediately. Fixes: 5f862df5585c ("RISC-V: KVM: Add v0.1 replacement SBI extensions defined in v0.2") Signed-off-by: Andrew Jones Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20250217084506.18763-10-ajones@ventanamicro.com Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_sbi_replace.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c index 9c2ab3dfa93aa..74e3a38c6a29e 100644 --- a/arch/riscv/kvm/vcpu_sbi_replace.c +++ b/arch/riscv/kvm/vcpu_sbi_replace.c @@ -51,9 +51,10 @@ static int kvm_sbi_ext_ipi_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, struct kvm_cpu_context *cp = &vcpu->arch.guest_context; unsigned long hmask = cp->a0; unsigned long hbase = cp->a1; + unsigned long hart_bit = 0, sentmask = 0; if (cp->a6 != SBI_EXT_IPI_SEND_IPI) { - retdata->err_val = SBI_ERR_INVALID_PARAM; + retdata->err_val = SBI_ERR_NOT_SUPPORTED; return 0; } @@ -62,15 +63,23 @@ static int kvm_sbi_ext_ipi_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, if (hbase != -1UL) { if (tmp->vcpu_id < hbase) continue; - if (!(hmask & (1UL << (tmp->vcpu_id - hbase)))) + hart_bit = tmp->vcpu_id - hbase; + if (hart_bit >= __riscv_xlen) + goto done; + if (!(hmask & (1UL << hart_bit))) continue; } ret = kvm_riscv_vcpu_set_interrupt(tmp, IRQ_VS_SOFT); if (ret < 0) break; + sentmask |= 1UL << hart_bit; kvm_riscv_vcpu_pmu_incr_fw(tmp, SBI_PMU_FW_IPI_RCVD); } +done: + if (hbase != -1UL && (hmask ^ sentmask)) + retdata->err_val = SBI_ERR_INVALID_PARAM; + return ret; } -- GitLab From b901484852992cf3d162a5eab72251cc813ca624 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Mon, 17 Feb 2025 09:45:11 +0100 Subject: [PATCH 679/989] riscv: KVM: Fix SBI TIME error generation When an invalid function ID of an SBI extension is used we should return not-supported, not invalid-param. 
Fixes: 5f862df5585c ("RISC-V: KVM: Add v0.1 replacement SBI extensions defined in v0.2") Signed-off-by: Andrew Jones Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20250217084506.18763-11-ajones@ventanamicro.com Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_sbi_replace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c index 74e3a38c6a29e..5fbf3f94f1e85 100644 --- a/arch/riscv/kvm/vcpu_sbi_replace.c +++ b/arch/riscv/kvm/vcpu_sbi_replace.c @@ -21,7 +21,7 @@ static int kvm_sbi_ext_time_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, u64 next_cycle; if (cp->a6 != SBI_EXT_TIME_SET_TIMER) { - retdata->err_val = SBI_ERR_INVALID_PARAM; + retdata->err_val = SBI_ERR_NOT_SUPPORTED; return 0; } -- GitLab From 351e02b1733b057e33fe13fc03ca93ec799e4f78 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Mon, 17 Feb 2025 09:45:12 +0100 Subject: [PATCH 680/989] riscv: KVM: Fix SBI sleep_type use The spec says sleep_type is 32 bits wide and "In case the data is defined as 32bit wide, higher privilege software must ensure that it only uses 32 bit data." Mask off upper bits of sleep_type before using it. Fixes: 023c15151fbb ("RISC-V: KVM: Add SBI system suspend support") Signed-off-by: Andrew Jones Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20250217084506.18763-12-ajones@ventanamicro.com Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_sbi_system.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kvm/vcpu_sbi_system.c b/arch/riscv/kvm/vcpu_sbi_system.c index 5d55e08791fa1..bc0ebba890037 100644 --- a/arch/riscv/kvm/vcpu_sbi_system.c +++ b/arch/riscv/kvm/vcpu_sbi_system.c @@ -4,6 +4,7 @@ */ #include +#include #include #include @@ -19,7 +20,7 @@ static int kvm_sbi_ext_susp_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, switch (funcid) { case SBI_EXT_SUSP_SYSTEM_SUSPEND: - if (cp->a0 != SBI_SUSP_SLEEP_TYPE_SUSPEND_TO_RAM) { + if (lower_32_bits(cp->a0) != SBI_SUSP_SLEEP_TYPE_SUSPEND_TO_RAM) { retdata->err_val = SBI_ERR_INVALID_PARAM; return 0; } -- GitLab From 539bd20352832b9244238a055eb169ccf1c41ff6 Mon Sep 17 00:00:00 2001 From: Amit Kumar Mahapatra Date: Thu, 13 Feb 2025 11:15:46 +0530 Subject: [PATCH 681/989] mtd: spi-nor: sst: Fix SST write failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'commit 18bcb4aa54ea ("mtd: spi-nor: sst: Factor out common write operation to `sst_nor_write_data()`")' introduced a bug where only one byte of data is written, regardless of the number of bytes passed to sst_nor_write_data(), causing a kernel crash during the write operation. Ensure the correct number of bytes are written as passed to sst_nor_write_data(). 
Call trace: [ 57.400180] ------------[ cut here ]------------ [ 57.404842] While writing 2 byte written 1 bytes [ 57.409493] WARNING: CPU: 0 PID: 737 at drivers/mtd/spi-nor/sst.c:187 sst_nor_write_data+0x6c/0x74 [ 57.418464] Modules linked in: [ 57.421517] CPU: 0 UID: 0 PID: 737 Comm: mtd_debug Not tainted 6.12.0-g5ad04afd91f9 #30 [ 57.429517] Hardware name: Xilinx Versal A2197 Processor board revA - x-prc-02 revA (DT) [ 57.437600] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 57.444557] pc : sst_nor_write_data+0x6c/0x74 [ 57.448911] lr : sst_nor_write_data+0x6c/0x74 [ 57.453264] sp : ffff80008232bb40 [ 57.456570] x29: ffff80008232bb40 x28: 0000000000010000 x27: 0000000000000001 [ 57.463708] x26: 000000000000ffff x25: 0000000000000000 x24: 0000000000000000 [ 57.470843] x23: 0000000000010000 x22: ffff80008232bbf0 x21: ffff000816230000 [ 57.477978] x20: ffff0008056c0080 x19: 0000000000000002 x18: 0000000000000006 [ 57.485112] x17: 0000000000000000 x16: 0000000000000000 x15: ffff80008232b580 [ 57.492246] x14: 0000000000000000 x13: ffff8000816d1530 x12: 00000000000004a4 [ 57.499380] x11: 000000000000018c x10: ffff8000816fd530 x9 : ffff8000816d1530 [ 57.506515] x8 : 00000000fffff7ff x7 : ffff8000816fd530 x6 : 0000000000000001 [ 57.513649] x5 : 0000000000000000 x4 : 0000000000000000 x3 : 0000000000000000 [ 57.520782] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff0008049b0000 [ 57.527916] Call trace: [ 57.530354] sst_nor_write_data+0x6c/0x74 [ 57.534361] sst_nor_write+0xb4/0x18c [ 57.538019] mtd_write_oob_std+0x7c/0x88 [ 57.541941] mtd_write_oob+0x70/0xbc [ 57.545511] mtd_write+0x68/0xa8 [ 57.548733] mtdchar_write+0x10c/0x290 [ 57.552477] vfs_write+0xb4/0x3a8 [ 57.555791] ksys_write+0x74/0x10c [ 57.559189] __arm64_sys_write+0x1c/0x28 [ 57.563109] invoke_syscall+0x54/0x11c [ 57.566856] el0_svc_common.constprop.0+0xc0/0xe0 [ 57.571557] do_el0_svc+0x1c/0x28 [ 57.574868] el0_svc+0x30/0xcc [ 57.577921] el0t_64_sync_handler+0x120/0x12c [ 57.582276] el0t_64_sync+0x190/0x194 [ 57.585933] ---[ end trace 0000000000000000 ]--- Cc: stable@vger.kernel.org Fixes: 18bcb4aa54ea ("mtd: spi-nor: sst: Factor out common write operation to `sst_nor_write_data()`") Signed-off-by: Amit Kumar Mahapatra Reviewed-by: Pratyush Yadav Reviewed-by: Tudor Ambarus Reviewed-by: Bence Csókás [pratyush@kernel.org: add Cc stable tag] Signed-off-by: Pratyush Yadav Link: https://lore.kernel.org/r/20250213054546.2078121-1-amit.kumar-mahapatra@amd.com --- drivers/mtd/spi-nor/sst.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/spi-nor/sst.c b/drivers/mtd/spi-nor/sst.c index b5ad7118c49a2..175211fe6a5ed 100644 --- a/drivers/mtd/spi-nor/sst.c +++ b/drivers/mtd/spi-nor/sst.c @@ -174,7 +174,7 @@ static int sst_nor_write_data(struct spi_nor *nor, loff_t to, size_t len, int ret; nor->program_opcode = op; - ret = spi_nor_write_data(nor, to, 1, buf); + ret = spi_nor_write_data(nor, to, len, buf); if (ret < 0) return ret; WARN(ret != len, "While writing %zu byte written %i bytes\n", len, ret); -- GitLab From e49477f7f78598295551d486ecc7f020d796432e Mon Sep 17 00:00:00 2001 From: Krzysztof Karas Date: Thu, 16 Jan 2025 10:40:46 +0000 Subject: [PATCH 682/989] drm/i915/gt: Use spin_lock_irqsave() in interruptible context spin_lock/unlock() functions used in interrupt contexts could result in a deadlock, as seen in GitLab issue #13399, which occurs when interrupt comes in while holding a lock. Try to remedy the problem by saving irq state before spin lock acquisition. 
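A minimal sketch of the deadlock pattern being closed, using a hypothetical lock rather than the i915 code: if the interrupt handler can run on the CPU that already holds the lock taken with a plain spin_lock(), it spins forever; taking the lock with local interrupts disabled removes that window:

	/* process/work context */
	spin_lock_irqsave(&foo_lock, flags);	/* local IRQs off while held */
	/* ... state also touched from the IRQ handler ... */
	spin_unlock_irqrestore(&foo_lock, flags);

	/* interrupt context */
	static irqreturn_t foo_irq(int irq, void *data)
	{
		spin_lock(&foo_lock);	/* cannot interrupt the section above */
		/* ... */
		spin_unlock(&foo_lock);
		return IRQ_HANDLED;
	}
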
v2: add irqs' state save/restore calls to all locks/unlocks in signal_irq_work() execution (Maciej) v3: use with spin_lock_irqsave() in guc_lrc_desc_unpin() instead of other lock/unlock calls and add Fixes and Cc tags (Tvrtko); change title and commit message Fixes: 2f2cc53b5fe7 ("drm/i915/guc: Close deregister-context race against CT-loss") Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/13399 Signed-off-by: Krzysztof Karas Cc: # v6.9+ Reviewed-by: Maciej Patelczyk Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/pusppq5ybyszau2oocboj3mtj5x574gwij323jlclm5zxvimmu@mnfg6odxbpsv (cherry picked from commit c088387ddd6482b40f21ccf23db1125e8fa4af7e) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index cc05bd9e43b49..3fce5c0001444 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -3449,10 +3449,10 @@ static inline int guc_lrc_desc_unpin(struct intel_context *ce) */ ret = deregister_context(ce, ce->guc_id.id); if (ret) { - spin_lock(&ce->guc_state.lock); + spin_lock_irqsave(&ce->guc_state.lock, flags); set_context_registered(ce); clr_context_destroyed(ce); - spin_unlock(&ce->guc_state.lock); + spin_unlock_irqrestore(&ce->guc_state.lock, flags); /* * As gt-pm is awake at function entry, intel_wakeref_put_async merely decrements * the wakeref immediately but per function spec usage call this after unlock. -- GitLab From 0c455f3a12298e9c89a78d2f3327e15e52c0adc5 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Thu, 13 Feb 2025 11:28:59 -0800 Subject: [PATCH 683/989] drm/xe: Fix error handling in xe_irq_install() When devm_add_action_or_reset() fails, it already calls the function passed as parameter and that function is already free'ing the irqs. Drop the goto and just return. The caller, xe_device_probe(), should also do the same thing instead of wrongly doing `goto err` and calling the unrelated xe_display_fini() function. 
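For context, a minimal sketch of the devm_add_action_or_reset() contract the cleanup relies on (hypothetical names): when registration fails, the action has already been invoked, so the caller must not run its own teardown again:

	err = devm_add_action_or_reset(dev, foo_teardown, foo);
	if (err)
		return err;	/* foo_teardown(foo) was already called */
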
Fixes: 14d25d8d684d ("drm/xe: change old msi irq api to a new one") Reviewed-by: Rodrigo Vivi Reviewed-by: Himal Prasad Ghimiray Link: https://patchwork.freedesktop.org/patch/msgid/20250213192909.996148-3-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 121b214cdf10d4129b64f2b1f31807154c74ae55) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_irq.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c index 32f5a67a917b5..08552ee3fb94b 100644 --- a/drivers/gpu/drm/xe/xe_irq.c +++ b/drivers/gpu/drm/xe/xe_irq.c @@ -757,19 +757,7 @@ int xe_irq_install(struct xe_device *xe) xe_irq_postinstall(xe); - err = devm_add_action_or_reset(xe->drm.dev, irq_uninstall, xe); - if (err) - goto free_irq_handler; - - return 0; - -free_irq_handler: - if (xe_device_has_msix(xe)) - xe_irq_msix_free(xe); - else - xe_irq_msi_free(xe); - - return err; + return devm_add_action_or_reset(xe->drm.dev, irq_uninstall, xe); } static void xe_irq_msi_synchronize_irq(struct xe_device *xe) -- GitLab From 879f70382ff3e92fc854589ada3453e3f5f5b601 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Fri, 14 Feb 2025 16:19:51 +0200 Subject: [PATCH 684/989] drm/i915/dsi: Use TRANS_DDI_FUNC_CTL's own port width macro The format of the port width field in the DDI_BUF_CTL and the TRANS_DDI_FUNC_CTL registers are different starting with MTL, where the x3 lane mode for HDMI FRL has a different encoding in the two registers. To account for this use the TRANS_DDI_FUNC_CTL's own port width macro. Cc: # v6.5+ Fixes: b66a8abaa48a ("drm/i915/display/mtl: Fill port width in DDI_BUF_/TRANS_DDI_FUNC_/PORT_BUF_CTL for HDMI") Reviewed-by: Jani Nikula Signed-off-by: Imre Deak Link: https://patchwork.freedesktop.org/patch/msgid/20250214142001.552916-2-imre.deak@intel.com (cherry picked from commit 76120b3a304aec28fef4910204b81a12db8974da) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/icl_dsi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/icl_dsi.c b/drivers/gpu/drm/i915/display/icl_dsi.c index c977b74f82f0b..82bf6c654de25 100644 --- a/drivers/gpu/drm/i915/display/icl_dsi.c +++ b/drivers/gpu/drm/i915/display/icl_dsi.c @@ -809,8 +809,8 @@ gen11_dsi_configure_transcoder(struct intel_encoder *encoder, /* select data lane width */ tmp = intel_de_read(display, TRANS_DDI_FUNC_CTL(display, dsi_trans)); - tmp &= ~DDI_PORT_WIDTH_MASK; - tmp |= DDI_PORT_WIDTH(intel_dsi->lane_count); + tmp &= ~TRANS_DDI_PORT_WIDTH_MASK; + tmp |= TRANS_DDI_PORT_WIDTH(intel_dsi->lane_count); /* select input pipe */ tmp &= ~TRANS_DDI_EDP_INPUT_MASK; -- GitLab From 166ce267ae3f96e439d8ccc838e8ec4d8b4dab73 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Fri, 14 Feb 2025 16:19:52 +0200 Subject: [PATCH 685/989] drm/i915/ddi: Fix HDMI port width programming in DDI_BUF_CTL Fix the port width programming in the DDI_BUF_CTL register on MTLP+, where this had an off-by-one error. 
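As a worked example of the updated DDI_PORT_WIDTH() encoding in the diff below: x1, x2 and x4 links keep their legacy field values 0, 1 and 3, while a 3-lane (HDMI FRL x3) link now programs 4 instead of the 2 the old "width - 1" formula produced.
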
Cc: # v6.5+ Fixes: b66a8abaa48a ("drm/i915/display/mtl: Fill port width in DDI_BUF_/TRANS_DDI_FUNC_/PORT_BUF_CTL for HDMI") Reviewed-by: Jani Nikula Signed-off-by: Imre Deak Link: https://patchwork.freedesktop.org/patch/msgid/20250214142001.552916-3-imre.deak@intel.com (cherry picked from commit b2ecdabe46d23db275f94cd7c46ca414a144818b) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_ddi.c | 2 +- drivers/gpu/drm/i915/i915_reg.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c index acb986bc1f33a..2b9240ab547d8 100644 --- a/drivers/gpu/drm/i915/display/intel_ddi.c +++ b/drivers/gpu/drm/i915/display/intel_ddi.c @@ -3487,7 +3487,7 @@ static void intel_ddi_enable_hdmi(struct intel_atomic_state *state, intel_de_rmw(dev_priv, XELPDP_PORT_BUF_CTL1(dev_priv, port), XELPDP_PORT_WIDTH_MASK | XELPDP_PORT_REVERSAL, port_buf); - buf_ctl |= DDI_PORT_WIDTH(lane_count); + buf_ctl |= DDI_PORT_WIDTH(crtc_state->lane_count); if (DISPLAY_VER(dev_priv) >= 20) buf_ctl |= XE2LPD_DDI_BUF_D2D_LINK_ENABLE; diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 765e6c0528fb0..786c727aea454 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -3633,7 +3633,7 @@ enum skl_power_gate { #define DDI_BUF_IS_IDLE (1 << 7) #define DDI_BUF_CTL_TC_PHY_OWNERSHIP REG_BIT(6) #define DDI_A_4_LANES (1 << 4) -#define DDI_PORT_WIDTH(width) (((width) - 1) << 1) +#define DDI_PORT_WIDTH(width) (((width) == 3 ? 4 : ((width) - 1)) << 1) #define DDI_PORT_WIDTH_MASK (7 << 1) #define DDI_PORT_WIDTH_SHIFT 1 #define DDI_INIT_DISPLAY_DETECTED (1 << 0) -- GitLab From 07fb70d82e0df085980246bf17bc12537588795f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Wed, 12 Feb 2025 18:43:21 +0200 Subject: [PATCH 686/989] drm/i915: Make sure all planes in use by the joiner have their crtc included MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Any active plane needs to have its crtc included in the atomic state. For planes enabled via uapi that is all handler in the core. But when we use a plane for joiner the uapi code things the plane is disabled and therefore doesn't have a crtc. So we need to pull those in by hand. We do it first thing in intel_joiner_add_affected_crtcs() so that any newly added crtc will subsequently pull in all of its joined crtcs as well. 
The symptoms from failing to do this are: - duct tape in the form of commit 1d5b09f8daf8 ("drm/i915: Fix NULL ptr deref by checking new_crtc_state") - the plane's hw state will get overwritten by the disabled uapi state if it can't find the uapi counterpart plane in the atomic state from where it should copy the correct state Cc: stable@vger.kernel.org Reviewed-by: Maarten Lankhorst Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20250212164330.16891-2-ville.syrjala@linux.intel.com (cherry picked from commit 91077d1deb5374eb8be00fb391710f00e751dc4b) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_display.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index 4271da219b410..41128469f12a2 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -6628,12 +6628,30 @@ static int intel_async_flip_check_hw(struct intel_atomic_state *state, struct in static int intel_joiner_add_affected_crtcs(struct intel_atomic_state *state) { struct drm_i915_private *i915 = to_i915(state->base.dev); + const struct intel_plane_state *plane_state; struct intel_crtc_state *crtc_state; + struct intel_plane *plane; struct intel_crtc *crtc; u8 affected_pipes = 0; u8 modeset_pipes = 0; int i; + /* + * Any plane which is in use by the joiner needs its crtc. + * Pull those in first as this will not have happened yet + * if the plane remains disabled according to uapi. + */ + for_each_new_intel_plane_in_state(state, plane, plane_state, i) { + crtc = to_intel_crtc(plane_state->hw.crtc); + if (!crtc) + continue; + + crtc_state = intel_atomic_get_crtc_state(&state->base, crtc); + if (IS_ERR(crtc_state)) + return PTR_ERR(crtc_state); + } + + /* Now pull in all joined crtcs */ for_each_new_intel_crtc_in_state(state, crtc, crtc_state, i) { affected_pipes |= crtc_state->joiner_pipes; if (intel_crtc_needs_modeset(crtc_state)) -- GitLab From dd8b0582e25e36bba483c60338741c0ba5bc426c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 17 Feb 2025 11:16:26 +0800 Subject: [PATCH 687/989] block: fix NULL pointer dereferenced within __blk_rq_map_sg The block layer internal flush request may not have bio attached, so the request iterator has to be initialized from valid req->bio, otherwise NULL pointer dereferenced is triggered. 
Cc: Christoph Hellwig Reported-and-tested-by: Cheyenne Wills Fixes: b7175e24d6ac ("block: add a dma mapping iterator") Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250217031626.461977-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-merge.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 39b738c0e4c9a..c7c85e10cf9cb 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -556,11 +556,14 @@ int __blk_rq_map_sg(struct request_queue *q, struct request *rq, { struct req_iterator iter = { .bio = rq->bio, - .iter = rq->bio->bi_iter, }; struct phys_vec vec; int nsegs = 0; + /* the internal flush request may not have bio attached */ + if (iter.bio) + iter.iter = iter.bio->bi_iter; + while (blk_map_iter_next(rq, &iter, &vec)) { *last_sg = blk_next_sg(last_sg, sglist); sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len, -- GitLab From 290237fde9491ca26cf4020bbf5a2b330452e7db Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Mon, 10 Feb 2025 22:17:29 +1100 Subject: [PATCH 688/989] btrfs: selftests: fix btrfs_test_delayed_refs() leak of transaction The btrfs_transaction struct leaks, which can cause sporadic fstests failures when kmemleak checking is enabled: kmemleak: 5 new suspected memory leaks (see /sys/kernel/debug/kmemleak) > cat /sys/kernel/debug/kmemleak unreferenced object 0xffff88810fdc6c00 (size 512): comm "modprobe", pid 203, jiffies 4294892552 hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace (crc 6736050f): __kmalloc_cache_noprof+0x133/0x2c0 btrfs_test_delayed_refs+0x6f/0xbb0 [btrfs] btrfs_run_sanity_tests.cold+0x91/0xf9 [btrfs] 0xffffffffa02fd055 do_one_initcall+0x49/0x1c0 do_init_module+0x5b/0x1f0 init_module_from_file+0x70/0x90 idempotent_init_module+0xe8/0x2c0 __x64_sys_finit_module+0x6b/0xd0 do_syscall_64+0x54/0x110 entry_SYSCALL_64_after_hwframe+0x76/0x7e The transaction struct was initially stack-allocated but switched to heap following frame size compiler warnings. Fixes: 2b34879d97e27 ("btrfs: selftests: add delayed ref self test cases") Signed-off-by: David Disseldorp Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tests/delayed-refs-tests.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/tests/delayed-refs-tests.c b/fs/btrfs/tests/delayed-refs-tests.c index 6558508c2ddf5..265370e79a546 100644 --- a/fs/btrfs/tests/delayed-refs-tests.c +++ b/fs/btrfs/tests/delayed-refs-tests.c @@ -1009,6 +1009,7 @@ int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize) if (!ret) ret = select_delayed_refs_test(&trans); + kfree(transaction); out_free_fs_info: btrfs_free_dummy_fs_info(fs_info); return ret; -- GitLab From 4cb77793842a351b39a030f77caebace3524840e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 15 Feb 2025 18:52:41 +0000 Subject: [PATCH 689/989] irqchip/gic-v3: Fix rk3399 workaround when secure interrupts are enabled Christoph reports that their rk3399 system dies since commit 773c05f417fa1 ("irqchip/gic-v3: Work around insecure GIC integrations"). It appears that some rk3399 have secure payloads, and that the firmware sets SCR_EL3.FIQ==1. Obivously, disabling security in that configuration leads to even more problems. 
Revisit the workaround by: - making it rk3399 specific - checking whether Group-0 is available, which is a good proxy for SCR_EL3.FIQ being 0 - either apply the workaround if Group-0 is available, or disable pseudo-NMIs if not Note that this doesn't mean that the secure side is able to receive interrupts, as all interrupts are made non-secure anyway. Clearly, nobody ever tested secure interrupts on this platform. Fixes: 773c05f417fa1 ("irqchip/gic-v3: Work around insecure GIC integrations") Reported-by: Christoph Fritz Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Christoph Fritz Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20250215185241.3768218-1-maz@kernel.org Closes: https://lore.kernel.org/r/b1266652fb64857246e8babdf268d0df8f0c36d9.camel@googlemail.com --- drivers/irqchip/irq-gic-v3.c | 53 +++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 76dce0aac2465..270d7a4d85a6d 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -44,6 +44,7 @@ static u8 dist_prio_nmi __ro_after_init = GICV3_PRIO_NMI; #define FLAGS_WORKAROUND_GICR_WAKER_MSM8996 (1ULL << 0) #define FLAGS_WORKAROUND_CAVIUM_ERRATUM_38539 (1ULL << 1) #define FLAGS_WORKAROUND_ASR_ERRATUM_8601001 (1ULL << 2) +#define FLAGS_WORKAROUND_INSECURE (1ULL << 3) #define GIC_IRQ_TYPE_PARTITION (GIC_IRQ_TYPE_LPI + 1) @@ -83,6 +84,8 @@ static DEFINE_STATIC_KEY_TRUE(supports_deactivate_key); #define GIC_LINE_NR min(GICD_TYPER_SPIS(gic_data.rdists.gicd_typer), 1020U) #define GIC_ESPI_NR GICD_TYPER_ESPIS(gic_data.rdists.gicd_typer) +static bool nmi_support_forbidden; + /* * There are 16 SGIs, though we only actually use 8 in Linux. The other 8 SGIs * are potentially stolen by the secure side. Some code, especially code dealing @@ -163,21 +166,27 @@ static void __init gic_prio_init(void) { bool ds; - ds = gic_dist_security_disabled(); - if (!ds) { - u32 val; - - val = readl_relaxed(gic_data.dist_base + GICD_CTLR); - val |= GICD_CTLR_DS; - writel_relaxed(val, gic_data.dist_base + GICD_CTLR); + cpus_have_group0 = gic_has_group0(); - ds = gic_dist_security_disabled(); - if (ds) - pr_warn("Broken GIC integration, security disabled"); + ds = gic_dist_security_disabled(); + if ((gic_data.flags & FLAGS_WORKAROUND_INSECURE) && !ds) { + if (cpus_have_group0) { + u32 val; + + val = readl_relaxed(gic_data.dist_base + GICD_CTLR); + val |= GICD_CTLR_DS; + writel_relaxed(val, gic_data.dist_base + GICD_CTLR); + + ds = gic_dist_security_disabled(); + if (ds) + pr_warn("Broken GIC integration, security disabled\n"); + } else { + pr_warn("Broken GIC integration, pNMI forbidden\n"); + nmi_support_forbidden = true; + } } cpus_have_security_disabled = ds; - cpus_have_group0 = gic_has_group0(); /* * How priority values are used by the GIC depends on two things: @@ -209,7 +218,7 @@ static void __init gic_prio_init(void) * be in the non-secure range, we program the non-secure values into * the distributor to match the PMR values we want. 
*/ - if (cpus_have_group0 & !cpus_have_security_disabled) { + if (cpus_have_group0 && !cpus_have_security_disabled) { dist_prio_irq = __gicv3_prio_to_ns(dist_prio_irq); dist_prio_nmi = __gicv3_prio_to_ns(dist_prio_nmi); } @@ -1922,6 +1931,18 @@ static bool gic_enable_quirk_arm64_2941627(void *data) return true; } +static bool gic_enable_quirk_rk3399(void *data) +{ + struct gic_chip_data *d = data; + + if (of_machine_is_compatible("rockchip,rk3399")) { + d->flags |= FLAGS_WORKAROUND_INSECURE; + return true; + } + + return false; +} + static bool rd_set_non_coherent(void *data) { struct gic_chip_data *d = data; @@ -1996,6 +2017,12 @@ static const struct gic_quirk gic_quirks[] = { .property = "dma-noncoherent", .init = rd_set_non_coherent, }, + { + .desc = "GICv3: Insecure RK3399 integration", + .iidr = 0x0000043b, + .mask = 0xff000fff, + .init = gic_enable_quirk_rk3399, + }, { } }; @@ -2004,7 +2031,7 @@ static void gic_enable_nmi_support(void) { int i; - if (!gic_prio_masking_enabled()) + if (!gic_prio_masking_enabled() || nmi_support_forbidden) return; rdist_nmi_refs = kcalloc(gic_data.ppi_nr + SGI_NR, -- GitLab From d7e3fd658248f257006227285095d190e70ee73a Mon Sep 17 00:00:00 2001 From: Artur Rojek Date: Sun, 16 Feb 2025 18:55:45 +0100 Subject: [PATCH 690/989] irqchip/jcore-aic, clocksource/drivers/jcore: Fix jcore-pit interrupt request The jcore-aic irqchip does not have separate interrupt numbers reserved for cpu-local vs global interrupts. Therefore the device drivers need to request the given interrupt as per CPU interrupt. 69a9dcbd2d65 ("clocksource/drivers/jcore: Use request_percpu_irq()") converted the clocksource driver over to request_percpu_irq(), but failed to do add all the required changes, resulting in a failure to register PIT interrupts. Fix this by: 1) Explicitly mark the interrupt via irq_set_percpu_devid() in jcore_pit_init(). 2) Enable and disable the per CPU interrupt in the CPU hotplug callbacks. 3) Pass the correct per-cpu cookie to the irq handler by using handle_percpu_devid_irq() instead of handle_percpu_irq() in handle_jcore_irq(). 
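A minimal sketch, with hypothetical names, of the per-CPU interrupt pattern those three steps add up to; the interrupt is requested once, while enable/disable run on each CPU from the hotplug callbacks:

	irq_set_percpu_devid(irq);
	err = request_percpu_irq(irq, foo_timer_irq, "foo_pit", foo_pit_percpu);

	/* CPUHP "starting" callback, runs on the CPU being brought up */
	enable_percpu_irq(irq, IRQ_TYPE_NONE);

	/* CPUHP teardown callback, runs on the CPU going down */
	disable_percpu_irq(irq);
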
[ tglx: Massage change log ] Fixes: 69a9dcbd2d65 ("clocksource/drivers/jcore: Use request_percpu_irq()") Signed-off-by: Artur Rojek Signed-off-by: Thomas Gleixner Acked-by: Uros Bizjak Link: https://lore.kernel.org/all/20250216175545.35079-3-contact@artur-rojek.eu --- drivers/clocksource/jcore-pit.c | 15 ++++++++++++++- drivers/irqchip/irq-jcore-aic.c | 2 +- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/jcore-pit.c b/drivers/clocksource/jcore-pit.c index a3fe98cd38382..82815428f8f92 100644 --- a/drivers/clocksource/jcore-pit.c +++ b/drivers/clocksource/jcore-pit.c @@ -114,6 +114,18 @@ static int jcore_pit_local_init(unsigned cpu) pit->periodic_delta = DIV_ROUND_CLOSEST(NSEC_PER_SEC, HZ * buspd); clockevents_config_and_register(&pit->ced, freq, 1, ULONG_MAX); + enable_percpu_irq(pit->ced.irq, IRQ_TYPE_NONE); + + return 0; +} + +static int jcore_pit_local_teardown(unsigned cpu) +{ + struct jcore_pit *pit = this_cpu_ptr(jcore_pit_percpu); + + pr_info("Local J-Core PIT teardown on cpu %u\n", cpu); + + disable_percpu_irq(pit->ced.irq); return 0; } @@ -168,6 +180,7 @@ static int __init jcore_pit_init(struct device_node *node) return -ENOMEM; } + irq_set_percpu_devid(pit_irq); err = request_percpu_irq(pit_irq, jcore_timer_interrupt, "jcore_pit", jcore_pit_percpu); if (err) { @@ -237,7 +250,7 @@ static int __init jcore_pit_init(struct device_node *node) cpuhp_setup_state(CPUHP_AP_JCORE_TIMER_STARTING, "clockevents/jcore:starting", - jcore_pit_local_init, NULL); + jcore_pit_local_init, jcore_pit_local_teardown); return 0; } diff --git a/drivers/irqchip/irq-jcore-aic.c b/drivers/irqchip/irq-jcore-aic.c index b9dcc8e78c750..1f613eb7b7f03 100644 --- a/drivers/irqchip/irq-jcore-aic.c +++ b/drivers/irqchip/irq-jcore-aic.c @@ -38,7 +38,7 @@ static struct irq_chip jcore_aic; static void handle_jcore_irq(struct irq_desc *desc) { if (irqd_is_per_cpu(irq_desc_get_irq_data(desc))) - handle_percpu_irq(desc); + handle_percpu_devid_irq(desc); else handle_simple_irq(desc); } -- GitLab From 07b598c0e6f06a0f254c88dafb4ad50f8a8c6eea Mon Sep 17 00:00:00 2001 From: Gavrilov Ilia Date: Thu, 13 Feb 2025 15:20:55 +0000 Subject: [PATCH 691/989] drop_monitor: fix incorrect initialization order Syzkaller reports the following bug: BUG: spinlock bad magic on CPU#1, syz-executor.0/7995 lock: 0xffff88805303f3e0, .magic: 00000000, .owner: /-1, .owner_cpu: 0 CPU: 1 PID: 7995 Comm: syz-executor.0 Tainted: G E 5.10.209+ #1 Hardware name: VMware, Inc. 
VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 11/12/2020 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x119/0x179 lib/dump_stack.c:118 debug_spin_lock_before kernel/locking/spinlock_debug.c:83 [inline] do_raw_spin_lock+0x1f6/0x270 kernel/locking/spinlock_debug.c:112 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:117 [inline] _raw_spin_lock_irqsave+0x50/0x70 kernel/locking/spinlock.c:159 reset_per_cpu_data+0xe6/0x240 [drop_monitor] net_dm_cmd_trace+0x43d/0x17a0 [drop_monitor] genl_family_rcv_msg_doit+0x22f/0x330 net/netlink/genetlink.c:739 genl_family_rcv_msg net/netlink/genetlink.c:783 [inline] genl_rcv_msg+0x341/0x5a0 net/netlink/genetlink.c:800 netlink_rcv_skb+0x14d/0x440 net/netlink/af_netlink.c:2497 genl_rcv+0x29/0x40 net/netlink/genetlink.c:811 netlink_unicast_kernel net/netlink/af_netlink.c:1322 [inline] netlink_unicast+0x54b/0x800 net/netlink/af_netlink.c:1348 netlink_sendmsg+0x914/0xe00 net/netlink/af_netlink.c:1916 sock_sendmsg_nosec net/socket.c:651 [inline] __sock_sendmsg+0x157/0x190 net/socket.c:663 ____sys_sendmsg+0x712/0x870 net/socket.c:2378 ___sys_sendmsg+0xf8/0x170 net/socket.c:2432 __sys_sendmsg+0xea/0x1b0 net/socket.c:2461 do_syscall_64+0x30/0x40 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x62/0xc7 RIP: 0033:0x7f3f9815aee9 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f3f972bf0c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f3f9826d050 RCX: 00007f3f9815aee9 RDX: 0000000020000000 RSI: 0000000020001300 RDI: 0000000000000007 RBP: 00007f3f981b63bd R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 000000000000006e R14: 00007f3f9826d050 R15: 00007ffe01ee6768 If drop_monitor is built as a kernel module, syzkaller may have time to send a netlink NET_DM_CMD_START message during the module loading. This will call the net_dm_monitor_start() function that uses a spinlock that has not yet been initialized. To fix this, let's place resource initialization above the registration of a generic netlink family. Found by InfoTeCS on behalf of Linux Verification Center (linuxtesting.org) with Syzkaller. 
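The resulting ordering rule, sketched with hypothetical names: everything a netlink handler can touch must be initialized before the generic netlink family is registered, because registration is the point at which userspace can start calling in:

	foo_percpu_init();			/* per-CPU data and spinlocks first */

	err = register_netdevice_notifier(&foo_notifier);
	if (err)
		return err;

	err = genl_register_family(&foo_genl_family);	/* handlers callable from here on */
	if (err)
		unregister_netdevice_notifier(&foo_notifier);

	return err;
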
Fixes: 9a8afc8d3962 ("Network Drop Monitor: Adding drop monitor implementation & Netlink protocol") Cc: stable@vger.kernel.org Signed-off-by: Ilia Gavrilov Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250213152054.2785669-1-Ilia.Gavrilov@infotecs.ru Signed-off-by: Jakub Kicinski --- net/core/drop_monitor.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 6efd4cccc9ddd..212f0a048cab6 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -1734,30 +1734,30 @@ static int __init init_net_drop_monitor(void) return -ENOSPC; } - rc = genl_register_family(&net_drop_monitor_family); - if (rc) { - pr_err("Could not create drop monitor netlink family\n"); - return rc; + for_each_possible_cpu(cpu) { + net_dm_cpu_data_init(cpu); + net_dm_hw_cpu_data_init(cpu); } - WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT); rc = register_netdevice_notifier(&dropmon_net_notifier); if (rc < 0) { pr_crit("Failed to register netdevice notifier\n"); + return rc; + } + + rc = genl_register_family(&net_drop_monitor_family); + if (rc) { + pr_err("Could not create drop monitor netlink family\n"); goto out_unreg; } + WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT); rc = 0; - for_each_possible_cpu(cpu) { - net_dm_cpu_data_init(cpu); - net_dm_hw_cpu_data_init(cpu); - } - goto out; out_unreg: - genl_unregister_family(&net_drop_monitor_family); + WARN_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); out: return rc; } @@ -1766,19 +1766,18 @@ static void exit_net_drop_monitor(void) { int cpu; - BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); - /* * Because of the module_get/put we do in the trace state change path * we are guaranteed not to have any current users when we get here */ + BUG_ON(genl_unregister_family(&net_drop_monitor_family)); + + BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); for_each_possible_cpu(cpu) { net_dm_hw_cpu_data_fini(cpu); net_dm_cpu_data_fini(cpu); } - - BUG_ON(genl_unregister_family(&net_drop_monitor_family)); } module_init(init_net_drop_monitor); -- GitLab From 915e34d5ad35a6a9e56113f852ade4a730fb88f0 Mon Sep 17 00:00:00 2001 From: Julian Ruess Date: Fri, 14 Feb 2025 13:01:37 +0100 Subject: [PATCH 692/989] s390/ism: add release function for struct device According to device_release() in /drivers/base/core.c, a device without a release function is a broken device and must be fixed. The current code directly frees the device after calling device_add() without waiting for other kernel parts to release their references. Thus, a reference could still be held to a struct device, e.g., by sysfs, leading to potential use-after-free issues if a proper release function is not set. 
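A minimal sketch of the pattern being adopted, with hypothetical names: give the embedded struct device a release callback that frees the containing object, and drop references with put_device() so the memory only goes away once the last holder (e.g. sysfs) is done with it:

	static void foo_dev_release(struct device *dev)
	{
		struct foo_dev *foo = container_of(dev, struct foo_dev, dev);

		kfree(foo);
	}

	...
	foo->dev.release = foo_dev_release;
	device_initialize(&foo->dev);
	err = device_add(&foo->dev);
	if (err)
		put_device(&foo->dev);	/* never kfree() directly after device_initialize() */
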
Fixes: 8c81ba20349d ("net/smc: De-tangle ism and smc device initialization") Reviewed-by: Alexandra Winter Reviewed-by: Wenjia Zhang Signed-off-by: Julian Ruess Signed-off-by: Alexandra Winter Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250214120137.563409-1-wintera@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/s390/net/ism_drv.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index e36e3ea165d3b..2f34761e64135 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -588,6 +588,15 @@ static int ism_dev_init(struct ism_dev *ism) return ret; } +static void ism_dev_release(struct device *dev) +{ + struct ism_dev *ism; + + ism = container_of(dev, struct ism_dev, dev); + + kfree(ism); +} + static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct ism_dev *ism; @@ -601,6 +610,7 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev_set_drvdata(&pdev->dev, ism); ism->pdev = pdev; ism->dev.parent = &pdev->dev; + ism->dev.release = ism_dev_release; device_initialize(&ism->dev); dev_set_name(&ism->dev, dev_name(&pdev->dev)); ret = device_add(&ism->dev); @@ -637,7 +647,7 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) device_del(&ism->dev); err_dev: dev_set_drvdata(&pdev->dev, NULL); - kfree(ism); + put_device(&ism->dev); return ret; } @@ -682,7 +692,7 @@ static void ism_remove(struct pci_dev *pdev) pci_disable_device(pdev); device_del(&ism->dev); dev_set_drvdata(&pdev->dev, NULL); - kfree(ism); + put_device(&ism->dev); } static struct pci_driver ism_driver = { -- GitLab From bdf5d13aa05ec314d4385b31ac974d6c7e0997c9 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Fri, 14 Feb 2025 09:52:33 -0600 Subject: [PATCH 693/989] ibmvnic: Don't reference skb after sending to VIOS Previously, after successfully flushing the xmit buffer to VIOS, the tx_bytes stat was incremented by the length of the skb. It is invalid to access the skb memory after sending the buffer to the VIOS because, at any point after sending, the VIOS can trigger an interrupt to free this memory. 
A race between reading skb->len and freeing the skb is possible (especially during LPM) and will result in use-after-free: ================================================================== BUG: KASAN: slab-use-after-free in ibmvnic_xmit+0x75c/0x1808 [ibmvnic] Read of size 4 at addr c00000024eb48a70 by task hxecom/14495 <...> Call Trace: [c000000118f66cf0] [c0000000018cba6c] dump_stack_lvl+0x84/0xe8 (unreliable) [c000000118f66d20] [c0000000006f0080] print_report+0x1a8/0x7f0 [c000000118f66df0] [c0000000006f08f0] kasan_report+0x128/0x1f8 [c000000118f66f00] [c0000000006f2868] __asan_load4+0xac/0xe0 [c000000118f66f20] [c0080000046eac84] ibmvnic_xmit+0x75c/0x1808 [ibmvnic] [c000000118f67340] [c0000000014be168] dev_hard_start_xmit+0x150/0x358 <...> Freed by task 0: kasan_save_stack+0x34/0x68 kasan_save_track+0x2c/0x50 kasan_save_free_info+0x64/0x108 __kasan_mempool_poison_object+0x148/0x2d4 napi_skb_cache_put+0x5c/0x194 net_tx_action+0x154/0x5b8 handle_softirqs+0x20c/0x60c do_softirq_own_stack+0x6c/0x88 <...> The buggy address belongs to the object at c00000024eb48a00 which belongs to the cache skbuff_head_cache of size 224 ================================================================== Fixes: 032c5e82847a ("Driver for IBM System i/p VNIC protocol") Signed-off-by: Nick Child Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250214155233.235559-1-nnac123@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index e95ae0d39948c..0676fc547b6f4 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2408,6 +2408,7 @@ static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) dma_addr_t data_dma_addr; struct netdev_queue *txq; unsigned long lpar_rc; + unsigned int skblen; union sub_crq tx_crq; unsigned int offset; bool use_scrq_send_direct = false; @@ -2522,6 +2523,7 @@ static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) tx_buff->skb = skb; tx_buff->index = bufidx; tx_buff->pool_index = queue_num; + skblen = skb->len; memset(&tx_crq, 0, sizeof(tx_crq)); tx_crq.v1.first = IBMVNIC_CRQ_CMD; @@ -2614,7 +2616,7 @@ static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) netif_stop_subqueue(netdev, queue_num); } - tx_bytes += skb->len; + tx_bytes += skblen; txq_trans_cond_update(txq); ret = NETDEV_TX_OK; goto out; -- GitLab From 0a4f598c84fc0eeb143ba03cdd3fc3d857061c3c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 15 Feb 2025 14:52:00 -0800 Subject: [PATCH 694/989] MAINTAINERS: create entry for ethtool MAC merge Vladimir implemented the MAC merge support and reviews all the new driver implementations. Acked-by: Vladimir Oltean Link: https://patch.msgid.link/20250215225200.2652212-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 988b0ff94fda9..1405ebe703a8f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16477,6 +16477,12 @@ F: net/ethtool/cabletest.c F: tools/testing/selftests/drivers/net/*/ethtool* K: cable_test +NETWORKING [ETHTOOL MAC MERGE] +M: Vladimir Oltean +F: net/ethtool/mm.c +F: tools/testing/selftests/drivers/net/hw/ethtool_mm.sh +K: ethtool_mm + NETWORKING [GENERAL] M: "David S. 
Miller" M: Eric Dumazet -- GitLab From c8a3e63ff9d75b9f3f031c90d218876051dea0ba Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 29 Jan 2025 14:20:03 -0800 Subject: [PATCH 695/989] procfs: fix a locking bug in a vmcore_add_device_dump() error path Unlock vmcore_mutex when returning -EBUSY. Link: https://lkml.kernel.org/r/20250129222003.1495713-1-bvanassche@acm.org Fixes: 0f3b1c40c652 ("fs/proc/vmcore: disallow vmcore modifications while the vmcore is open") Signed-off-by: Bart Van Assche Acked-by: Michael S. Tsirkin Acked-by: David Hildenbrand Cc: Baoquan he Signed-off-by: Andrew Morton --- fs/proc/vmcore.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index a00120a3c0994..10d01eb09c43d 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1524,7 +1524,7 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) pr_warn_once("Unexpected adding of device dump\n"); if (vmcore_open) { ret = -EBUSY; - goto out_err; + goto unlock; } list_add_tail(&dump->list, &vmcoredd_list); @@ -1532,6 +1532,9 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) mutex_unlock(&vmcore_mutex); return 0; +unlock: + mutex_unlock(&vmcore_mutex); + out_err: vfree(buf); vfree(dump); -- GitLab From f4b78260fc678ccd7169f32dc9f3bfa3b93931c7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 31 Jan 2025 14:13:15 +0000 Subject: [PATCH 696/989] lib/iov_iter: fix import_iovec_ubuf iovec management import_iovec() says that it should always be fine to kfree the iovec returned in @iovp regardless of the error code. __import_iovec_ubuf() never reallocates it and thus should clear the pointer even in cases when copy_iovec_*() fail. Link: https://lkml.kernel.org/r/378ae26923ffc20fd5e41b4360d673bf47b1775b.1738332461.git.asml.silence@gmail.com Fixes: 3b2deb0e46da ("iov_iter: import single vector iovecs as ITER_UBUF") Signed-off-by: Pavel Begunkov Reviewed-by: Jens Axboe Cc: Al Viro Cc: Christian Brauner Cc: Signed-off-by: Andrew Morton --- lib/iov_iter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 9ec806f989f25..65f550cb5081b 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1428,6 +1428,8 @@ static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec, struct iovec *iov = *iovp; ssize_t ret; + *iovp = NULL; + if (compat) ret = copy_compat_iovec_from_user(iov, uvec, 1); else @@ -1438,7 +1440,6 @@ static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec, ret = import_ubuf(type, iov->iov_base, iov->iov_len, i); if (unlikely(ret)) return ret; - *iovp = NULL; return i->count; } -- GitLab From 63895d20d63b446f5049a963983489319c2ea3e2 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 29 Jan 2025 19:08:44 +0900 Subject: [PATCH 697/989] mm/zswap: fix inconsistency when zswap_store_page() fails Commit b7c0ccdfbafd ("mm: zswap: support large folios in zswap_store()") skips charging any zswap entries when it failed to zswap the entire folio. However, when some base pages are zswapped but it failed to zswap the entire folio, the zswap operation is rolled back. When freeing zswap entries for those pages, zswap_entry_free() uncharges the zswap entries that were not previously charged, causing zswap charging to become inconsistent. 
This inconsistency triggers two warnings with following steps: # On a machine with 64GiB of RAM and 36GiB of zswap $ stress-ng --bigheap 2 # wait until the OOM-killer kills stress-ng $ sudo reboot The two warnings are: in mm/memcontrol.c:163, function obj_cgroup_release(): WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1)); in mm/page_counter.c:60, function page_counter_cancel(): if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n", new, nr_pages)) zswap_stored_pages also becomes inconsistent in the same way. As suggested by Kanchana, increment zswap_stored_pages and charge zswap entries within zswap_store_page() when it succeeds. This way, zswap_entry_free() will decrement the counter and uncharge the entries when it failed to zswap the entire folio. While this could potentially be optimized by batching objcg charging and incrementing the counter, let's focus on fixing the bug this time and leave the optimization for later after some evaluation. After resolving the inconsistency, the warnings disappear. [42.hyeyoo@gmail.com: refactor zswap_store_page()] Link: https://lkml.kernel.org/r/20250131082037.2426-1-42.hyeyoo@gmail.com Link: https://lkml.kernel.org/r/20250129100844.2935-1-42.hyeyoo@gmail.com Fixes: b7c0ccdfbafd ("mm: zswap: support large folios in zswap_store()") Co-developed-by: Kanchana P Sridhar Signed-off-by: Kanchana P Sridhar Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: Yosry Ahmed Acked-by: Nhat Pham Cc: Chengming Zhou Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton --- mm/zswap.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 6504174fbc6ad..ac9d299e7d0c1 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1445,9 +1445,9 @@ static void shrink_worker(struct work_struct *w) * main API **********************************/ -static ssize_t zswap_store_page(struct page *page, - struct obj_cgroup *objcg, - struct zswap_pool *pool) +static bool zswap_store_page(struct page *page, + struct obj_cgroup *objcg, + struct zswap_pool *pool) { swp_entry_t page_swpentry = page_swap_entry(page); struct zswap_entry *entry, *old; @@ -1456,7 +1456,7 @@ static ssize_t zswap_store_page(struct page *page, entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); if (!entry) { zswap_reject_kmemcache_fail++; - return -EINVAL; + return false; } if (!zswap_compress(page, entry, pool)) @@ -1483,13 +1483,17 @@ static ssize_t zswap_store_page(struct page *page, /* * The entry is successfully compressed and stored in the tree, there is - * no further possibility of failure. Grab refs to the pool and objcg. - * These refs will be dropped by zswap_entry_free() when the entry is - * removed from the tree. + * no further possibility of failure. Grab refs to the pool and objcg, + * charge zswap memory, and increment zswap_stored_pages. + * The opposite actions will be performed by zswap_entry_free() + * when the entry is removed from the tree. */ zswap_pool_get(pool); - if (objcg) + if (objcg) { obj_cgroup_get(objcg); + obj_cgroup_charge_zswap(objcg, entry->length); + } + atomic_long_inc(&zswap_stored_pages); /* * We finish initializing the entry while it's already in xarray. 
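As a concrete illustration of the corner case (addresses are hypothetical): if the remaining range to process is [0x400000, 0x401000) and the vma found after re-taking the mmap lock is now backed by 2 MiB hugepages, end is rounded down to 0x400000 and becomes equal to start; with this patch the zero-length request returns 0 and madvise_walk_vmas() moves on to the next vma instead of tripping the warning.
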
@@ -1510,13 +1514,13 @@ static ssize_t zswap_store_page(struct page *page, zswap_lru_add(&zswap_list_lru, entry); } - return entry->length; + return true; store_failed: zpool_free(pool->zpool, entry->handle); compress_failed: zswap_entry_cache_free(entry); - return -EINVAL; + return false; } bool zswap_store(struct folio *folio) @@ -1526,7 +1530,6 @@ bool zswap_store(struct folio *folio) struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg = NULL; struct zswap_pool *pool; - size_t compressed_bytes = 0; bool ret = false; long index; @@ -1564,20 +1567,14 @@ bool zswap_store(struct folio *folio) for (index = 0; index < nr_pages; ++index) { struct page *page = folio_page(folio, index); - ssize_t bytes; - bytes = zswap_store_page(page, objcg, pool); - if (bytes < 0) + if (!zswap_store_page(page, objcg, pool)) goto put_pool; - compressed_bytes += bytes; } - if (objcg) { - obj_cgroup_charge_zswap(objcg, compressed_bytes); + if (objcg) count_objcg_events(objcg, ZSWPOUT, nr_pages); - } - atomic_long_add(nr_pages, &zswap_stored_pages); count_vm_events(ZSWPOUT, nr_pages); ret = true; -- GitLab From 2ede647a6fde3e54a6bfda7cf01c716649655900 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Ca=C3=B1uelo=20Navarro?= Date: Mon, 3 Feb 2025 08:52:06 +0100 Subject: [PATCH 698/989] mm,madvise,hugetlb: check for 0-length range after end address adjustment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a sanity check to madvise_dontneed_free() to address a corner case in madvise where a race condition causes the current vma being processed to be backed by a different page size. During a madvise(MADV_DONTNEED) call on a memory region registered with a userfaultfd, there's a period of time where the process mm lock is temporarily released in order to send a UFFD_EVENT_REMOVE and let userspace handle the event. During this time, the vma covering the current address range may change due to an explicit mmap done concurrently by another thread. If, after that change, the memory region, which was originally backed by 4KB pages, is now backed by hugepages, the end address is rounded down to a hugepage boundary to avoid data loss (see "Fixes" below). This rounding may cause the end address to be truncated to the same address as the start. Make this corner case follow the same semantics as in other similar cases where the requested region has zero length (ie. return 0). This will make madvise_walk_vmas() continue to the next vma in the range (this time holding the process mm lock) which, due to the prev pointer becoming stale because of the vma change, will be the same hugepage-backed vma that was just checked before. The next time madvise_dontneed_free() runs for this vma, if the start address isn't aligned to a hugepage boundary, it'll return -EINVAL, which is also in line with the madvise api. From userspace perspective, madvise() will return EINVAL because the start address isn't aligned according to the new vma alignment requirements (hugepage), even though it was correctly page-aligned when the call was issued. 
Link: https://lkml.kernel.org/r/20250203075206.1452208-1-rcn@igalia.com Fixes: 8ebe0a5eaaeb ("mm,madvise,hugetlb: fix unexpected data loss with MADV_DONTNEED on hugetlbfs") Signed-off-by: Ricardo Cañuelo Navarro Reviewed-by: Oscar Salvador Cc: Florent Revest Cc: Rik van Riel Cc: Signed-off-by: Andrew Morton --- mm/madvise.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mm/madvise.c b/mm/madvise.c index 49f3a75046f63..08b207f8e61ef 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -933,7 +933,16 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, */ end = vma->vm_end; } - VM_WARN_ON(start >= end); + /* + * If the memory region between start and end was + * originally backed by 4kB pages and then remapped to + * be backed by hugepages while mmap_lock was dropped, + * the adjustment for hugetlb vma above may have rounded + * end down to the start address. + */ + if (start == end) + return 0; + VM_WARN_ON(start > end); } if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) -- GitLab From 639375b0aa4323fe59b5fe2a6ebc68b022c36f50 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Wed, 5 Feb 2025 12:01:00 -0800 Subject: [PATCH 699/989] .mailmap: add entries for Jeff Johnson Map past iterations of my e-mail addresses to the current one. Link: https://lkml.kernel.org/r/20250205-jjohnson-mailmap-v1-1-269cb7b1710d@oss.qualcomm.com Signed-off-by: Jeff Johnson Signed-off-by: Andrew Morton --- .mailmap | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index fedebf86640ad..c96dbe259c228 100644 --- a/.mailmap +++ b/.mailmap @@ -317,6 +317,8 @@ Jayachandran C Jean Tourrilhes Jeevan Shriram Jeff Garzik +Jeff Johnson +Jeff Johnson Jeff Layton Jeff Layton Jeff Layton -- GitLab From 3219585e894c12cbffd4ac93d3e6783d236f146e Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 5 Feb 2025 14:04:57 +0800 Subject: [PATCH 700/989] mailmap: add entry for Feng Tang Map my old business email to personal email. Link: https://lkml.kernel.org/r/20250205060457.53667-1-feng.tang@linux.alibaba.com Signed-off-by: Feng Tang Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index c96dbe259c228..f34af946180f8 100644 --- a/.mailmap +++ b/.mailmap @@ -226,6 +226,7 @@ Fangrui Song Felipe W Damasio Felix Kuhling Felix Moeller +Feng Tang Fenglin Wu Filipe Lautert Finn Thain -- GitLab From 035d3c778709680288b3954ee896043132bc3f8d Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 10 Feb 2025 12:05:18 -0800 Subject: [PATCH 701/989] tools/mm: fix build warnings with musl-libc musl-libc warns about the following: /home/florian/dev/buildroot/output/arm64/rpi4-b/host/aarch64-buildroot-linux-musl/sysroot/usr/include/sys/errno.h:1:2: attention: #warning redirecting incorrect #include to [-Wcpp] 1 | #warning redirecting incorrect #include to | ^~~~~~~ /home/florian/dev/buildroot/output/arm64/rpi4-b/host/aarch64-buildroot-linux-musl/sysroot/usr/include/sys/fcntl.h:1:2: attention: #warning redirecting incorrect #include to [-Wcpp] 1 | #warning redirecting incorrect #include to | ^~~~~~~ include errno.h and fcntl.h directly. 
Link: https://lkml.kernel.org/r/20250210200518.1137295-1-florian.fainelli@broadcom.com Signed-off-by: Florian Fainelli Signed-off-by: Andrew Morton --- tools/mm/page-types.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/mm/page-types.c b/tools/mm/page-types.c index bcac7ebfb51fd..d7e5e8902af86 100644 --- a/tools/mm/page-types.c +++ b/tools/mm/page-types.c @@ -24,8 +24,8 @@ #include #include #include -#include -#include +#include +#include #include #include #include -- GitLab From 41cddf83d8b00f29fd105e7a0777366edc69a5cf Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 10 Feb 2025 17:13:17 +0100 Subject: [PATCH 702/989] mm/migrate_device: don't add folio to be freed to LRU in migrate_device_finalize() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If migration succeeded, we called folio_migrate_flags()->mem_cgroup_migrate() to migrate the memcg from the old to the new folio. This will set memcg_data of the old folio to 0. Similarly, if migration failed, memcg_data of the dst folio is left unset. If we call folio_putback_lru() on such folios (memcg_data == 0), we will add the folio to be freed to the LRU, making memcg code unhappy. Running the hmm selftests: # ./hmm-tests ... # RUN hmm.hmm_device_private.migrate ... [ 102.078007][T14893] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x7ff27d200 pfn:0x13cc00 [ 102.079974][T14893] anon flags: 0x17ff00000020018(uptodate|dirty|swapbacked|node=0|zone=2|lastcpupid=0x7ff) [ 102.082037][T14893] raw: 017ff00000020018 dead000000000100 dead000000000122 ffff8881353896c9 [ 102.083687][T14893] raw: 00000007ff27d200 0000000000000000 00000001ffffffff 0000000000000000 [ 102.085331][T14893] page dumped because: VM_WARN_ON_ONCE_FOLIO(!memcg && !mem_cgroup_disabled()) [ 102.087230][T14893] ------------[ cut here ]------------ [ 102.088279][T14893] WARNING: CPU: 0 PID: 14893 at ./include/linux/memcontrol.h:726 folio_lruvec_lock_irqsave+0x10e/0x170 [ 102.090478][T14893] Modules linked in: [ 102.091244][T14893] CPU: 0 UID: 0 PID: 14893 Comm: hmm-tests Not tainted 6.13.0-09623-g6c216bc522fd #151 [ 102.093089][T14893] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-2.fc40 04/01/2014 [ 102.094848][T14893] RIP: 0010:folio_lruvec_lock_irqsave+0x10e/0x170 [ 102.096104][T14893] Code: ... [ 102.099908][T14893] RSP: 0018:ffffc900236c37b0 EFLAGS: 00010293 [ 102.101152][T14893] RAX: 0000000000000000 RBX: ffffea0004f30000 RCX: ffffffff8183f426 [ 102.102684][T14893] RDX: ffff8881063cb880 RSI: ffffffff81b8117f RDI: ffff8881063cb880 [ 102.104227][T14893] RBP: 0000000000000000 R08: 0000000000000005 R09: 0000000000000000 [ 102.105757][T14893] R10: 0000000000000001 R11: 0000000000000002 R12: ffffc900236c37d8 [ 102.107296][T14893] R13: ffff888277a2bcb0 R14: 000000000000001f R15: 0000000000000000 [ 102.108830][T14893] FS: 00007ff27dbdd740(0000) GS:ffff888277a00000(0000) knlGS:0000000000000000 [ 102.110643][T14893] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 102.111924][T14893] CR2: 00007ff27d400000 CR3: 000000010866e000 CR4: 0000000000750ef0 [ 102.113478][T14893] PKRU: 55555554 [ 102.114172][T14893] Call Trace: [ 102.114805][T14893] [ 102.115397][T14893] ? folio_lruvec_lock_irqsave+0x10e/0x170 [ 102.116547][T14893] ? __warn.cold+0x110/0x210 [ 102.117461][T14893] ? folio_lruvec_lock_irqsave+0x10e/0x170 [ 102.118667][T14893] ? report_bug+0x1b9/0x320 [ 102.119571][T14893] ? handle_bug+0x54/0x90 [ 102.120494][T14893] ? exc_invalid_op+0x17/0x50 [ 102.121433][T14893] ? 
asm_exc_invalid_op+0x1a/0x20 [ 102.122435][T14893] ? __wake_up_klogd.part.0+0x76/0xd0 [ 102.123506][T14893] ? dump_page+0x4f/0x60 [ 102.124352][T14893] ? folio_lruvec_lock_irqsave+0x10e/0x170 [ 102.125500][T14893] folio_batch_move_lru+0xd4/0x200 [ 102.126577][T14893] ? __pfx_lru_add+0x10/0x10 [ 102.127505][T14893] __folio_batch_add_and_move+0x391/0x720 [ 102.128633][T14893] ? __pfx_lru_add+0x10/0x10 [ 102.129550][T14893] folio_putback_lru+0x16/0x80 [ 102.130564][T14893] migrate_device_finalize+0x9b/0x530 [ 102.131640][T14893] dmirror_migrate_to_device.constprop.0+0x7c5/0xad0 [ 102.133047][T14893] dmirror_fops_unlocked_ioctl+0x89b/0xc80 Likely, nothing else goes wrong: putting the last folio reference will remove the folio from the LRU again. So besides memcg complaining, adding the folio to be freed to the LRU is just an unnecessary step. The new flow resembles what we have in migrate_folio_move(): add the dst to the lru, remove migration ptes, unlock and unref dst. Link: https://lkml.kernel.org/r/20250210161317.717936-1-david@redhat.com Fixes: 8763cb45ab96 ("mm/migrate: new memory migration helper for use with device memory") Signed-off-by: David Hildenbrand Cc: Jérôme Glisse Cc: John Hubbard Cc: Alistair Popple Cc: Signed-off-by: Andrew Morton --- mm/migrate_device.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 9cf26592ac934..5bd888223cc8b 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -840,20 +840,15 @@ void migrate_device_finalize(unsigned long *src_pfns, dst = src; } + if (!folio_is_zone_device(dst)) + folio_add_lru(dst); remove_migration_ptes(src, dst, 0); folio_unlock(src); - - if (folio_is_zone_device(src)) - folio_put(src); - else - folio_putback_lru(src); + folio_put(src); if (dst != src) { folio_unlock(dst); - if (folio_is_zone_device(dst)) - folio_put(dst); - else - folio_putback_lru(dst); + folio_put(dst); } } } -- GitLab From 2272dbc471037b78f308b44351ab1b9f88d32628 Mon Sep 17 00:00:00 2001 From: Wang Yaxin Date: Sat, 8 Feb 2025 14:44:00 +0800 Subject: [PATCH 703/989] getdelays: fix error format characters getdelays had a compilation issue because the format string was not updated when the "delay min" was added. For example, after adding the "delay min" in printf, there were 7 strings but only 6 "%s" format specifiers. Similarly, after adding the 't->cpu_delay_total', there were 7 variables but only 6 format characters specifiers, causing compilation issues as follows. This commit fixes these issues to ensure that getdelays compiles correctly. root@xx:~/linux-next/tools/accounting$ make getdelays.c:199:9: warning: format `%llu' expects argument of type `long long unsigned int', but argument 8 has type `char *' [-Wformat=] 199 | printf("\n\nCPU %15s%15s%15s%15s%15s%15s\n" | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ..... 216 | "delay total", "delay average", "delay max", "delay min", | ~~~~~~~~~~~ | | | char * getdelays.c:200:21: note: format string is defined here 200 | " %15llu%15llu%15llu%15llu%15.3fms%13.6fms\n" | ~~~~~^ | | | long long unsigned int | %15s getdelays.c:199:9: warning: format `%f' expects argument of type `double', but argument 12 has type `long long unsigned int' [-Wformat=] 199 | printf("\n\nCPU %15s%15s%15s%15s%15s%15s\n" | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ..... 220 | (unsigned long long)t->cpu_delay_total, | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | | | long long unsigned int ..... 
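The underlying mistake is generic C and is easiest to see in isolation; a minimal sketch (not the getdelays code) of a format string that is one conversion short of its argument list:

```
#include <stdio.h>

int main(void)
{
	/*
	 * Broken pattern, analogous to the warning above: three column
	 * headers but only two "%s" conversions, so the third string
	 * would be matched against the "%llu" on the next line:
	 *
	 *   printf("%15s%15s\n" "%15llu\n", "count", "total", "max", 1ULL);
	 *
	 * Fixed pattern: one conversion per argument, which is what the
	 * patch restores for every row that grew delay max/min columns.
	 */
	printf("%15s%15s%15s\n" "%15llu\n", "count", "total", "max", 1ULL);
	return 0;
}
```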
Link: https://lkml.kernel.org/r/20250208144400544RduNRhwIpT3m2JyRBqskZ@zte.com.cn Fixes: f65c64f311ee ("delayacct: add delay min to record delay peak") Reviewed-by: xu xin Signed-off-by: Wang Yaxin Signed-off-by: Kun Jiang Cc: Balbir Singh Cc: David Hildenbrand Cc: Fan Yu Cc: Peilin He Cc: Qiang Tu Cc: wangyong Cc: ye xingchen Cc: Yunkai Zhang Signed-off-by: Andrew Morton --- tools/accounting/getdelays.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index 100ad3dc091a2..3feac0482fe90 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c @@ -196,22 +196,22 @@ static int get_family_id(int sd) static void print_delayacct(struct taskstats *t) { - printf("\n\nCPU %15s%15s%15s%15s%15s%15s\n" - " %15llu%15llu%15llu%15llu%15.3fms%13.6fms\n" - "IO %15s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms\n" - "SWAP %15s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms\n" - "RECLAIM %12s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms\n" - "THRASHING%12s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms\n" - "COMPACT %12s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms\n" - "WPCOPY %12s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms\n" - "IRQ %15s%15s%15s%15s\n" - " %15llu%15llu%15.3fms%13.6fms\n", + printf("\n\nCPU %15s%15s%15s%15s%15s%15s%15s\n" + " %15llu%15llu%15llu%15llu%15.3fms%13.6fms%13.6fms\n" + "IO %15s%15s%15s%15s%15s\n" + " %15llu%15llu%15.3fms%13.6fms%13.6fms\n" + "SWAP %15s%15s%15s%15s%15s\n" + " %15llu%15llu%15.3fms%13.6fms%13.6fms\n" + "RECLAIM %12s%15s%15s%15s%15s\n" + " %15llu%15llu%15.3fms%13.6fms%13.6fms\n" + "THRASHING%12s%15s%15s%15s%15s\n" + " %15llu%15llu%15.3fms%13.6fms%13.6fms\n" + "COMPACT %12s%15s%15s%15s%15s\n" + " %15llu%15llu%15.3fms%13.6fms%13.6fms\n" + "WPCOPY %12s%15s%15s%15s%15s\n" + " %15llu%15llu%15.3fms%13.6fms%13.6fms\n" + "IRQ %15s%15s%15s%15s%15s\n" + " %15llu%15llu%15.3fms%13.6fms%13.6fms\n", "count", "real total", "virtual total", "delay total", "delay average", "delay max", "delay min", (unsigned long long)t->cpu_count, -- GitLab From b016d0873777462e55af4c615104cc684fce086d Mon Sep 17 00:00:00 2001 From: Wang Yaxin Date: Sat, 8 Feb 2025 14:49:01 +0800 Subject: [PATCH 704/989] taskstats: modify taskstats version After adding "delay max" and "delay min" to the taskstats structure, the taskstats version needs to be updated. 
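Userspace readers can gate on the bumped version before interpreting the new fields; a short sketch (struct taskstats and its version member come from the UAPI header, the reporting helper is illustrative):

```
#include <stdio.h>
#include <linux/taskstats.h>

/* Only interpret the "delay max"/"delay min" members when the kernel
 * reports a taskstats version that carries them (15, per this patch). */
void report_version(const struct taskstats *t)
{
	if (t->version >= 15)
		printf("taskstats v%u: delay min/max fields are valid\n",
		       (unsigned int)t->version);
	else
		printf("taskstats v%u: delay min/max fields absent\n",
		       (unsigned int)t->version);
}
```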
Link: https://lkml.kernel.org/r/20250208144901218Q5ptVpqsQkb2MOEmW4Ujn@zte.com.cn Fixes: f65c64f311ee ("delayacct: add delay min to record delay peak") Signed-off-by: Wang Yaxin Signed-off-by: Kun Jiang Reviewed-by: xu xin Signed-off-by: Andrew Morton --- include/uapi/linux/taskstats.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index 934e20ef7f793..95762232e0186 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -34,7 +34,7 @@ */ -#define TASKSTATS_VERSION 14 +#define TASKSTATS_VERSION 15 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */ -- GitLab From f39edcf6349abb2ca2df96acc8645f4d2631d0a7 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Tue, 11 Feb 2025 15:26:25 +0800 Subject: [PATCH 705/989] mm: pgtable: fix incorrect reclaim of non-empty PTE pages In zap_pte_range(), if the pte lock was released midway, the pte entries may be refilled with physical pages by another thread, which may cause a non-empty PTE page to be reclaimed and eventually cause the system to crash. To fix it, fall back to the slow path in this case to recheck if all pte entries are still none. Link: https://lkml.kernel.org/r/20250211072625.89188-1-zhengqi.arch@bytedance.com Fixes: 6375e95f381e ("mm: pgtable: reclaim empty PTE page in madvise(MADV_DONTNEED)") Signed-off-by: Qi Zheng Reported-by: Christian Brauner Closes: https://lore.kernel.org/all/20250207-anbot-bankfilialen-acce9d79a2c7@brauner/ Reported-by: Qu Wenruo Closes: https://lore.kernel.org/all/152296f3-5c81-4a94-97f3-004108fba7be@gmx.com/ Tested-by: Zi Yan Cc: Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: David Hildenbrand Cc: Jann Horn Cc: Matthew Wilcox Cc: Muchun Song Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 539c0f7c6d545..b4d3d4893267c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1719,7 +1719,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pmd_t pmdval; unsigned long start = addr; bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details); - bool direct_reclaim = false; + bool direct_reclaim = true; int nr; retry: @@ -1734,8 +1734,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, do { bool any_skipped = false; - if (need_resched()) + if (need_resched()) { + direct_reclaim = false; break; + } nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss, &force_flush, &force_break, &any_skipped); @@ -1743,11 +1745,20 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, can_reclaim_pt = false; if (unlikely(force_break)) { addr += nr * PAGE_SIZE; + direct_reclaim = false; break; } } while (pte += nr, addr += PAGE_SIZE * nr, addr != end); - if (can_reclaim_pt && addr == end) + /* + * Fast path: try to hold the pmd lock and unmap the PTE page. + * + * If the pte lock was released midway (retry case), or if the attempt + * to hold the pmd lock failed, then we need to recheck all pte entries + * to ensure they are still none, thereby preventing the pte entries + * from being repopulated by another thread. 
+ */ + if (can_reclaim_pt && direct_reclaim && addr == end) direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval); add_mm_rss_vec(mm, rss); -- GitLab From 8648ee2622aefa5b567ebea71609822373995f37 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 11 Feb 2025 13:21:17 -0800 Subject: [PATCH 706/989] mailmap: update Nick's entry Link: https://lkml.kernel.org/r/20250211212117.3195265-1-ndesaulniers@google.com Signed-off-by: Nick Desaulniers Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index f34af946180f8..a897c16d3baef 100644 --- a/.mailmap +++ b/.mailmap @@ -534,6 +534,7 @@ Nicholas Piggin Nicholas Piggin Nicholas Piggin Nicholas Piggin +Nick Desaulniers Nicolas Ferre Nicolas Pitre Nicolas Pitre -- GitLab From 99333229dee41b992f3b0493f6aa2e3528138384 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 11 Feb 2025 08:18:19 +0000 Subject: [PATCH 707/989] memcg: avoid dead loop when setting memory.max A softlockup issue was found with stress test: watchdog: BUG: soft lockup - CPU#27 stuck for 26s! [migration/27:181] CPU: 27 UID: 0 PID: 181 Comm: migration/27 6.14.0-rc2-next-20250210 #1 Stopper: multi_cpu_stop <- stop_machine_from_inactive_cpu RIP: 0010:stop_machine_yield+0x2/0x10 RSP: 0000:ff4a0dcecd19be48 EFLAGS: 00000246 RAX: ffffffff89c0108f RBX: ff4a0dcec03afe44 RCX: 0000000000000000 RDX: ff1cdaaf6eba5808 RSI: 0000000000000282 RDI: ff1cda80c1775a40 RBP: 0000000000000001 R08: 00000011620096c6 R09: 7fffffffffffffff R10: 0000000000000001 R11: 0000000000000100 R12: ff1cda80c1775a40 R13: 0000000000000000 R14: 0000000000000001 R15: ff4a0dcec03afe20 FS: 0000000000000000(0000) GS:ff1cdaaf6eb80000(0000) CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000025e2c2a001 CR4: 0000000000773ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: multi_cpu_stop+0x8f/0x100 cpu_stopper_thread+0x90/0x140 smpboot_thread_fn+0xad/0x150 kthread+0xc2/0x100 ret_from_fork+0x2d/0x50 The stress test involves CPU hotplug operations and memory control group (memcg) operations. The scenario can be described as follows: echo xx > memory.max cache_ap_online oom_reaper (CPU23) (CPU50) xx < usage stop_machine_from_inactive_cpu for(;;) // all active cpus trigger OOM queue_stop_cpus_work // waiting oom_reaper multi_cpu_stop(migration/xx) // sync all active cpus ack // waiting cpu23 ack // CPU50 loops in multi_cpu_stop waiting cpu50 Detailed explanation: 1. When the usage is larger than xx, an OOM may be triggered. If the process does not handle with ths kill signal immediately, it will loop in the memory_max_write. 2. When cache_ap_online is triggered, the multi_cpu_stop is queued to the active cpus. Within the multi_cpu_stop function, it attempts to synchronize the CPU states. However, the CPU23 didn't acknowledge because it is stuck in a loop within the for(;;). 3. The oom_reaper process is blocked because CPU50 is in a loop, waiting for CPU23 to acknowledge the synchronization request. 4. Finally, it formed cyclic dependency and lead to softlockup and dead loop. To fix this issue, add cond_resched() in the memory_max_write, so that it will not block migration task. 
Link: https://lkml.kernel.org/r/20250211081819.33307-1-chenridong@huaweicloud.com Fixes: b6e6edcfa405 ("mm: memcontrol: reclaim and OOM kill when shrinking memory.max below usage") Signed-off-by: Chen Ridong Acked-by: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Shakeel Butt Cc: Muchun Song Cc: Wang Weiyang Signed-off-by: Andrew Morton --- mm/memcontrol.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 46f8b372d212b..4de6acb9b8ecb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4166,6 +4166,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, memcg_memory_event(memcg, MEMCG_OOM); if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) break; + cond_resched(); } memcg_wb_domain_size_changed(memcg); -- GitLab From 6d7bc938adca9024a6b51cf55d9b0542b653b69c Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Mon, 10 Feb 2025 22:48:56 -0500 Subject: [PATCH 708/989] mm: hugetlb: avoid fallback for specific node allocation of 1G pages When using the HugeTLB kernel command-line to allocate 1G pages from a specific node, such as: default_hugepagesz=1G hugepages=1:1 If node 1 happens to not have enough memory for the requested number of 1G pages, the allocation falls back to other nodes. A quick way to reproduce this is by creating a KVM guest with a memory-less node and trying to allocate 1 1G page from it. Instead of failing, the allocation will fallback to other nodes. This defeats the purpose of node specific allocation. Also, specific node allocation for 2M pages don't have this behavior: the allocation will just fail for the pages it can't satisfy. This issue happens because HugeTLB calls memblock_alloc_try_nid_raw() for 1G boot-time allocation as this function falls back to other nodes if the allocation can't be satisfied. Use memblock_alloc_exact_nid_raw() instead, which ensures that the allocation will only be satisfied from the specified node. Link: https://lkml.kernel.org/r/20250211034856.629371-1-luizcap@redhat.com Fixes: b5389086ad7b ("hugetlbfs: extend the definition of hugepages parameter to support node allocation") Signed-off-by: Luiz Capitulino Acked-by: Oscar Salvador Acked-by: David Hildenbrand Cc: "Mike Rapoport (IBM)" Cc: Muchun Song Cc: Zhenguo Yao Cc: Frank van der Linden Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 65068671e460a..163190e89ea16 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3145,7 +3145,7 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) /* do node specific alloc */ if (nid != NUMA_NO_NODE) { - m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), + m = memblock_alloc_exact_nid_raw(huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!m) return 0; -- GitLab From 5dcf52e2ce0fe3c4516b1e494c1af6d3a69e30e7 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 12 Feb 2025 17:44:25 +0000 Subject: [PATCH 709/989] selftests/mm: fix check for running THP tests When testing if we should try to compact memory or drop caches before we run the THP or HugeTLB tests we use | as an or operator. This doesn't work since run_vmtests.sh is written in shell where this is used to pipe the output of the first argument into the second. Instead use the shell's -o operator. 
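The operator mix-up is easy to reproduce outside the selftest; a minimal /bin/sh sketch (shell, matching run_vmtests.sh):

```
#!/bin/sh
CATEGORY=thp

# Broken: "|" builds a pipeline of two [ ] commands, and the if only
# sees the exit status of the right-hand one, so CATEGORY=thp is missed.
if [ "$CATEGORY" = "thp" ] | [ "$CATEGORY" = "hugetlb" ]; then
	echo "pipe form: branch taken"
fi

# Fixed: "-o" is test(1)'s logical OR inside a single [ ] invocation.
if [ "$CATEGORY" = "thp" -o "$CATEGORY" = "hugetlb" ]; then
	echo "-o form: branch taken"
fi
```

Run as-is, only the second branch fires; with the `|` form the pre-test cleanup was effectively limited to hugetlb runs.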
Link: https://lkml.kernel.org/r/20250212-kselftest-mm-no-hugepages-v1-1-44702f538522@kernel.org Fixes: b433ffa8dbac ("selftests: mm: perform some system cleanup before using hugepages") Signed-off-by: Mark Brown Reviewed-by: Nico Pache Cc: Mariano Pache Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 333c468c26991..da7e266681031 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -220,7 +220,7 @@ run_test() { if test_selected ${CATEGORY}; then # On memory constrainted systems some tests can fail to allocate hugepages. # perform some cleanup before the test for a higher success rate. - if [ ${CATEGORY} == "thp" ] | [ ${CATEGORY} == "hugetlb" ]; then + if [ ${CATEGORY} == "thp" -o ${CATEGORY} == "hugetlb" ]; then echo 3 > /proc/sys/vm/drop_caches sleep 2 echo 1 > /proc/sys/vm/compact_memory -- GitLab From 4998a6fa2a31176d0882bdfa27d5d03b665ba19b Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Wed, 12 Feb 2025 09:35:20 -0800 Subject: [PATCH 710/989] MAINTAINERS: update Nick's contact info Updated .mailmap, but forgot these other places. Link: https://lkml.kernel.org/r/20250212173523.3979840-1-ndesaulniers@google.com Signed-off-by: Nick Desaulniers Signed-off-by: Andrew Morton --- Documentation/process/embargoed-hardware-issues.rst | 2 +- .../translations/sp_SP/process/embargoed-hardware-issues.rst | 2 +- MAINTAINERS | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/process/embargoed-hardware-issues.rst b/Documentation/process/embargoed-hardware-issues.rst index daebce49cfdf5..0e19d2f0d6bbe 100644 --- a/Documentation/process/embargoed-hardware-issues.rst +++ b/Documentation/process/embargoed-hardware-issues.rst @@ -308,7 +308,7 @@ an involved disclosed party. The current ambassadors list: Google Kees Cook - LLVM Nick Desaulniers + LLVM Nick Desaulniers ============= ======================================================== If you want your organization to be added to the ambassadors list, please diff --git a/Documentation/translations/sp_SP/process/embargoed-hardware-issues.rst b/Documentation/translations/sp_SP/process/embargoed-hardware-issues.rst index 7d4d694967c73..9d444b9c46d39 100644 --- a/Documentation/translations/sp_SP/process/embargoed-hardware-issues.rst +++ b/Documentation/translations/sp_SP/process/embargoed-hardware-issues.rst @@ -287,7 +287,7 @@ revelada involucrada. La lista de embajadores actuales: Google Kees Cook - LLVM Nick Desaulniers + LLVM Nick Desaulniers ============= ======================================================== Si quiere que su organización se añada a la lista de embajadores, por diff --git a/MAINTAINERS b/MAINTAINERS index efee40ea589f7..4e17764cb6ed4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5655,7 +5655,7 @@ F: .clang-format CLANG/LLVM BUILD SUPPORT M: Nathan Chancellor -R: Nick Desaulniers +R: Nick Desaulniers R: Bill Wendling R: Justin Stitt L: llvm@lists.linux.dev -- GitLab From ac7af1f57acd1e1d112b36e036584ca4bc4c284a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 17 Feb 2025 15:44:02 -0500 Subject: [PATCH 711/989] kasan: don't call find_vm_area() in a PREEMPT_RT kernel The following bug report was found when running a PREEMPT_RT debug kernel. 
BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 140605, name: kunit_try_catch preempt_count: 1, expected: 0 Call trace: rt_spin_lock+0x70/0x140 find_vmap_area+0x84/0x168 find_vm_area+0x1c/0x50 print_address_description.constprop.0+0x2a0/0x320 print_report+0x108/0x1f8 kasan_report+0x90/0xc8 Since commit e30a0361b851 ("kasan: make report_lock a raw spinlock"), report_lock was changed to raw_spinlock_t to fix another similar PREEMPT_RT problem. That alone isn't enough to cover other corner cases. print_address_description() is always invoked under the report_lock. The context under this lock is always atomic even on PREEMPT_RT. find_vm_area() acquires vmap_node::busy.lock which is a spinlock_t, becoming a sleeping lock on PREEMPT_RT and must not be acquired in atomic context. Don't invoke find_vm_area() on PREEMPT_RT and just print the address. Non-PREEMPT_RT builds remain unchanged. Add a DEFINE_WAIT_OVERRIDE_MAP() macro to tell lockdep that this lock nesting is allowed because the PREEMPT_RT part (which is invalid) has been taken care of. This macro was first introduced in commit 0cce06ba859a ("debugobjects,locking: Annotate debug_object_fill_pool() wait type violation"). Link: https://lkml.kernel.org/r/20250217204402.60533-1-longman@redhat.com Fixes: e30a0361b851 ("kasan: make report_lock a raw spinlock") Signed-off-by: Waiman Long Suggested-by: Andrey Konovalov Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitriy Vyukov Cc: Steven Rostedt Cc: Mariano Pache Cc: Sebastian Andrzej Siewior Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/report.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 3fe77a360f1c5..8357e1a33699b 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -370,6 +370,36 @@ static inline bool init_task_stack_addr(const void *addr) sizeof(init_thread_union.stack)); } +/* + * This function is invoked with report_lock (a raw_spinlock) held. A + * PREEMPT_RT kernel cannot call find_vm_area() as it will acquire a sleeping + * rt_spinlock. + * + * For !RT kernel, the PROVE_RAW_LOCK_NESTING config option will print a + * lockdep warning for this raw_spinlock -> spinlock dependency. This config + * option is enabled by default to ensure better test coverage to expose this + * kind of RT kernel problem. This lockdep splat, however, can be suppressed + * by using DEFINE_WAIT_OVERRIDE_MAP() if it serves a useful purpose and the + * invalid PREEMPT_RT case has been taken care of. + */ +static inline struct vm_struct *kasan_find_vm_area(void *addr) +{ + static DEFINE_WAIT_OVERRIDE_MAP(vmalloc_map, LD_WAIT_SLEEP); + struct vm_struct *va; + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return NULL; + + /* + * Suppress lockdep warning and fetch vmalloc area of the + * offending address. 
+ */ + lock_map_acquire_try(&vmalloc_map); + va = find_vm_area(addr); + lock_map_release(&vmalloc_map); + return va; +} + static void print_address_description(void *addr, u8 tag, struct kasan_report_info *info) { @@ -399,7 +429,7 @@ static void print_address_description(void *addr, u8 tag, } if (is_vmalloc_addr(addr)) { - struct vm_struct *va = find_vm_area(addr); + struct vm_struct *va = kasan_find_vm_area(addr); if (va) { pr_err("The buggy address belongs to the virtual mapping at\n" @@ -409,6 +439,8 @@ static void print_address_description(void *addr, u8 tag, pr_err("\n"); page = vmalloc_to_page(addr); + } else { + pr_err("The buggy address %px belongs to a vmalloc virtual mapping\n", addr); } } -- GitLab From 8344017aaf32a7532cff293eb3df7fd2265ebafd Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Feb 2025 00:36:59 +0800 Subject: [PATCH 712/989] test_xarray: fix failure in check_pause when CONFIG_XARRAY_MULTI is not defined In case CONFIG_XARRAY_MULTI is not defined, xa_store_order can store a multi-index entry but xas_for_each can't tell sbiling entry from valid entry. So the check_pause failed when we store a multi-index entry and wish xas_for_each can handle it normally. Avoid to store multi-index entry when CONFIG_XARRAY_MULTI is disabled to fix the failure. Link: https://lkml.kernel.org/r/20250213163659.414309-1-shikemeng@huaweicloud.com Fixes: c9ba5249ef8b ("Xarray: move forward index correctly in xas_pause()") Signed-off-by: Kemeng Shi Reported-by: Geert Uytterhoeven Closes: https://lore.kernel.org/r/CAMuHMdU_bfadUO=0OZ=AoQ9EAmQPA4wsLCBqohXR+QCeCKRn4A@mail.gmail.com Tested-by: Geert Uytterhoeven Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- lib/test_xarray.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/lib/test_xarray.c b/lib/test_xarray.c index 6932a26f4927c..0e865bab4a10b 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -1418,7 +1418,7 @@ static noinline void check_pause(struct xarray *xa) { XA_STATE(xas, xa, 0); void *entry; - unsigned int order; + int order; unsigned long index = 1; unsigned int count = 0; @@ -1450,7 +1450,7 @@ static noinline void check_pause(struct xarray *xa) xa_destroy(xa); index = 0; - for (order = XA_CHUNK_SHIFT; order > 0; order--) { + for (order = order_limit - 1; order >= 0; order--) { XA_BUG_ON(xa, xa_store_order(xa, index, order, xa_mk_index(index), GFP_KERNEL)); index += 1UL << order; @@ -1462,24 +1462,25 @@ static noinline void check_pause(struct xarray *xa) rcu_read_lock(); xas_for_each(&xas, entry, ULONG_MAX) { XA_BUG_ON(xa, entry != xa_mk_index(index)); - index += 1UL << (XA_CHUNK_SHIFT - count); + index += 1UL << (order_limit - count - 1); count++; } rcu_read_unlock(); - XA_BUG_ON(xa, count != XA_CHUNK_SHIFT); + XA_BUG_ON(xa, count != order_limit); index = 0; count = 0; - xas_set(&xas, XA_CHUNK_SIZE / 2 + 1); + /* test unaligned index */ + xas_set(&xas, 1 % (1UL << (order_limit - 1))); rcu_read_lock(); xas_for_each(&xas, entry, ULONG_MAX) { XA_BUG_ON(xa, entry != xa_mk_index(index)); - index += 1UL << (XA_CHUNK_SHIFT - count); + index += 1UL << (order_limit - count - 1); count++; xas_pause(&xas); } rcu_read_unlock(); - XA_BUG_ON(xa, count != XA_CHUNK_SHIFT); + XA_BUG_ON(xa, count != order_limit); xa_destroy(xa); -- GitLab From 02d954c0fdf91845169cdacc7405b120f90afe01 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 10 Feb 2025 16:32:50 +0100 Subject: [PATCH 713/989] sched: Compact RSEQ concurrency IDs with reduced threads and affinity When a process reduces its number of 
threads or clears bits in its CPU affinity mask, the mm_cid allocation should eventually converge towards smaller values. However, the change introduced by: commit 7e019dcc470f ("sched: Improve cache locality of RSEQ concurrency IDs for intermittent workloads") adds a per-mm/CPU recent_cid which is never unset unless a thread migrates. This is a tradeoff between: A) Preserving cache locality after a transition from many threads to few threads, or after reducing the hamming weight of the allowed CPU mask. B) Making the mm_cid upper bounds wrt nr threads and allowed CPU mask easy to document and understand. C) Allowing applications to eventually react to mm_cid compaction after reduction of the nr threads or allowed CPU mask, making the tracking of mm_cid compaction easier by shrinking it back towards 0 or not. D) Making sure applications that periodically reduce and then increase again the nr threads or allowed CPU mask still benefit from good cache locality with mm_cid. Introduce the following changes: * After shrinking the number of threads or reducing the number of allowed CPUs, reduce the value of max_nr_cid so expansion of CID allocation will preserve cache locality if the number of threads or allowed CPUs increase again. * Only re-use a recent_cid if it is within the max_nr_cid upper bound, else find the first available CID. Fixes: 7e019dcc470f ("sched: Improve cache locality of RSEQ concurrency IDs for intermittent workloads") Signed-off-by: Mathieu Desnoyers Signed-off-by: Gabriele Monaco Signed-off-by: Peter Zijlstra (Intel) Tested-by: Gabriele Monaco Link: https://lkml.kernel.org/r/20250210153253.460471-2-gmonaco@redhat.com --- include/linux/mm_types.h | 7 ++++--- kernel/sched/sched.h | 25 ++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6b27db7f94963..0234f14f2aa6b 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -875,10 +875,11 @@ struct mm_struct { */ unsigned int nr_cpus_allowed; /** - * @max_nr_cid: Maximum number of concurrency IDs allocated. + * @max_nr_cid: Maximum number of allowed concurrency + * IDs allocated. * - * Track the highest number of concurrency IDs allocated for the - * mm. + * Track the highest number of allowed concurrency IDs + * allocated for the mm. */ atomic_t max_nr_cid; /** diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b93c8c3dc05a5..c8512a9fb0229 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3698,10 +3698,28 @@ static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) { struct cpumask *cidmask = mm_cidmask(mm); struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - int cid = __this_cpu_read(pcpu_cid->recent_cid); + int cid, max_nr_cid, allowed_max_nr_cid; + /* + * After shrinking the number of threads or reducing the number + * of allowed cpus, reduce the value of max_nr_cid so expansion + * of cid allocation will preserve cache locality if the number + * of threads or allowed cpus increase again. + */ + max_nr_cid = atomic_read(&mm->max_nr_cid); + while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), + atomic_read(&mm->mm_users))), + max_nr_cid > allowed_max_nr_cid) { + /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */ + if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) { + max_nr_cid = allowed_max_nr_cid; + break; + } + } /* Try to re-use recent cid. This improves cache locality. 
*/ - if (!mm_cid_is_unset(cid) && !cpumask_test_and_set_cpu(cid, cidmask)) + cid = __this_cpu_read(pcpu_cid->recent_cid); + if (!mm_cid_is_unset(cid) && cid < max_nr_cid && + !cpumask_test_and_set_cpu(cid, cidmask)) return cid; /* * Expand cid allocation if the maximum number of concurrency @@ -3709,8 +3727,9 @@ static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) * and number of threads. Expanding cid allocation as much as * possible improves cache locality. */ - cid = atomic_read(&mm->max_nr_cid); + cid = max_nr_cid; while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) { + /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */ if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1)) continue; if (!cpumask_test_and_set_cpu(cid, cidmask)) -- GitLab From ec5fd50aeff9c9156304853c6d75eda852d4a2c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 17 Feb 2025 08:43:35 +0100 Subject: [PATCH 714/989] uprobes: Don't use %pK through printk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restricted pointers ("%pK") are not meant to be used through printk(). It can unintentionally expose security sensitive, raw pointer values. Use regular pointer formatting instead. For more background, see: https://lore.kernel.org/lkml/20250113171731-dc10e3c1-da64-4af0-b767-7c7070468023@linutronix.de/ Signed-off-by: Thomas Weißschuh Signed-off-by: Ingo Molnar Cc: Masami Hiramatsu Cc: Oleg Nesterov Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250217-restricted-pointers-uprobes-v1-1-e8cbe5bb22a7@linutronix.de --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2ca797cbe465f..bf2a87a0a3787 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -417,7 +417,7 @@ static void update_ref_ctr_warn(struct uprobe *uprobe, struct mm_struct *mm, short d) { pr_warn("ref_ctr %s failed for inode: 0x%lx offset: " - "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n", + "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n", d > 0 ? "increment" : "decrement", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, (unsigned long long) uprobe->ref_ctr_offset, mm); -- GitLab From 81570d6a7ad37033c7895811551a5a9023706eda Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Sat, 15 Feb 2025 10:56:55 +0100 Subject: [PATCH 715/989] gpiolib: protect gpio_chip with SRCU in array_info paths in multi get/set During the locking rework in GPIOLIB, we omitted one important use-case, namely: setting and getting values for GPIO descriptor arrays with array_info present. This patch does two things: first it makes struct gpio_array store the address of the underlying GPIO device and not chip. Next: it protects the chip with SRCU from removal in gpiod_get_array_value_complex() and gpiod_set_array_value_complex(). 
Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250215095655.23152-1-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 48 +++++++++++++++++++++++++++++------------- drivers/gpio/gpiolib.h | 4 ++-- 2 files changed, 35 insertions(+), 17 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 29110dc436f15..5529d8b65f6fb 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -3143,6 +3143,8 @@ static int gpiod_get_raw_value_commit(const struct gpio_desc *desc) static int gpio_chip_get_multiple(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { + lockdep_assert_held(&gc->gpiodev->srcu); + if (gc->get_multiple) return gc->get_multiple(gc, mask, bits); if (gc->get) { @@ -3173,6 +3175,7 @@ int gpiod_get_array_value_complex(bool raw, bool can_sleep, struct gpio_array *array_info, unsigned long *value_bitmap) { + struct gpio_chip *gc; int ret, i = 0; /* @@ -3184,10 +3187,15 @@ int gpiod_get_array_value_complex(bool raw, bool can_sleep, array_size <= array_info->size && (void *)array_info == desc_array + array_info->size) { if (!can_sleep) - WARN_ON(array_info->chip->can_sleep); + WARN_ON(array_info->gdev->can_sleep); + + guard(srcu)(&array_info->gdev->srcu); + gc = srcu_dereference(array_info->gdev->chip, + &array_info->gdev->srcu); + if (!gc) + return -ENODEV; - ret = gpio_chip_get_multiple(array_info->chip, - array_info->get_mask, + ret = gpio_chip_get_multiple(gc, array_info->get_mask, value_bitmap); if (ret) return ret; @@ -3468,6 +3476,8 @@ static void gpiod_set_raw_value_commit(struct gpio_desc *desc, bool value) static void gpio_chip_set_multiple(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { + lockdep_assert_held(&gc->gpiodev->srcu); + if (gc->set_multiple) { gc->set_multiple(gc, mask, bits); } else { @@ -3485,6 +3495,7 @@ int gpiod_set_array_value_complex(bool raw, bool can_sleep, struct gpio_array *array_info, unsigned long *value_bitmap) { + struct gpio_chip *gc; int i = 0; /* @@ -3496,14 +3507,19 @@ int gpiod_set_array_value_complex(bool raw, bool can_sleep, array_size <= array_info->size && (void *)array_info == desc_array + array_info->size) { if (!can_sleep) - WARN_ON(array_info->chip->can_sleep); + WARN_ON(array_info->gdev->can_sleep); + + guard(srcu)(&array_info->gdev->srcu); + gc = srcu_dereference(array_info->gdev->chip, + &array_info->gdev->srcu); + if (!gc) + return -ENODEV; if (!raw && !bitmap_empty(array_info->invert_mask, array_size)) bitmap_xor(value_bitmap, value_bitmap, array_info->invert_mask, array_size); - gpio_chip_set_multiple(array_info->chip, array_info->set_mask, - value_bitmap); + gpio_chip_set_multiple(gc, array_info->set_mask, value_bitmap); i = find_first_zero_bit(array_info->set_mask, array_size); if (i == array_size) @@ -4765,9 +4781,10 @@ struct gpio_descs *__must_check gpiod_get_array(struct device *dev, { struct gpio_desc *desc; struct gpio_descs *descs; + struct gpio_device *gdev; struct gpio_array *array_info = NULL; - struct gpio_chip *gc; int count, bitmap_size; + unsigned long dflags; size_t descs_size; count = gpiod_count(dev, con_id); @@ -4788,7 +4805,7 @@ struct gpio_descs *__must_check gpiod_get_array(struct device *dev, descs->desc[descs->ndescs] = desc; - gc = gpiod_to_chip(desc); + gdev = gpiod_to_gpio_device(desc); /* * If pin hardware number of array member 0 is also 0, select * its chip as a candidate for fast bitmap processing path. 
@@ -4796,8 +4813,8 @@ struct gpio_descs *__must_check gpiod_get_array(struct device *dev, if (descs->ndescs == 0 && gpio_chip_hwgpio(desc) == 0) { struct gpio_descs *array; - bitmap_size = BITS_TO_LONGS(gc->ngpio > count ? - gc->ngpio : count); + bitmap_size = BITS_TO_LONGS(gdev->ngpio > count ? + gdev->ngpio : count); array = krealloc(descs, descs_size + struct_size(array_info, invert_mask, 3 * bitmap_size), @@ -4817,7 +4834,7 @@ struct gpio_descs *__must_check gpiod_get_array(struct device *dev, array_info->desc = descs->desc; array_info->size = count; - array_info->chip = gc; + array_info->gdev = gdev; bitmap_set(array_info->get_mask, descs->ndescs, count - descs->ndescs); bitmap_set(array_info->set_mask, descs->ndescs, @@ -4830,7 +4847,7 @@ struct gpio_descs *__must_check gpiod_get_array(struct device *dev, continue; /* Unmark array members which don't belong to the 'fast' chip */ - if (array_info->chip != gc) { + if (array_info->gdev != gdev) { __clear_bit(descs->ndescs, array_info->get_mask); __clear_bit(descs->ndescs, array_info->set_mask); } @@ -4853,9 +4870,10 @@ struct gpio_descs *__must_check gpiod_get_array(struct device *dev, array_info->set_mask); } } else { + dflags = READ_ONCE(desc->flags); /* Exclude open drain or open source from fast output */ - if (gpiochip_line_is_open_drain(gc, descs->ndescs) || - gpiochip_line_is_open_source(gc, descs->ndescs)) + if (test_bit(FLAG_OPEN_DRAIN, &dflags) || + test_bit(FLAG_OPEN_SOURCE, &dflags)) __clear_bit(descs->ndescs, array_info->set_mask); /* Identify 'fast' pins which require invertion */ @@ -4867,7 +4885,7 @@ struct gpio_descs *__must_check gpiod_get_array(struct device *dev, if (array_info) dev_dbg(dev, "GPIO array info: chip=%s, size=%d, get_mask=%lx, set_mask=%lx, invert_mask=%lx\n", - array_info->chip->label, array_info->size, + array_info->gdev->label, array_info->size, *array_info->get_mask, *array_info->set_mask, *array_info->invert_mask); return descs; diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h index 83690f72f7e5c..147156ec502b2 100644 --- a/drivers/gpio/gpiolib.h +++ b/drivers/gpio/gpiolib.h @@ -114,7 +114,7 @@ extern const char *const gpio_suffixes[]; * * @desc: Array of pointers to the GPIO descriptors * @size: Number of elements in desc - * @chip: Parent GPIO chip + * @gdev: Parent GPIO device * @get_mask: Get mask used in fastpath * @set_mask: Set mask used in fastpath * @invert_mask: Invert mask used in fastpath @@ -126,7 +126,7 @@ extern const char *const gpio_suffixes[]; struct gpio_array { struct gpio_desc **desc; unsigned int size; - struct gpio_chip *chip; + struct gpio_device *gdev; unsigned long *get_mask; unsigned long *set_mask; unsigned long invert_mask[]; -- GitLab From 8fb5bb169d17cdd12c2dcc2e96830ed487d77a0f Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 13 Feb 2025 12:58:49 +0100 Subject: [PATCH 716/989] sockmap, vsock: For connectible sockets allow only connected sockmap expects all vsocks to have a transport assigned, which is expressed in vsock_proto::psock_update_sk_prot(). However, there is an edge case where an unconnected (connectible) socket may lose its previously assigned transport. This is handled with a NULL check in the vsock/BPF recv path. Another design detail is that listening vsocks are not supposed to have any transport assigned at all. Which implies they are not supported by the sockmap. But this is complicated by the fact that a socket, before switching to TCP_LISTEN, may have had some transport assigned during a failed connect() attempt. 
Hence, we may end up with a listening vsock in a sockmap, which blows up quickly: KASAN: null-ptr-deref in range [0x0000000000000120-0x0000000000000127] CPU: 7 UID: 0 PID: 56 Comm: kworker/7:0 Not tainted 6.14.0-rc1+ Workqueue: vsock-loopback vsock_loopback_work RIP: 0010:vsock_read_skb+0x4b/0x90 Call Trace: sk_psock_verdict_data_ready+0xa4/0x2e0 virtio_transport_recv_pkt+0x1ca8/0x2acc vsock_loopback_work+0x27d/0x3f0 process_one_work+0x846/0x1420 worker_thread+0x5b3/0xf80 kthread+0x35a/0x700 ret_from_fork+0x2d/0x70 ret_from_fork_asm+0x1a/0x30 For connectible sockets, instead of relying solely on the state of vsk->transport, tell sockmap to only allow those representing established connections. This aligns with the behaviour for AF_INET and AF_UNIX. Fixes: 634f1a7110b4 ("vsock: support sockmap") Signed-off-by: Michal Luczaj Acked-by: Stefano Garzarella Signed-off-by: Paolo Abeni --- net/core/sock_map.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index f1b9b3958792c..2f1be9baad057 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -541,6 +541,9 @@ static bool sock_map_sk_state_allowed(const struct sock *sk) return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN); if (sk_is_stream_unix(sk)) return (1 << sk->sk_state) & TCPF_ESTABLISHED; + if (sk_is_vsock(sk) && + (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) + return (1 << sk->sk_state) & TCPF_ESTABLISHED; return true; } -- GitLab From 857ae05549ee2542317e7084ecaa5f8536634dd9 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 13 Feb 2025 12:58:50 +0100 Subject: [PATCH 717/989] vsock/bpf: Warn on socket without transport In the spirit of commit 91751e248256 ("vsock: prevent null-ptr-deref in vsock_*[has_data|has_space]"), armorize the "impossible" cases with a warning. Fixes: 634f1a7110b4 ("vsock: support sockmap") Signed-off-by: Michal Luczaj Reviewed-by: Stefano Garzarella Signed-off-by: Paolo Abeni --- net/vmw_vsock/af_vsock.c | 3 +++ net/vmw_vsock/vsock_bpf.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 53a081d49d28a..7e3db87ae4333 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1189,6 +1189,9 @@ static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor) { struct vsock_sock *vsk = vsock_sk(sk); + if (WARN_ON_ONCE(!vsk->transport)) + return -ENODEV; + return vsk->transport->read_skb(vsk, read_actor); } diff --git a/net/vmw_vsock/vsock_bpf.c b/net/vmw_vsock/vsock_bpf.c index f201d9eca1df2..07b96d56f3a57 100644 --- a/net/vmw_vsock/vsock_bpf.c +++ b/net/vmw_vsock/vsock_bpf.c @@ -87,7 +87,7 @@ static int vsock_bpf_recvmsg(struct sock *sk, struct msghdr *msg, lock_sock(sk); vsk = vsock_sk(sk); - if (!vsk->transport) { + if (WARN_ON_ONCE(!vsk->transport)) { copied = -ENODEV; goto out; } -- GitLab From 8350695bfb169b1924626a68f76b369ad01f18f2 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 13 Feb 2025 12:58:51 +0100 Subject: [PATCH 718/989] selftest/bpf: Adapt vsock_delete_on_close to sockmap rejecting unconnected Commit 515745445e92 ("selftest/bpf: Add test for vsock removal from sockmap on close()") added test that checked if proto::close() callback was invoked on AF_VSOCK socket release. I.e. it verified that a close()d vsock does indeed get removed from the sockmap. It was done simply by creating a socket pair and attempting to replace a close()d one with its peer. 
Since, due to a recent change, sockmap does not allow updating index with a non-established connectible vsock, redo it with a freshly established one. Signed-off-by: Michal Luczaj Acked-by: Stefano Garzarella Signed-off-by: Paolo Abeni --- .../selftests/bpf/prog_tests/sockmap_basic.c | 40 ++++++++++--------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 884ad87783d59..21793d8c79e12 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -111,31 +111,35 @@ static void test_sockmap_create_update_free(enum bpf_map_type map_type) static void test_sockmap_vsock_delete_on_close(void) { - int err, c, p, map; - const int zero = 0; - - err = create_pair(AF_VSOCK, SOCK_STREAM, &c, &p); - if (!ASSERT_OK(err, "create_pair(AF_VSOCK)")) - return; + int map, c, p, err, zero = 0; map = bpf_map_create(BPF_MAP_TYPE_SOCKMAP, NULL, sizeof(int), sizeof(int), 1, NULL); - if (!ASSERT_GE(map, 0, "bpf_map_create")) { - close(c); - goto out; - } + if (!ASSERT_OK_FD(map, "bpf_map_create")) + return; - err = bpf_map_update_elem(map, &zero, &c, BPF_NOEXIST); - close(c); - if (!ASSERT_OK(err, "bpf_map_update")) - goto out; + err = create_pair(AF_VSOCK, SOCK_STREAM, &c, &p); + if (!ASSERT_OK(err, "create_pair")) + goto close_map; - err = bpf_map_update_elem(map, &zero, &p, BPF_NOEXIST); + if (xbpf_map_update_elem(map, &zero, &c, BPF_NOEXIST)) + goto close_socks; + + xclose(c); + xclose(p); + + err = create_pair(AF_VSOCK, SOCK_STREAM, &c, &p); + if (!ASSERT_OK(err, "create_pair")) + goto close_map; + + err = bpf_map_update_elem(map, &zero, &c, BPF_NOEXIST); ASSERT_OK(err, "after close(), bpf_map_update"); -out: - close(p); - close(map); +close_socks: + xclose(c); + xclose(p); +close_map: + xclose(map); } static void test_skmsg_helpers(enum bpf_map_type map_type) -- GitLab From 85928e9c436398abcac32a9afa2f591895dd497d Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 13 Feb 2025 12:58:52 +0100 Subject: [PATCH 719/989] selftest/bpf: Add vsock test for sockmap rejecting unconnected Verify that for a connectible AF_VSOCK socket, merely having a transport assigned is insufficient; socket must be connected for the sockmap to accept. This does not test datagram vsocks. Even though it hardly matters. VMCI is the only transport that features VSOCK_TRANSPORT_F_DGRAM, but it has an unimplemented vsock_transport::readskb() callback, making it unsupported by BPF/sockmap. 
Signed-off-by: Michal Luczaj Acked-by: Stefano Garzarella Signed-off-by: Paolo Abeni --- .../selftests/bpf/prog_tests/sockmap_basic.c | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 21793d8c79e12..05eb37935c3e2 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -1065,6 +1065,34 @@ static void test_sockmap_skb_verdict_vsock_poll(void) test_sockmap_pass_prog__destroy(skel); } +static void test_sockmap_vsock_unconnected(void) +{ + struct sockaddr_storage addr; + int map, s, zero = 0; + socklen_t alen; + + map = bpf_map_create(BPF_MAP_TYPE_SOCKMAP, NULL, sizeof(int), + sizeof(int), 1, NULL); + if (!ASSERT_OK_FD(map, "bpf_map_create")) + return; + + s = xsocket(AF_VSOCK, SOCK_STREAM, 0); + if (s < 0) + goto close_map; + + /* Fail connect(), but trigger transport assignment. */ + init_addr_loopback(AF_VSOCK, &addr, &alen); + if (!ASSERT_ERR(connect(s, sockaddr(&addr), alen), "connect")) + goto close_sock; + + ASSERT_ERR(bpf_map_update_elem(map, &zero, &s, BPF_ANY), "map_update"); + +close_sock: + xclose(s); +close_map: + xclose(map); +} + void test_sockmap_basic(void) { if (test__start_subtest("sockmap create_update_free")) @@ -1131,4 +1159,6 @@ void test_sockmap_basic(void) test_skmsg_helpers_with_link(BPF_MAP_TYPE_SOCKHASH); if (test__start_subtest("sockmap skb_verdict vsock poll")) test_sockmap_skb_verdict_vsock_poll(); + if (test__start_subtest("sockmap vsock unconnected")) + test_sockmap_vsock_unconnected(); } -- GitLab From fcd875445866a5219cf2be3101e276b21fc843f3 Mon Sep 17 00:00:00 2001 From: Christopher Lentocha Date: Tue, 18 Feb 2025 08:59:29 -0500 Subject: [PATCH 720/989] nvme-pci: quirk Acer FA100 for non-uniqueue identifiers In order for two Acer FA100 SSDs to work in one PC (in the case of myself, a Lenovo Legion T5 28IMB05), and not show one drive and not the other, and sometimes mix up what drive shows up (randomly), these two lines of code need to be added, and then both of the SSDs will show up and not conflict when booting off of one of them. If you boot up your computer with both SSDs installed without this patch, you may also randomly get into a kernel panic (if the initrd is not set up) or stuck in the initrd "/init" process, it is set up, however, if you do apply this patch, there should not be problems with booting or seeing both contents of the drive. Tested with the btrfs filesystem with a RAID configuration of having the root drive '/' combined to make two 256GB Acer FA100 SSDs become 512GB in total storage. Kernel Logs with patch applied (`dmesg -t | grep -i nvm`): ``` ... nvme 0000:04:00.0: platform quirk: setting simple suspend nvme nvme0: pci function 0000:04:00.0 nvme 0000:05:00.0: platform quirk: setting simple suspend nvme nvme1: pci function 0000:05:00.0 nvme nvme1: missing or invalid SUBNQN field. nvme nvme1: allocated 64 MiB host memory buffer. nvme nvme0: missing or invalid SUBNQN field. nvme nvme0: allocated 64 MiB host memory buffer. nvme nvme1: 8/0/0 default/read/poll queues nvme nvme1: Ignoring bogus Namespace Identifiers nvme nvme0: 8/0/0 default/read/poll queues nvme nvme0: Ignoring bogus Namespace Identifiers nvme0n1: p1 p2 ... ``` Kernel Logs with patch not applied (`dmesg -t | grep -i nvm`): ``` ... 
nvme 0000:04:00.0: platform quirk: setting simple suspend nvme nvme0: pci function 0000:04:00.0 nvme 0000:05:00.0: platform quirk: setting simple suspend nvme nvme1: pci function 0000:05:00.0 nvme nvme0: missing or invalid SUBNQN field. nvme nvme1: missing or invalid SUBNQN field. nvme nvme0: allocated 64 MiB host memory buffer. nvme nvme1: allocated 64 MiB host memory buffer. nvme nvme0: 8/0/0 default/read/poll queues nvme nvme1: 8/0/0 default/read/poll queues nvme nvme1: globally duplicate IDs for nsid 1 nvme nvme1: VID:DID 1dbe:5216 model:Acer SSD FA100 256GB firmware:1.Z.J.2X nvme0n1: p1 p2 ... ``` Signed-off-by: Christopher Lentocha Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 9197a5b173fdf..950289405ef28 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3706,6 +3706,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1cc1, 0x5350), /* ADATA XPG GAMMIX S50 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x1dbe, 0x5216), /* Acer/INNOGRIT FA100/5216 NVMe SSD */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1dbe, 0x5236), /* ADATA XPG GAMMIX S70 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1e49, 0x0021), /* ZHITAI TiPro5000 NVMe SSD */ -- GitLab From 84e009042d0f3dfe91bec60bcd208ee3f866cbcd Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Mon, 17 Feb 2025 17:08:27 +0100 Subject: [PATCH 721/989] nvme-tcp: add basic support for the C2HTermReq PDU Previously, the NVMe/TCP host driver did not handle the C2HTermReq PDU, instead printing "unsupported pdu type (3)" when received. This patch adds support for processing the C2HTermReq PDU, allowing the driver to print the Fatal Error Status field. 
Example of output: nvme nvme4: Received C2HTermReq (FES = Invalid PDU Header Field) Signed-off-by: Maurizio Lombardi Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 43 ++++++++++++++++++++++++++++++++++++++++ include/linux/nvme-tcp.h | 2 ++ 2 files changed, 45 insertions(+) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 841238f38fdda..038b35238c26d 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -763,6 +763,40 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, return 0; } +static void nvme_tcp_handle_c2h_term(struct nvme_tcp_queue *queue, + struct nvme_tcp_term_pdu *pdu) +{ + u16 fes; + const char *msg; + u32 plen = le32_to_cpu(pdu->hdr.plen); + + static const char * const msg_table[] = { + [NVME_TCP_FES_INVALID_PDU_HDR] = "Invalid PDU Header Field", + [NVME_TCP_FES_PDU_SEQ_ERR] = "PDU Sequence Error", + [NVME_TCP_FES_HDR_DIGEST_ERR] = "Header Digest Error", + [NVME_TCP_FES_DATA_OUT_OF_RANGE] = "Data Transfer Out Of Range", + [NVME_TCP_FES_R2T_LIMIT_EXCEEDED] = "R2T Limit Exceeded", + [NVME_TCP_FES_UNSUPPORTED_PARAM] = "Unsupported Parameter", + }; + + if (plen < NVME_TCP_MIN_C2HTERM_PLEN || + plen > NVME_TCP_MAX_C2HTERM_PLEN) { + dev_err(queue->ctrl->ctrl.device, + "Received a malformed C2HTermReq PDU (plen = %u)\n", + plen); + return; + } + + fes = le16_to_cpu(pdu->fes); + if (fes && fes < ARRAY_SIZE(msg_table)) + msg = msg_table[fes]; + else + msg = "Unknown"; + + dev_err(queue->ctrl->ctrl.device, + "Received C2HTermReq (FES = %s)\n", msg); +} + static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, unsigned int *offset, size_t *len) { @@ -784,6 +818,15 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, return 0; hdr = queue->pdu; + if (unlikely(hdr->type == nvme_tcp_c2h_term)) { + /* + * C2HTermReq never includes Header or Data digests. + * Skip the checks. + */ + nvme_tcp_handle_c2h_term(queue, (void *)queue->pdu); + return -EINVAL; + } + if (queue->hdr_digest) { ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen); if (unlikely(ret)) diff --git a/include/linux/nvme-tcp.h b/include/linux/nvme-tcp.h index e07e8978d691b..e435250fcb4d0 100644 --- a/include/linux/nvme-tcp.h +++ b/include/linux/nvme-tcp.h @@ -13,6 +13,8 @@ #define NVME_TCP_ADMIN_CCSZ SZ_8K #define NVME_TCP_DIGEST_LENGTH 4 #define NVME_TCP_MIN_MAXH2CDATA 4096 +#define NVME_TCP_MIN_C2HTERM_PLEN 24 +#define NVME_TCP_MAX_C2HTERM_PLEN 152 enum nvme_tcp_pfv { NVME_TCP_PFV_1_0 = 0x0, -- GitLab From 4082326807072b71496501b6a0c55ffe8d5092a5 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 7 Feb 2025 13:41:34 +0100 Subject: [PATCH 722/989] nvmet: Fix crash when a namespace is disabled The namespace percpu counter protects pending I/O, and we can only safely disable the namespace once the counter drops to zero.
Otherwise we end up with a crash when running blktests/nvme/058 (eg for loop transport): [ 2352.930426] [ T53909] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000005: 0000 [#1] PREEMPT SMP KASAN PTI [ 2352.930431] [ T53909] KASAN: null-ptr-deref in range [0x0000000000000028-0x000000000000002f] [ 2352.930434] [ T53909] CPU: 3 UID: 0 PID: 53909 Comm: kworker/u16:5 Tainted: G W 6.13.0-rc6 #232 [ 2352.930438] [ T53909] Tainted: [W]=WARN [ 2352.930440] [ T53909] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-3.fc41 04/01/2014 [ 2352.930443] [ T53909] Workqueue: nvmet-wq nvme_loop_execute_work [nvme_loop] [ 2352.930449] [ T53909] RIP: 0010:blkcg_set_ioprio+0x44/0x180 as the queue is already torn down when calling submit_bio(); So we need to init the percpu counter in nvmet_ns_enable(), and wait for it to drop to zero in nvmet_ns_disable() to avoid having I/O pending after the namespace has been disabled. Fixes: 74d16965d7ac ("nvmet-loop: avoid using mutex in IO hotpath") Signed-off-by: Hannes Reinecke Reviewed-by: Nilay Shroff Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Tested-by: Shin'ichiro Kawasaki Signed-off-by: Keith Busch --- drivers/nvme/target/core.c | 40 ++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index cdc4a09a6e8a4..2e741696f3712 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -606,6 +606,9 @@ int nvmet_ns_enable(struct nvmet_ns *ns) goto out_dev_put; } + if (percpu_ref_init(&ns->ref, nvmet_destroy_namespace, 0, GFP_KERNEL)) + goto out_pr_exit; + nvmet_ns_changed(subsys, ns->nsid); ns->enabled = true; xa_set_mark(&subsys->namespaces, ns->nsid, NVMET_NS_ENABLED); @@ -613,6 +616,9 @@ int nvmet_ns_enable(struct nvmet_ns *ns) out_unlock: mutex_unlock(&subsys->lock); return ret; +out_pr_exit: + if (ns->pr.enable) + nvmet_pr_exit_ns(ns); out_dev_put: list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid)); @@ -638,6 +644,19 @@ void nvmet_ns_disable(struct nvmet_ns *ns) mutex_unlock(&subsys->lock); + /* + * Now that we removed the namespaces from the lookup list, we + * can kill the per_cpu ref and wait for any remaining references + * to be dropped, as well as a RCU grace period for anyone only + * using the namepace under rcu_read_lock(). Note that we can't + * use call_rcu here as we need to ensure the namespaces have + * been fully destroyed before unloading the module. + */ + percpu_ref_kill(&ns->ref); + synchronize_rcu(); + wait_for_completion(&ns->disable_done); + percpu_ref_exit(&ns->ref); + if (ns->pr.enable) nvmet_pr_exit_ns(ns); @@ -660,22 +679,6 @@ void nvmet_ns_free(struct nvmet_ns *ns) if (ns->nsid == subsys->max_nsid) subsys->max_nsid = nvmet_max_nsid(subsys); - mutex_unlock(&subsys->lock); - - /* - * Now that we removed the namespaces from the lookup list, we - * can kill the per_cpu ref and wait for any remaining references - * to be dropped, as well as a RCU grace period for anyone only - * using the namepace under rcu_read_lock(). Note that we can't - * use call_rcu here as we need to ensure the namespaces have - * been fully destroyed before unloading the module. 
- */ - percpu_ref_kill(&ns->ref); - synchronize_rcu(); - wait_for_completion(&ns->disable_done); - percpu_ref_exit(&ns->ref); - - mutex_lock(&subsys->lock); subsys->nr_namespaces--; mutex_unlock(&subsys->lock); @@ -705,9 +708,6 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) ns->nsid = nsid; ns->subsys = subsys; - if (percpu_ref_init(&ns->ref, nvmet_destroy_namespace, 0, GFP_KERNEL)) - goto out_free; - if (ns->nsid > subsys->max_nsid) subsys->max_nsid = nsid; @@ -730,8 +730,6 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) return ns; out_exit: subsys->max_nsid = nvmet_max_nsid(subsys); - percpu_ref_exit(&ns->ref); -out_free: kfree(ns); out_unlock: mutex_unlock(&subsys->lock); -- GitLab From 3988ac1c67e6e84d2feb987d7b36d5791174b3da Mon Sep 17 00:00:00 2001 From: Ruozhu Li Date: Sun, 16 Feb 2025 20:49:56 +0800 Subject: [PATCH 723/989] nvmet-rdma: recheck queue state is LIVE in state lock in recv done The queue state checking in nvmet_rdma_recv_done is not in queue state lock.Queue state can transfer to LIVE in cm establish handler between state checking and state lock here, cause a silent drop of nvme connect cmd. Recheck queue state whether in LIVE state in state lock to prevent this issue. Signed-off-by: Ruozhu Li Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/rdma.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 1afd93026f9bf..2a4536ef61848 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -996,6 +996,27 @@ static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue, nvmet_req_complete(&cmd->req, status); } +static bool nvmet_rdma_recv_not_live(struct nvmet_rdma_queue *queue, + struct nvmet_rdma_rsp *rsp) +{ + unsigned long flags; + bool ret = true; + + spin_lock_irqsave(&queue->state_lock, flags); + /* + * recheck queue state is not live to prevent a race condition + * with RDMA_CM_EVENT_ESTABLISHED handler. + */ + if (queue->state == NVMET_RDMA_Q_LIVE) + ret = false; + else if (queue->state == NVMET_RDMA_Q_CONNECTING) + list_add_tail(&rsp->wait_list, &queue->rsp_wait_list); + else + nvmet_rdma_put_rsp(rsp); + spin_unlock_irqrestore(&queue->state_lock, flags); + return ret; +} + static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvmet_rdma_cmd *cmd = @@ -1038,17 +1059,9 @@ static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) rsp->n_rdma = 0; rsp->invalidate_rkey = 0; - if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) { - unsigned long flags; - - spin_lock_irqsave(&queue->state_lock, flags); - if (queue->state == NVMET_RDMA_Q_CONNECTING) - list_add_tail(&rsp->wait_list, &queue->rsp_wait_list); - else - nvmet_rdma_put_rsp(rsp); - spin_unlock_irqrestore(&queue->state_lock, flags); + if (unlikely(queue->state != NVMET_RDMA_Q_LIVE) && + nvmet_rdma_recv_not_live(queue, rsp)) return; - } nvmet_rdma_handle_command(queue, rsp); } -- GitLab From 68a5c91f01fc9f086567b260cced003ed9fdff3f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 13 Feb 2025 15:52:28 +0900 Subject: [PATCH 724/989] nvmet: pci-epf: Correctly initialize CSTS when enabling the controller The function nvmet_pci_epf_poll_cc_work() sets the NVME_CSTS_RDY bit of the controller status register (CSTS) when nvmet_pci_epf_enable_ctrl() returns success. However, since this function can be called several times (e.g. 
if the host reboots), instead of setting the bit in ctrl->csts, initialize this field to only have NVME_CSTS_RDY set. Conversely, if nvmet_pci_epf_enable_ctrl() fails, make sure to clear all bits from ctrl->csts. To simplify nvmet_pci_epf_poll_cc_work(), initialize ctrl->csts to NVME_CSTS_RDY directly inside nvmet_pci_epf_enable_ctrl() and clear this field in that function as well in case of a failure. To be consistent, move clearing the NVME_CSTS_RDY bit from ctrl->csts when the controller is being disabled from nvmet_pci_epf_poll_cc_work() into nvmet_pci_epf_disable_ctrl(). Fixes: 0faa0fe6f90e ("nvmet: New NVMe PCI endpoint function target driver") Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/pci-epf.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c index ac30b42cc6221..efd4623fb0022 100644 --- a/drivers/nvme/target/pci-epf.c +++ b/drivers/nvme/target/pci-epf.c @@ -1822,14 +1822,14 @@ static int nvmet_pci_epf_enable_ctrl(struct nvmet_pci_epf_ctrl *ctrl) if (ctrl->io_sqes < sizeof(struct nvme_command)) { dev_err(ctrl->dev, "Unsupported I/O SQES %zu (need %zu)\n", ctrl->io_sqes, sizeof(struct nvme_command)); - return -EINVAL; + goto err; } ctrl->io_cqes = 1UL << nvmet_cc_iocqes(ctrl->cc); if (ctrl->io_cqes < sizeof(struct nvme_completion)) { dev_err(ctrl->dev, "Unsupported I/O CQES %zu (need %zu)\n", ctrl->io_sqes, sizeof(struct nvme_completion)); - return -EINVAL; + goto err; } /* Create the admin queue. */ @@ -1844,7 +1844,7 @@ static int nvmet_pci_epf_enable_ctrl(struct nvmet_pci_epf_ctrl *ctrl) qsize, pci_addr, 0); if (status != NVME_SC_SUCCESS) { dev_err(ctrl->dev, "Failed to create admin completion queue\n"); - return -EINVAL; + goto err; } qsize = aqa & 0x00000fff; @@ -1854,17 +1854,22 @@ static int nvmet_pci_epf_enable_ctrl(struct nvmet_pci_epf_ctrl *ctrl) if (status != NVME_SC_SUCCESS) { dev_err(ctrl->dev, "Failed to create admin submission queue\n"); nvmet_pci_epf_delete_cq(ctrl->tctrl, 0); - return -EINVAL; + goto err; } ctrl->sq_ab = NVMET_PCI_EPF_SQ_AB; ctrl->irq_vector_threshold = NVMET_PCI_EPF_IV_THRESHOLD; ctrl->enabled = true; + ctrl->csts = NVME_CSTS_RDY; /* Start polling the controller SQs. */ schedule_delayed_work(&ctrl->poll_sqs, 0); return 0; + +err: + ctrl->csts = 0; + return -EINVAL; } static void nvmet_pci_epf_disable_ctrl(struct nvmet_pci_epf_ctrl *ctrl) @@ -1889,6 +1894,8 @@ static void nvmet_pci_epf_disable_ctrl(struct nvmet_pci_epf_ctrl *ctrl) /* Delete the admin queue last. 
*/ nvmet_pci_epf_delete_sq(ctrl->tctrl, 0); nvmet_pci_epf_delete_cq(ctrl->tctrl, 0); + + ctrl->csts &= ~NVME_CSTS_RDY; } static void nvmet_pci_epf_poll_cc_work(struct work_struct *work) @@ -1909,13 +1916,10 @@ static void nvmet_pci_epf_poll_cc_work(struct work_struct *work) ret = nvmet_pci_epf_enable_ctrl(ctrl); if (ret) return; - ctrl->csts |= NVME_CSTS_RDY; } - if (!nvmet_cc_en(new_cc) && nvmet_cc_en(old_cc)) { + if (!nvmet_cc_en(new_cc) && nvmet_cc_en(old_cc)) nvmet_pci_epf_disable_ctrl(ctrl); - ctrl->csts &= ~NVME_CSTS_RDY; - } if (nvmet_cc_shn(new_cc) && !nvmet_cc_shn(old_cc)) { nvmet_pci_epf_disable_ctrl(ctrl); -- GitLab From ffa35567632c0059e7f380ed155e26a07ec4153f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 13 Feb 2025 15:52:29 +0900 Subject: [PATCH 725/989] nvmet: pci-epf: Do not uselessly write the CSTS register The function nvmet_pci_epf_poll_cc_work() will do nothing if there are no changes to the controller configuration (CC) register. However, even for such case, this function still calls nvmet_update_cc() and uselessly writes the CSTS register. Avoid this by simply rescheduling the poll_cc work if the CC register has not changed. Also reschedule the poll_cc work if the function nvmet_pci_epf_enable_ctrl() fails to allow the host the chance to try again enabling the controller. While at it, since there is no point in trying to handle the CC register as quickly as possible, change the poll_cc work scheduling interval to 10 ms (from 5ms), to avoid excessive read accesses to that register. Fixes: 0faa0fe6f90e ("nvmet: New NVMe PCI endpoint function target driver") Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/pci-epf.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c index efd4623fb0022..b646a8f468ea9 100644 --- a/drivers/nvme/target/pci-epf.c +++ b/drivers/nvme/target/pci-epf.c @@ -46,7 +46,7 @@ static DEFINE_MUTEX(nvmet_pci_epf_ports_mutex); /* * BAR CC register and SQ polling intervals. */ -#define NVMET_PCI_EPF_CC_POLL_INTERVAL msecs_to_jiffies(5) +#define NVMET_PCI_EPF_CC_POLL_INTERVAL msecs_to_jiffies(10) #define NVMET_PCI_EPF_SQ_POLL_INTERVAL msecs_to_jiffies(5) #define NVMET_PCI_EPF_SQ_POLL_IDLE msecs_to_jiffies(5000) @@ -1910,12 +1910,15 @@ static void nvmet_pci_epf_poll_cc_work(struct work_struct *work) old_cc = ctrl->cc; new_cc = nvmet_pci_epf_bar_read32(ctrl, NVME_REG_CC); + if (new_cc == old_cc) + goto reschedule_work; + ctrl->cc = new_cc; if (nvmet_cc_en(new_cc) && !nvmet_cc_en(old_cc)) { ret = nvmet_pci_epf_enable_ctrl(ctrl); if (ret) - return; + goto reschedule_work; } if (!nvmet_cc_en(new_cc) && nvmet_cc_en(old_cc)) @@ -1932,6 +1935,7 @@ static void nvmet_pci_epf_poll_cc_work(struct work_struct *work) nvmet_update_cc(ctrl->tctrl, ctrl->cc); nvmet_pci_epf_bar_write32(ctrl, NVME_REG_CSTS, ctrl->csts); +reschedule_work: schedule_delayed_work(&ctrl->poll_cc, NVMET_PCI_EPF_CC_POLL_INTERVAL); } -- GitLab From 01ef7ff7dd3cd8cc71af8d1d1496be853281a948 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 13 Feb 2025 15:52:30 +0900 Subject: [PATCH 726/989] nvmet: pci-epf: Avoid RCU stalls under heavy workload The delayed work item function nvmet_pci_epf_poll_sqs_work() polls all submission queues and keeps running in a loop as long as commands are being submitted by the host. 
Depending on the preemption configuration of the kernel, under heavy command workload, this function can thus run for more than RCU_CPU_STALL_TIMEOUT seconds, leading to a RCU stall: rcu: INFO: rcu_sched self-detected stall on CPU rcu: 5-....: (20998 ticks this GP) idle=4244/1/0x4000000000000000 softirq=301/301 fqs=5132 rcu: (t=21000 jiffies g=-443 q=12 ncpus=8) CPU: 5 UID: 0 PID: 82 Comm: kworker/5:1 Not tainted 6.14.0-rc2 #1 Hardware name: Radxa ROCK 5B (DT) Workqueue: events nvmet_pci_epf_poll_sqs_work [nvmet_pci_epf] pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : dw_edma_device_tx_status+0xb8/0x130 lr : dw_edma_device_tx_status+0x9c/0x130 sp : ffff800080b5bbb0 x29: ffff800080b5bbb0 x28: ffff0331c5c78400 x27: ffff0331c1cd1960 x26: ffff0331c0e39010 x25: ffff0331c20e4000 x24: ffff0331c20e4a90 x23: 0000000000000000 x22: 0000000000000001 x21: 00000000005aca33 x20: ffff800080b5bc30 x19: ffff0331c123e370 x18: 000000000ab29e62 x17: ffffb2a878c9c118 x16: ffff0335bde82040 x15: 0000000000000000 x14: 000000000000017b x13: 00000000ee601780 x12: 0000000000000018 x11: 0000000000000000 x10: 0000000000000001 x9 : 0000000000000040 x8 : 00000000ee601780 x7 : 0000000105c785c0 x6 : ffff0331c1027d80 x5 : 0000000001ee7ad6 x4 : ffff0335bdea16c0 x3 : ffff0331c123e438 x2 : 00000000005aca33 x1 : 0000000000000000 x0 : ffff0331c123e410 Call trace: dw_edma_device_tx_status+0xb8/0x130 (P) dma_sync_wait+0x60/0xbc nvmet_pci_epf_dma_transfer+0x128/0x264 [nvmet_pci_epf] nvmet_pci_epf_poll_sqs_work+0x2a0/0x2e0 [nvmet_pci_epf] process_one_work+0x144/0x390 worker_thread+0x27c/0x458 kthread+0xe8/0x19c ret_from_fork+0x10/0x20 The solution for this is simply to explicitly allow rescheduling using cond_resched(). However, since doing so for every loop of nvmet_pci_epf_poll_sqs_work() significantly degrades performance (for 4K random reads using 4 I/O queues, the maximum IOPS goes down from 137 KIOPS to 110 KIOPS), call cond_resched() every second to avoid the RCU stalls. Fixes: 0faa0fe6f90e ("nvmet: New NVMe PCI endpoint function target driver") Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/pci-epf.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c index b646a8f468ea9..565d2bd36dcde 100644 --- a/drivers/nvme/target/pci-epf.c +++ b/drivers/nvme/target/pci-epf.c @@ -1694,6 +1694,7 @@ static void nvmet_pci_epf_poll_sqs_work(struct work_struct *work) struct nvmet_pci_epf_ctrl *ctrl = container_of(work, struct nvmet_pci_epf_ctrl, poll_sqs.work); struct nvmet_pci_epf_queue *sq; + unsigned long limit = jiffies; unsigned long last = 0; int i, nr_sqs; @@ -1708,6 +1709,16 @@ static void nvmet_pci_epf_poll_sqs_work(struct work_struct *work) nr_sqs++; } + /* + * If we have been running for a while, reschedule to let other + * tasks run and to avoid RCU stalls. 
+ */ + if (time_is_before_jiffies(limit + secs_to_jiffies(1))) { + cond_resched(); + limit = jiffies; + continue; + } + if (nr_sqs) { last = jiffies; continue; -- GitLab From cd513e0434c3e736c549bc99bf7982658b25114d Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 13 Feb 2025 15:52:31 +0900 Subject: [PATCH 727/989] nvme: tcp: Fix compilation warning with W=1 When compiling with W=1, a warning result for the function nvme_tcp_set_queue_io_cpu(): host/tcp.c:1578: warning: Function parameter or struct member 'queue' not described in 'nvme_tcp_set_queue_io_cpu' host/tcp.c:1578: warning: expecting prototype for Track the number of queues assigned to each cpu using a global per(). Prototype was for nvme_tcp_set_queue_io_cpu() instead Avoid this warning by using the regular comment format for the function nvme_tcp_set_queue_io_cpu() instead of the kdoc comment format. Fixes: 32193789878c ("nvme-tcp: Fix I/O queue cpu spreading for multiple controllers") Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 038b35238c26d..a7398e9e5f71c 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1608,7 +1608,7 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue) ctrl->io_queues[HCTX_TYPE_POLL]; } -/** +/* * Track the number of queues assigned to each cpu using a global per-cpu * counter and select the least used cpu from the mq_map. Our goal is to spread * different controllers I/O threads across different cpu cores. -- GitLab From 578539e0969028f711c34d9a4565931edfe1d730 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 24 Jan 2025 11:43:10 -0700 Subject: [PATCH 728/989] nvme-tcp: fix connect failure on receiving partial ICResp PDU nvme_tcp_init_connection() attempts to receive an ICResp PDU but only checks that the return value from recvmsg() is non-negative. If the sender closes the TCP connection or sends fewer than 128 bytes, this check will pass even though the full PDU wasn't received. Ensure the full ICResp PDU is received by checking that recvmsg() returns the expected 128 bytes. Additionally set the MSG_WAITALL flag for recvmsg(), as a sender could split the ICResp over multiple TCP frames. Without MSG_WAITALL, recvmsg() could return prematurely with only part of the PDU. 
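The same pitfall applies to any fixed-length read from a stream socket. A minimal userspace sketch of the pattern (not the driver code; the actual fix is in the diff below), assuming the 128-byte ICResp size quoted above:

```
/*
 * Sketch only: even with MSG_WAITALL, recv() can return short on EOF
 * or a signal, so the length still has to be checked explicitly.
 */
#include <errno.h>
#include <sys/socket.h>

#define ICRESP_LEN 128	/* size of an NVMe/TCP ICResp PDU */

static int recv_exact(int fd, void *buf, size_t len)
{
	ssize_t ret = recv(fd, buf, len, MSG_WAITALL);

	if (ret < 0)
		return -errno;		/* transport error */
	if ((size_t)ret != len)
		return -ECONNRESET;	/* peer closed or short read */
	return 0;
}
```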
Fixes: 3f2304f8c6d6 ("nvme-tcp: add NVMe over TCP host driver") Signed-off-by: Caleb Sander Mateos Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index a7398e9e5f71c..8a9131c95a3da 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1492,11 +1492,14 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) msg.msg_control = cbuf; msg.msg_controllen = sizeof(cbuf); } + msg.msg_flags = MSG_WAITALL; ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags); - if (ret < 0) { + if (ret < sizeof(*icresp)) { pr_warn("queue %d: failed to receive icresp, error %d\n", nvme_tcp_queue_id(queue), ret); + if (ret >= 0) + ret = -ECONNRESET; goto free_icresp; } ret = -ENOTCONN; -- GitLab From 487a3ea7b1b8ba2ca7d2c2bb3c3594dc360d6261 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 13 Feb 2025 10:05:14 -0700 Subject: [PATCH 729/989] nvme/ioctl: add missing space in err message nvme_validate_passthru_nsid() logs an err message whose format string is split over 2 lines. There is a missing space between the two pieces, resulting in log lines like "... does not match nsid (1)of namespace". Add the missing space between ")" and "of". Also combine the format string pieces onto a single line to make the err message easier to grep. Fixes: e7d4b5493a2d ("nvme: factor out a nvme_validate_passthru_nsid helper") Signed-off-by: Caleb Sander Mateos Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/ioctl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index e8930146847af..b1b46c2713e1c 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -283,8 +283,7 @@ static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl, { if (ns && nsid != ns->head->ns_id) { dev_err(ctrl->device, - "%s: nsid (%u) in cmd does not match nsid (%u)" - "of namespace\n", + "%s: nsid (%u) in cmd does not match nsid (%u) of namespace\n", current->comm, nsid, ns->head->ns_id); return false; } -- GitLab From d422247d14a53fe825b1778edf104167d8fd8f3f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 13 Feb 2025 15:49:59 +0900 Subject: [PATCH 730/989] nvme: Cleanup the definition of the controller config register fields Reorganized the enum used to define the fields of the contrller configuration (CC) register in include/linux/nvme.h to: 1) Group together all the values defined for each field. 2) Add the missing field masks definitions. 3) Add comments to describe the enum and each field. 
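As a hedged illustration of how the new mask/shift pairs are meant to be used (the helper names below are made up for the example; the in-tree nvmet_cc_*() accessors are converted in the following patch), decoding the MPS field and turning it into a host page size:

```
/* Example only: decode CC.MPS with the new mask/shift definitions. */
static inline u8 cc_mps(u32 cc)
{
	return (cc & NVME_CC_MPS_MASK) >> NVME_CC_MPS_SHIFT;
}

/* NVMe defines the memory page size as 2^(12 + MPS), so MPS=0 is 4 KiB. */
static inline unsigned long cc_page_size(u32 cc)
{
	return 1UL << (12 + cc_mps(cc));
}
```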
Signed-off-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index fe3b60818fdcf..2dc05b1c3283d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -199,28 +199,54 @@ enum { #define NVME_NVM_IOSQES 6 #define NVME_NVM_IOCQES 4 +/* + * Controller Configuration (CC) register (Offset 14h) + */ enum { + /* Enable (EN): bit 0 */ NVME_CC_ENABLE = 1 << 0, NVME_CC_EN_SHIFT = 0, + + /* Bits 03:01 are reserved (NVMe Base Specification rev 2.1) */ + + /* I/O Command Set Selected (CSS): bits 06:04 */ NVME_CC_CSS_SHIFT = 4, - NVME_CC_MPS_SHIFT = 7, - NVME_CC_AMS_SHIFT = 11, - NVME_CC_SHN_SHIFT = 14, - NVME_CC_IOSQES_SHIFT = 16, - NVME_CC_IOCQES_SHIFT = 20, + NVME_CC_CSS_MASK = 7 << NVME_CC_CSS_SHIFT, NVME_CC_CSS_NVM = 0 << NVME_CC_CSS_SHIFT, NVME_CC_CSS_CSI = 6 << NVME_CC_CSS_SHIFT, - NVME_CC_CSS_MASK = 7 << NVME_CC_CSS_SHIFT, + + /* Memory Page Size (MPS): bits 10:07 */ + NVME_CC_MPS_SHIFT = 7, + NVME_CC_MPS_MASK = 0xf << NVME_CC_MPS_SHIFT, + + /* Arbitration Mechanism Selected (AMS): bits 13:11 */ + NVME_CC_AMS_SHIFT = 11, + NVME_CC_AMS_MASK = 7 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_RR = 0 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_WRRU = 1 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_VS = 7 << NVME_CC_AMS_SHIFT, + + /* Shutdown Notification (SHN): bits 15:14 */ + NVME_CC_SHN_SHIFT = 14, + NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_NONE = 0 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_NORMAL = 1 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_ABRUPT = 2 << NVME_CC_SHN_SHIFT, - NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, + + /* I/O Submission Queue Entry Size (IOSQES): bits 19:16 */ + NVME_CC_IOSQES_SHIFT = 16, + NVME_CC_IOSQES_MASK = 0xf << NVME_CC_IOSQES_SHIFT, NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT, + + /* I/O Completion Queue Entry Size (IOCQES): bits 23:20 */ + NVME_CC_IOCQES_SHIFT = 20, + NVME_CC_IOCQES_MASK = 0xf << NVME_CC_IOCQES_SHIFT, NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT, + + /* Controller Ready Independent of Media Enable (CRIME): bit 24 */ NVME_CC_CRIME = 1 << 24, + + /* Bits 25:31 are reserved (NVMe Base Specification rev 2.1) */ }; enum { -- GitLab From 2ba8cf918f0d1873cd5430ae2cc3c41711a144d7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 13 Feb 2025 15:50:00 +0900 Subject: [PATCH 731/989] nvmet: Use enum definitions instead of hardcoded values Change the definition of the inline functions nvmet_cc_en(), nvmet_cc_css(), nvmet_cc_mps(), nvmet_cc_ams(), nvmet_cc_shn(), nvmet_cc_iosqes(), and nvmet_cc_iocqes() to use the enum difinitions in include/linux/nvme.h instead of hardcoded values. 
Signed-off-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/nvmet.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 4be8d22d2d8d4..d2c1233981e1a 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -784,37 +784,37 @@ u16 nvmet_report_invalid_opcode(struct nvmet_req *req); static inline bool nvmet_cc_en(u32 cc) { - return (cc >> NVME_CC_EN_SHIFT) & 0x1; + return (cc & NVME_CC_ENABLE) >> NVME_CC_EN_SHIFT; } static inline u8 nvmet_cc_css(u32 cc) { - return (cc >> NVME_CC_CSS_SHIFT) & 0x7; + return (cc & NVME_CC_CSS_MASK) >> NVME_CC_CSS_SHIFT; } static inline u8 nvmet_cc_mps(u32 cc) { - return (cc >> NVME_CC_MPS_SHIFT) & 0xf; + return (cc & NVME_CC_MPS_MASK) >> NVME_CC_MPS_SHIFT; } static inline u8 nvmet_cc_ams(u32 cc) { - return (cc >> NVME_CC_AMS_SHIFT) & 0x7; + return (cc & NVME_CC_AMS_MASK) >> NVME_CC_AMS_SHIFT; } static inline u8 nvmet_cc_shn(u32 cc) { - return (cc >> NVME_CC_SHN_SHIFT) & 0x3; + return (cc & NVME_CC_SHN_MASK) >> NVME_CC_SHN_SHIFT; } static inline u8 nvmet_cc_iosqes(u32 cc) { - return (cc >> NVME_CC_IOSQES_SHIFT) & 0xf; + return (cc & NVME_CC_IOSQES_MASK) >> NVME_CC_IOSQES_SHIFT; } static inline u8 nvmet_cc_iocqes(u32 cc) { - return (cc >> NVME_CC_IOCQES_SHIFT) & 0xf; + return (cc & NVME_CC_IOCQES_MASK) >> NVME_CC_IOCQES_SHIFT; } /* Convert a 32-bit number to a 16-bit 0's based number */ -- GitLab From eefa72a15ea03fd009333aaa9f0e360b2578e434 Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Thu, 13 Feb 2025 11:12:59 -0500 Subject: [PATCH 732/989] apple-nvme: Release power domains when probe fails Signed-off-by: Hector Martin Reviewed-by: Neal Gompa Reviewed-by: Sven Peter Signed-off-by: Alyssa Rosenzweig Signed-off-by: Keith Busch --- drivers/nvme/host/apple.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index 1de11b722f049..7995f0776bc06 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1516,6 +1516,7 @@ static struct apple_nvme *apple_nvme_alloc(struct platform_device *pdev) return anv; put_dev: + apple_nvme_detach_genpd(anv); put_device(anv->dev); return ERR_PTR(ret); } @@ -1549,6 +1550,7 @@ static int apple_nvme_probe(struct platform_device *pdev) nvme_uninit_ctrl(&anv->ctrl); out_put_ctrl: nvme_put_ctrl(&anv->ctrl); + apple_nvme_detach_genpd(anv); return ret; } -- GitLab From 3f22421f6a240b33ab8ffbf662bf0a8f336f405b Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Thu, 13 Feb 2025 11:12:58 -0500 Subject: [PATCH 733/989] apple-nvme: Support coprocessors left idle iBoot on at least some firmwares/machines leaves ANS2 running, requiring a wake command instead of a CPU boot (and if we reset ANS2 in that state, everything breaks). Only stop the CPU if RTKit was running, and only do the reset dance if the CPU is stopped. Normal shutdown handoff: - RTKit not yet running - CPU detected not running - Reset - CPU powerup - RTKit boot wait ANS2 left running/idle: - RTKit not yet running - CPU detected running - RTKit wake message Sleep/resume cycle: - RTKit shutdown - CPU stopped - (sleep here) - CPU detected not running - Reset - CPU powerup - RTKit boot wait Shutdown or device removal: - RTKit shutdown - CPU stopped Therefore, the CPU running bit serves as a consistent flag of whether the coprocessor is fully stopped or just idle. 
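Condensed into a rough sketch (the authoritative code is in the diff below; reset_and_boot() is only a placeholder for the assert/reinit/deassert/boot steps):

```
if (readl(anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL) &
    APPLE_ANS_COPROC_CPU_CONTROL_RUN) {
	/* ANS2 was left idle (e.g. by iBoot): just wake it. */
	ret = apple_rtkit_wake(anv->rtk);
} else {
	/* CPU is stopped, so the full reset is safe. */
	ret = reset_and_boot(anv);	/* placeholder for the real sequence */
}
```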
Signed-off-by: Hector Martin Reviewed-by: Neal Gompa Reviewed-by: Sven Peter Signed-off-by: Alyssa Rosenzweig Signed-off-by: Keith Busch --- drivers/nvme/host/apple.c | 53 ++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index 7995f0776bc06..a060f69558e76 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1011,25 +1011,37 @@ static void apple_nvme_reset_work(struct work_struct *work) ret = apple_rtkit_shutdown(anv->rtk); if (ret) goto out; + + writel(0, anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL); } - writel(0, anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL); + /* + * Only do the soft-reset if the CPU is not running, which means either we + * or the previous stage shut it down cleanly. + */ + if (!(readl(anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL) & + APPLE_ANS_COPROC_CPU_CONTROL_RUN)) { - ret = reset_control_assert(anv->reset); - if (ret) - goto out; + ret = reset_control_assert(anv->reset); + if (ret) + goto out; - ret = apple_rtkit_reinit(anv->rtk); - if (ret) - goto out; + ret = apple_rtkit_reinit(anv->rtk); + if (ret) + goto out; - ret = reset_control_deassert(anv->reset); - if (ret) - goto out; + ret = reset_control_deassert(anv->reset); + if (ret) + goto out; + + writel(APPLE_ANS_COPROC_CPU_CONTROL_RUN, + anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL); + + ret = apple_rtkit_boot(anv->rtk); + } else { + ret = apple_rtkit_wake(anv->rtk); + } - writel(APPLE_ANS_COPROC_CPU_CONTROL_RUN, - anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL); - ret = apple_rtkit_boot(anv->rtk); if (ret) { dev_err(anv->dev, "ANS did not boot"); goto out; @@ -1565,9 +1577,12 @@ static void apple_nvme_remove(struct platform_device *pdev) apple_nvme_disable(anv, true); nvme_uninit_ctrl(&anv->ctrl); - if (apple_rtkit_is_running(anv->rtk)) + if (apple_rtkit_is_running(anv->rtk)) { apple_rtkit_shutdown(anv->rtk); + writel(0, anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL); + } + apple_nvme_detach_genpd(anv); } @@ -1576,8 +1591,11 @@ static void apple_nvme_shutdown(struct platform_device *pdev) struct apple_nvme *anv = platform_get_drvdata(pdev); apple_nvme_disable(anv, true); - if (apple_rtkit_is_running(anv->rtk)) + if (apple_rtkit_is_running(anv->rtk)) { apple_rtkit_shutdown(anv->rtk); + + writel(0, anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL); + } } static int apple_nvme_resume(struct device *dev) @@ -1594,10 +1612,11 @@ static int apple_nvme_suspend(struct device *dev) apple_nvme_disable(anv, true); - if (apple_rtkit_is_running(anv->rtk)) + if (apple_rtkit_is_running(anv->rtk)) { ret = apple_rtkit_shutdown(anv->rtk); - writel(0, anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL); + writel(0, anv->mmio_coproc + APPLE_ANS_COPROC_CPU_CONTROL); + } return ret; } -- GitLab From ed83aff5a94e1d623c007159a6a7f1c3ef202c6c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 12 Feb 2025 12:01:15 +0100 Subject: [PATCH 734/989] s390: Update defconfigs Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/configs/debug_defconfig | 2 ++ arch/s390/configs/defconfig | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 44f01a4bc810f..80bdfbae6e5b4 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -469,6 +469,7 @@ CONFIG_SCSI_DH_ALUA=m CONFIG_MD=y CONFIG_BLK_DEV_MD=y # CONFIG_MD_BITMAP_FILE is not set +CONFIG_MD_LINEAR=m CONFIG_MD_CLUSTER=m 
CONFIG_BCACHE=m CONFIG_BLK_DEV_DM=y @@ -874,6 +875,7 @@ CONFIG_RCU_CPU_STALL_TIMEOUT=300 CONFIG_LATENCYTOP=y CONFIG_BOOTTIME_TRACING=y CONFIG_FUNCTION_GRAPH_RETVAL=y +CONFIG_FUNCTION_GRAPH_RETADDR=y CONFIG_FPROBE=y CONFIG_FUNCTION_PROFILER=y CONFIG_STACK_TRACER=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 8bcd37edd3c97..449a0e996b963 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -459,6 +459,7 @@ CONFIG_SCSI_DH_ALUA=m CONFIG_MD=y CONFIG_BLK_DEV_MD=y # CONFIG_MD_BITMAP_FILE is not set +CONFIG_MD_LINEAR=m CONFIG_MD_CLUSTER=m CONFIG_BCACHE=m CONFIG_BLK_DEV_DM=y @@ -825,6 +826,7 @@ CONFIG_RCU_CPU_STALL_TIMEOUT=60 CONFIG_LATENCYTOP=y CONFIG_BOOTTIME_TRACING=y CONFIG_FUNCTION_GRAPH_RETVAL=y +CONFIG_FUNCTION_GRAPH_RETADDR=y CONFIG_FPROBE=y CONFIG_FUNCTION_PROFILER=y CONFIG_STACK_TRACER=y -- GitLab From 173767c218cc1da74704e7863f165ac8a9796f3e Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 13 Feb 2025 21:16:14 +0000 Subject: [PATCH 735/989] s390/purgatory: Use -D__DISABLE_EXPORTS The object files in purgatory do not export symbols, so disable exports for all the object files, not only sha256.o, with -D__DISABLE_EXPORTS. This fixes a build failure with CONFIG_GENDWARFKSYMS, where we would otherwise attempt to calculate symbol versions for purgatory objects and fail because they're not built with debugging information: error: gendwarfksyms: process_module: dwarf_get_units failed: no debugging information? make[5]: *** [../scripts/Makefile.build:207: arch/s390/purgatory/string.o] Error 1 make[5]: *** Deleting file 'arch/s390/purgatory/string.o' Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202502120752.U3fOKScQ-lkp@intel.com/ Signed-off-by: Sami Tolvanen Link: https://lore.kernel.org/r/20250213211614.3537605-2-samitolvanen@google.com Acked-by: Vasily Gorbik Signed-off-by: Vasily Gorbik --- arch/s390/purgatory/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile index bdcf2a3b6c41b..bd39b36e7bd68 100644 --- a/arch/s390/purgatory/Makefile +++ b/arch/s390/purgatory/Makefile @@ -8,7 +8,7 @@ PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) $(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE $(call if_changed_rule,cc_o_c) -CFLAGS_sha256.o := -D__DISABLE_EXPORTS -D__NO_FORTIFY +CFLAGS_sha256.o := -D__NO_FORTIFY $(obj)/mem.o: $(srctree)/arch/s390/lib/mem.S FORCE $(call if_changed_rule,as_o_S) @@ -19,9 +19,11 @@ KBUILD_CFLAGS += -fno-zero-initialized-in-bss -fno-builtin -ffreestanding KBUILD_CFLAGS += -Os -m64 -msoft-float -fno-common KBUILD_CFLAGS += -fno-stack-protector KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING +KBUILD_CFLAGS += -D__DISABLE_EXPORTS KBUILD_CFLAGS += $(CLANG_FLAGS) KBUILD_CFLAGS += $(call cc-option,-fno-PIE) KBUILD_AFLAGS := $(filter-out -DCC_USING_EXPOLINE,$(KBUILD_AFLAGS)) +KBUILD_AFLAGS += -D__DISABLE_EXPORTS # Since we link purgatory with -r unresolved symbols are not checked, so we # also link a purgatory.chk binary without -r to check for unresolved symbols. -- GitLab From c3a589fd9fcbf295a7402a4b188dc9277d505f4f Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 18 Feb 2025 12:11:34 +0100 Subject: [PATCH 736/989] s390/boot: Fix ESSA detection The cmma_test_essa() inline assembly uses tmp as input and output, however tmp is specified as output only, which allows the compiler to optimize the initialization of tmp away. 
Therefore the ESSA detection may or may not work depending on previous contents of the register that the compiler selected for tmp. Fix this by using the correct constraint modifier. Fixes: 468a3bc2b7b9 ("s390/cmma: move parsing of cmma kernel parameter to early boot code") Cc: stable@vger.kernel.org Signed-off-by: Heiko Carstens Reviewed-by: Vasily Gorbik Signed-off-by: Vasily Gorbik --- arch/s390/boot/startup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 885bd1dd2c82f..9276e0576d0ab 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -86,7 +86,7 @@ static int cmma_test_essa(void) : [reg1] "=&d" (reg1), [reg2] "=&a" (reg2), [rc] "+&d" (rc), - [tmp] "=&d" (tmp), + [tmp] "+&d" (tmp), "+Q" (get_lowcore()->program_new_psw), "=Q" (old) : [psw_old] "a" (&old), -- GitLab From 293f324ce96d700112c726682b14094d1b54e09c Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Thu, 13 Feb 2025 13:06:21 -0800 Subject: [PATCH 737/989] tools: Unify top-level quiet infrastructure Commit f2868b1a66d4f40f ("perf tools: Expose quiet/verbose variables in Makefile.perf") moved the quiet infrastructure out of tools/build/Makefile.build and into the top-level Makefile.perf file so that the quiet infrastructure could be used throughout perf and not just in Makefile.build. Extract out the quiet infrastructure into Makefile.include so that it can be leveraged outside of perf. Fixes: f2868b1a66d4f40f ("perf tools: Expose quiet/verbose variables in Makefile.perf") Reviewed-by: Jiri Olsa Signed-off-by: Charlie Jenkins Acked-by: Andrii Nakryiko Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Benjamin Tissoires Cc: Daniel Borkmann Cc: Daniel Lezcano Cc: Eduard Zingerman Cc: Hao Luo Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Kosina Cc: John Fastabend Cc: Josh Poimboeuf Cc: KP Singh Cc: Lukasz Luba Cc: Mark Rutland Cc: Martin KaFai Lau Cc: Mykola Lysenko Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Quentin Monnet Cc: Rafael J. Wysocki Cc: Shuah Khan Cc: Song Liu Cc: Stanislav Fomichev Cc: Steven Rostedt (VMware) Cc: Yonghong Song Cc: Zhang Rui Link: https://lore.kernel.org/r/20250213-quiet_tools-v3-1-07de4482a581@rivosinc.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/Makefile | 8 +------ tools/perf/Makefile.perf | 41 ---------------------------------- tools/scripts/Makefile.include | 30 +++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 48 deletions(-) diff --git a/tools/build/Makefile b/tools/build/Makefile index 18ad131f6ea74..63ef218787616 100644 --- a/tools/build/Makefile +++ b/tools/build/Makefile @@ -17,13 +17,7 @@ $(call allow-override,LD,$(CROSS_COMPILE)ld) export HOSTCC HOSTLD HOSTAR -ifeq ($(V),1) - Q = -else - Q = @ -endif - -export Q srctree CC LD +export srctree CC LD MAKEFLAGS := --no-print-directory build := -f $(srctree)/tools/build/Makefile.build dir=. obj diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 55d6ce9ea52fb..05c083bb11220 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -161,47 +161,6 @@ export VPATH SOURCE := $(shell ln -sf $(srctree)/tools/perf $(OUTPUT)/source) endif -# Beautify output -# --------------------------------------------------------------------------- -# -# Most of build commands in Kbuild start with "cmd_". You can optionally define -# "quiet_cmd_*". If defined, the short log is printed. Otherwise, no log from -# that command is printed by default. -# -# e.g.) 
-# quiet_cmd_depmod = DEPMOD $(MODLIB) -# cmd_depmod = $(srctree)/scripts/depmod.sh $(DEPMOD) $(KERNELRELEASE) -# -# A simple variant is to prefix commands with $(Q) - that's useful -# for commands that shall be hidden in non-verbose mode. -# -# $(Q)$(MAKE) $(build)=scripts/basic -# -# To put more focus on warnings, be less verbose as default -# Use 'make V=1' to see the full commands - -ifeq ($(V),1) - quiet = - Q = -else - quiet=quiet_ - Q=@ -endif - -# If the user is running make -s (silent mode), suppress echoing of commands -# make-4.0 (and later) keep single letter options in the 1st word of MAKEFLAGS. -ifeq ($(filter 3.%,$(MAKE_VERSION)),) -short-opts := $(firstword -$(MAKEFLAGS)) -else -short-opts := $(filter-out --%,$(MAKEFLAGS)) -endif - -ifneq ($(findstring s,$(short-opts)),) - quiet=silent_ -endif - -export quiet Q - # Do not use make's built-in rules # (this improves performance and avoids hard-to-debug behaviour); MAKEFLAGS += -r diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include index 0aa4005017c72..45f4abef70640 100644 --- a/tools/scripts/Makefile.include +++ b/tools/scripts/Makefile.include @@ -136,6 +136,33 @@ else NO_SUBDIR = : endif +# Beautify output +# --------------------------------------------------------------------------- +# +# Most of build commands in Kbuild start with "cmd_". You can optionally define +# "quiet_cmd_*". If defined, the short log is printed. Otherwise, no log from +# that command is printed by default. +# +# e.g.) +# quiet_cmd_depmod = DEPMOD $(MODLIB) +# cmd_depmod = $(srctree)/scripts/depmod.sh $(DEPMOD) $(KERNELRELEASE) +# +# A simple variant is to prefix commands with $(Q) - that's useful +# for commands that shall be hidden in non-verbose mode. +# +# $(Q)$(MAKE) $(build)=scripts/basic +# +# To put more focus on warnings, be less verbose as default +# Use 'make V=1' to see the full commands + +ifeq ($(V),1) + quiet = + Q = +else + quiet = quiet_ + Q = @ +endif + # If the user is running make -s (silent mode), suppress echoing of commands # make-4.0 (and later) keep single letter options in the 1st word of MAKEFLAGS. ifeq ($(filter 3.%,$(MAKE_VERSION)),) @@ -146,8 +173,11 @@ endif ifneq ($(findstring s,$(short-opts)),) silent=1 + quiet=silent_ endif +export quiet Q + # # Define a callable command for descending to a new directory # -- GitLab From d403120cb9d4787b283ea202b2162f459d18fe9d Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Wed, 12 Feb 2025 14:30:58 -0500 Subject: [PATCH 738/989] ACPI: platform_profile: Fix memory leak in profile_class_is_visible() If class_find_device() finds a device, it's reference count is incremented. Call put_device() to drop this reference before returning. Fixes: 77be5cacb2c2 ("ACPI: platform_profile: Create class for ACPI platform profile") Signed-off-by: Kurt Borja Reviewed-by: Mark Pearson Link: https://patch.msgid.link/20250212193058.32110-1-kuurtb@gmail.com Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/platform_profile.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/platform_profile.c b/drivers/acpi/platform_profile.c index fc92e43d0fe93..2ad53cc6aae53 100644 --- a/drivers/acpi/platform_profile.c +++ b/drivers/acpi/platform_profile.c @@ -417,8 +417,14 @@ static int profile_class_registered(struct device *dev, const void *data) static umode_t profile_class_is_visible(struct kobject *kobj, struct attribute *attr, int idx) { - if (!class_find_device(&platform_profile_class, NULL, NULL, profile_class_registered)) + struct device *dev; + + dev = class_find_device(&platform_profile_class, NULL, NULL, profile_class_registered); + if (!dev) return 0; + + put_device(dev); + return attr->mode; } -- GitLab From 643f209ba3fdd4099416aaf9efa8266f7366d6fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Roberto=20de=20Souza?= Date: Thu, 23 Jan 2025 12:22:04 -0800 Subject: [PATCH 739/989] drm/xe: Make GUC binaries dump consistent with other binaries in devcoredump MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All other(hwsp, hwctx and vmas) binaries follow this format: [name].length: 0x1000 [name].data: xxxxxxx [name].error: errno The error one is just in case by some reason it was not able to capture the binary. So this GuC binaries should follow the same patern. v2: - renamed GUC binary to LOG Cc: John Harrison Cc: Lucas De Marchi Reviewed-by: Lucas De Marchi Signed-off-by: José Roberto de Souza Link: https://patchwork.freedesktop.org/patch/msgid/20250123202307.95103-3-jose.souza@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit cb1f868ca13756c0c18ba54d1591332476760d07) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_guc_ct.c | 6 ++++-- drivers/gpu/drm/xe/xe_guc_log.c | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c index 50c8076b51585..497036675a38c 100644 --- a/drivers/gpu/drm/xe/xe_guc_ct.c +++ b/drivers/gpu/drm/xe/xe_guc_ct.c @@ -1723,9 +1723,11 @@ void xe_guc_ct_snapshot_print(struct xe_guc_ct_snapshot *snapshot, drm_printf(p, "\tg2h outstanding: %d\n", snapshot->g2h_outstanding); - if (snapshot->ctb) - xe_print_blob_ascii85(p, "CTB data", '\n', + if (snapshot->ctb) { + drm_printf(p, "[CTB].length: 0x%lx\n", snapshot->ctb_size); + xe_print_blob_ascii85(p, "[CTB].data", '\n', snapshot->ctb, 0, snapshot->ctb_size); + } } else { drm_puts(p, "CT disabled\n"); } diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c index 2baa4d95571fb..2457572ed86ad 100644 --- a/drivers/gpu/drm/xe/xe_guc_log.c +++ b/drivers/gpu/drm/xe/xe_guc_log.c @@ -208,10 +208,11 @@ void xe_guc_log_snapshot_print(struct xe_guc_log_snapshot *snapshot, struct drm_ drm_printf(p, "GuC timestamp: 0x%08llX [%llu]\n", snapshot->stamp, snapshot->stamp); drm_printf(p, "Log level: %u\n", snapshot->level); + drm_printf(p, "[LOG].length: 0x%lx\n", snapshot->size); remain = snapshot->size; for (i = 0; i < snapshot->num_chunks; i++) { size_t size = min(GUC_LOG_CHUNK_SIZE, remain); - const char *prefix = i ? NULL : "Log data"; + const char *prefix = i ? NULL : "[LOG].data"; char suffix = i == snapshot->num_chunks - 1 ? 
'\n' : 0; xe_print_blob_ascii85(p, prefix, suffix, snapshot->copy[i], 0, size); -- GitLab From 42367eca7604e16e170bd6bd94ef61ffdd335f4a Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Thu, 13 Feb 2025 13:06:22 -0800 Subject: [PATCH 740/989] tools: Remove redundant quiet setup Q is exported from Makefile.include so it is not necessary to manually set it. Reviewed-by: Jiri Olsa Signed-off-by: Charlie Jenkins Acked-by: Andrii Nakryiko Acked-by: Quentin Monnet Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Benjamin Tissoires Cc: Daniel Borkmann Cc: Daniel Lezcano Cc: Eduard Zingerman Cc: Hao Luo Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Kosina Cc: John Fastabend Cc: Josh Poimboeuf Cc: KP Singh Cc: Lukasz Luba Cc: Mark Rutland Cc: Martin KaFai Lau Cc: Mykola Lysenko Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Shuah Khan Cc: Song Liu Cc: Stanislav Fomichev Cc: Steven Rostedt (VMware) Cc: Yonghong Song Cc: Zhang Rui Link: https://lore.kernel.org/r/20250213-quiet_tools-v3-2-07de4482a581@rivosinc.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/arm64/tools/Makefile | 6 ------ tools/bpf/Makefile | 6 ------ tools/bpf/bpftool/Documentation/Makefile | 6 ------ tools/bpf/bpftool/Makefile | 6 ------ tools/bpf/resolve_btfids/Makefile | 2 -- tools/bpf/runqslower/Makefile | 5 +---- tools/lib/bpf/Makefile | 13 ------------- tools/lib/perf/Makefile | 13 ------------- tools/lib/thermal/Makefile | 13 ------------- tools/objtool/Makefile | 6 ------ tools/testing/selftests/bpf/Makefile.docs | 6 ------ tools/testing/selftests/hid/Makefile | 2 -- tools/thermal/lib/Makefile | 13 ------------- tools/tracing/latency/Makefile | 6 ------ tools/tracing/rtla/Makefile | 6 ------ tools/verification/rv/Makefile | 6 ------ 16 files changed, 1 insertion(+), 114 deletions(-) diff --git a/tools/arch/arm64/tools/Makefile b/tools/arch/arm64/tools/Makefile index 7b42feedf6471..de4f1b66ef014 100644 --- a/tools/arch/arm64/tools/Makefile +++ b/tools/arch/arm64/tools/Makefile @@ -13,12 +13,6 @@ AWK ?= awk MKDIR ?= mkdir RM ?= rm -ifeq ($(V),1) -Q = -else -Q = @ -endif - arm64_tools_dir = $(top_srcdir)/arch/arm64/tools arm64_sysreg_tbl = $(arm64_tools_dir)/sysreg arm64_gen_sysreg = $(arm64_tools_dir)/gen-sysreg.awk diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile index 243b79f2b451e..062bbd6cd048e 100644 --- a/tools/bpf/Makefile +++ b/tools/bpf/Makefile @@ -27,12 +27,6 @@ srctree := $(patsubst %/,%,$(dir $(CURDIR))) srctree := $(patsubst %/,%,$(dir $(srctree))) endif -ifeq ($(V),1) - Q = -else - Q = @ -endif - FEATURE_USER = .bpf FEATURE_TESTS = libbfd disassembler-four-args disassembler-init-styled FEATURE_DISPLAY = libbfd diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile index 4315652678b9f..bf843f328812e 100644 --- a/tools/bpf/bpftool/Documentation/Makefile +++ b/tools/bpf/bpftool/Documentation/Makefile @@ -5,12 +5,6 @@ INSTALL ?= install RM ?= rm -f RMDIR ?= rmdir --ignore-fail-on-non-empty -ifeq ($(V),1) - Q = -else - Q = @ -endif - prefix ?= /usr/local mandir ?= $(prefix)/man man8dir = $(mandir)/man8 diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index dd9f3ec842017..6ea4823b770cb 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -7,12 +7,6 @@ srctree := $(patsubst %/,%,$(dir $(srctree))) srctree := $(patsubst %/,%,$(dir $(srctree))) endif -ifeq ($(V),1) - Q = -else - Q = @ -endif - BPF_DIR = $(srctree)/tools/lib/bpf ifneq ($(OUTPUT),) diff --git a/tools/bpf/resolve_btfids/Makefile 
b/tools/bpf/resolve_btfids/Makefile index 4b8079f294f65..afbddea3a39c6 100644 --- a/tools/bpf/resolve_btfids/Makefile +++ b/tools/bpf/resolve_btfids/Makefile @@ -5,10 +5,8 @@ include ../../scripts/Makefile.arch srctree := $(abspath $(CURDIR)/../../../) ifeq ($(V),1) - Q = msg = else - Q = @ ifeq ($(silent),1) msg = else diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile index c4f1f1735af65..e49203ebd48c1 100644 --- a/tools/bpf/runqslower/Makefile +++ b/tools/bpf/runqslower/Makefile @@ -26,10 +26,7 @@ VMLINUX_BTF_PATHS := $(if $(O),$(O)/vmlinux) \ VMLINUX_BTF_PATH := $(or $(VMLINUX_BTF),$(firstword \ $(wildcard $(VMLINUX_BTF_PATHS)))) -ifeq ($(V),1) -Q = -else -Q = @ +ifneq ($(V),1) MAKEFLAGS += --no-print-directory submake_extras := feature_display=0 endif diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 857a5f7b413d6..168140f8e6461 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -53,13 +53,6 @@ include $(srctree)/tools/scripts/Makefile.include # copy a bit from Linux kbuild -ifeq ("$(origin V)", "command line") - VERBOSE = $(V) -endif -ifndef VERBOSE - VERBOSE = 0 -endif - INCLUDES = -I$(or $(OUTPUT),.) \ -I$(srctree)/tools/include -I$(srctree)/tools/include/uapi \ -I$(srctree)/tools/arch/$(SRCARCH)/include @@ -96,12 +89,6 @@ override CFLAGS += $(CLANG_CROSS_FLAGS) # flags specific for shared library SHLIB_FLAGS := -DSHARED -fPIC -ifeq ($(VERBOSE),1) - Q = -else - Q = @ -endif - # Disable command line variables (CFLAGS) override from top # level Makefile (perf), otherwise build Makefile will get # the same command line setup. diff --git a/tools/lib/perf/Makefile b/tools/lib/perf/Makefile index 3a9b2140aa048..e9a7ac2c062e2 100644 --- a/tools/lib/perf/Makefile +++ b/tools/lib/perf/Makefile @@ -39,19 +39,6 @@ libdir = $(prefix)/$(libdir_relative) libdir_SQ = $(subst ','\'',$(libdir)) libdir_relative_SQ = $(subst ','\'',$(libdir_relative)) -ifeq ("$(origin V)", "command line") - VERBOSE = $(V) -endif -ifndef VERBOSE - VERBOSE = 0 -endif - -ifeq ($(VERBOSE),1) - Q = -else - Q = @ -endif - TEST_ARGS := $(if $(V),-v) # Set compile option CFLAGS diff --git a/tools/lib/thermal/Makefile b/tools/lib/thermal/Makefile index 8890fd57b110c..a1f5e388644d3 100644 --- a/tools/lib/thermal/Makefile +++ b/tools/lib/thermal/Makefile @@ -39,19 +39,6 @@ libdir = $(prefix)/$(libdir_relative) libdir_SQ = $(subst ','\'',$(libdir)) libdir_relative_SQ = $(subst ','\'',$(libdir_relative)) -ifeq ("$(origin V)", "command line") - VERBOSE = $(V) -endif -ifndef VERBOSE - VERBOSE = 0 -endif - -ifeq ($(VERBOSE),1) - Q = -else - Q = @ -endif - # Set compile option CFLAGS ifdef EXTRA_CFLAGS CFLAGS := $(EXTRA_CFLAGS) diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index f56e277275341..7a65948892e56 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -46,12 +46,6 @@ HOST_OVERRIDES := CC="$(HOSTCC)" LD="$(HOSTLD)" AR="$(HOSTAR)" AWK = awk MKDIR = mkdir -ifeq ($(V),1) - Q = -else - Q = @ -endif - BUILD_ORC := n ifeq ($(SRCARCH),x86) diff --git a/tools/testing/selftests/bpf/Makefile.docs b/tools/testing/selftests/bpf/Makefile.docs index eb6a4fea8c794..f7f9e7088bb38 100644 --- a/tools/testing/selftests/bpf/Makefile.docs +++ b/tools/testing/selftests/bpf/Makefile.docs @@ -7,12 +7,6 @@ INSTALL ?= install RM ?= rm -f RMDIR ?= rmdir --ignore-fail-on-non-empty -ifeq ($(V),1) - Q = -else - Q = @ -endif - prefix ?= /usr/local mandir ?= $(prefix)/man man2dir = $(mandir)/man2 diff --git a/tools/testing/selftests/hid/Makefile 
b/tools/testing/selftests/hid/Makefile index 0336353bd15f0..2839d2612ce3a 100644 --- a/tools/testing/selftests/hid/Makefile +++ b/tools/testing/selftests/hid/Makefile @@ -43,10 +43,8 @@ TEST_GEN_PROGS = hid_bpf hidraw # $3 - target (assumed to be file); only file name will be emitted; # $4 - optional extra arg, emitted as-is, if provided. ifeq ($(V),1) -Q = msg = else -Q = @ msg = @printf ' %-8s%s %s%s\n' "$(1)" "$(if $(2), [$(2)])" "$(notdir $(3))" "$(if $(4), $(4))"; MAKEFLAGS += --no-print-directory submake_extras := feature_display=0 diff --git a/tools/thermal/lib/Makefile b/tools/thermal/lib/Makefile index f2552f73a64c7..056d212f25cf5 100644 --- a/tools/thermal/lib/Makefile +++ b/tools/thermal/lib/Makefile @@ -39,19 +39,6 @@ libdir = $(prefix)/$(libdir_relative) libdir_SQ = $(subst ','\'',$(libdir)) libdir_relative_SQ = $(subst ','\'',$(libdir_relative)) -ifeq ("$(origin V)", "command line") - VERBOSE = $(V) -endif -ifndef VERBOSE - VERBOSE = 0 -endif - -ifeq ($(VERBOSE),1) - Q = -else - Q = @ -endif - # Set compile option CFLAGS ifdef EXTRA_CFLAGS CFLAGS := $(EXTRA_CFLAGS) diff --git a/tools/tracing/latency/Makefile b/tools/tracing/latency/Makefile index 6518b03e05c71..257a56b1899f2 100644 --- a/tools/tracing/latency/Makefile +++ b/tools/tracing/latency/Makefile @@ -37,12 +37,6 @@ FEATURE_TESTS += libtracefs FEATURE_DISPLAY := libtraceevent FEATURE_DISPLAY += libtracefs -ifeq ($(V),1) - Q = -else - Q = @ -endif - all: $(LATENCY-COLLECTOR) include $(srctree)/tools/build/Makefile.include diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile index a6a7dee16622d..a1727c414e44b 100644 --- a/tools/tracing/rtla/Makefile +++ b/tools/tracing/rtla/Makefile @@ -37,12 +37,6 @@ FEATURE_DISPLAY := libtraceevent FEATURE_DISPLAY += libtracefs FEATURE_DISPLAY += libcpupower -ifeq ($(V),1) - Q = -else - Q = @ -endif - all: $(RTLA) include $(srctree)/tools/build/Makefile.include diff --git a/tools/verification/rv/Makefile b/tools/verification/rv/Makefile index 411d62b3d8eb9..5b898360ba481 100644 --- a/tools/verification/rv/Makefile +++ b/tools/verification/rv/Makefile @@ -35,12 +35,6 @@ FEATURE_TESTS += libtracefs FEATURE_DISPLAY := libtraceevent FEATURE_DISPLAY += libtracefs -ifeq ($(V),1) - Q = -else - Q = @ -endif - all: $(RV) include $(srctree)/tools/build/Makefile.include -- GitLab From 213e24250feed3bcf58d7594298df2d7e78a88ab Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Tue, 28 Jan 2025 07:42:42 -0800 Subject: [PATCH 741/989] drm/xe/guc: Fix size_t print format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use %zx format to print size_t to remove the following warning when building for i386: >> drivers/gpu/drm/xe/xe_guc_ct.c:1727:43: warning: format specifies type 'unsigned long' but the argument has type 'size_t' (aka 'unsigned int') [-Wformat] 1727 | drm_printf(p, "[CTB].length: 0x%lx\n", snapshot->ctb_size); | ~~~ ^~~~~~~~~~~~~~~~~~ | %zx Cc: José Roberto de Souza Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501281627.H6nj184e-lkp@intel.com/ Fixes: 643f209ba3fd ("drm/xe: Make GUC binaries dump consistent with other binaries in devcoredump") Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20250128154242.3371687-1-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 7748289df510638ba61fed86b59ce7d2fb4a194c) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_guc_ct.c | 2 +- drivers/gpu/drm/xe/xe_guc_log.c | 2 +- 2 files changed, 2 
insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c index 497036675a38c..72ad576fc18eb 100644 --- a/drivers/gpu/drm/xe/xe_guc_ct.c +++ b/drivers/gpu/drm/xe/xe_guc_ct.c @@ -1724,7 +1724,7 @@ void xe_guc_ct_snapshot_print(struct xe_guc_ct_snapshot *snapshot, snapshot->g2h_outstanding); if (snapshot->ctb) { - drm_printf(p, "[CTB].length: 0x%lx\n", snapshot->ctb_size); + drm_printf(p, "[CTB].length: 0x%zx\n", snapshot->ctb_size); xe_print_blob_ascii85(p, "[CTB].data", '\n', snapshot->ctb, 0, snapshot->ctb_size); } diff --git a/drivers/gpu/drm/xe/xe_guc_log.c b/drivers/gpu/drm/xe/xe_guc_log.c index 2457572ed86ad..0ca3056d8bd3f 100644 --- a/drivers/gpu/drm/xe/xe_guc_log.c +++ b/drivers/gpu/drm/xe/xe_guc_log.c @@ -208,7 +208,7 @@ void xe_guc_log_snapshot_print(struct xe_guc_log_snapshot *snapshot, struct drm_ drm_printf(p, "GuC timestamp: 0x%08llX [%llu]\n", snapshot->stamp, snapshot->stamp); drm_printf(p, "Log level: %u\n", snapshot->level); - drm_printf(p, "[LOG].length: 0x%lx\n", snapshot->size); + drm_printf(p, "[LOG].length: 0x%zx\n", snapshot->size); remain = snapshot->size; for (i = 0; i < snapshot->num_chunks; i++) { size_t size = min(GUC_LOG_CHUNK_SIZE, remain); -- GitLab From 1fc61eeefe10d9996d2b875214d89f0909d03417 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 18 Feb 2025 16:47:40 -0700 Subject: [PATCH 742/989] io_uring: fix spelling error in uapi io_uring.h This is obviously not that important, but when changes are synced back from the kernel to liburing, the codespell CI ends up erroring because of this misspelling. Let's just correct it and avoid this biting us again on an import. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index e11c826385277..050fa8eb2e8f8 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -380,7 +380,7 @@ enum io_uring_op { * result will be the number of buffers send, with * the starting buffer ID in cqe->flags as per * usual for provided buffer usage. The buffers - * will be contigious from the starting buffer ID. + * will be contiguous from the starting buffer ID. */ #define IORING_RECVSEND_POLL_FIRST (1U << 0) #define IORING_RECV_MULTISHOT (1U << 1) -- GitLab From f5da7c45188eea71394bf445655cae2df88a7788 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 17 Feb 2025 15:29:05 -0800 Subject: [PATCH 743/989] tcp: adjust rcvq_space after updating scaling ratio Since commit under Fixes we set the window clamp in accordance to newly measured rcvbuf scaling_ratio. If the scaling_ratio decreased significantly we may put ourselves in a situation where windows become smaller than rcvq_space, preventing tcp_rcv_space_adjust() from increasing rcvbuf. The significant decrease of scaling_ratio is far more likely since commit 697a6c8cec03 ("tcp: increase the default TCP scaling ratio"), which increased the "default" scaling ratio from ~30% to 50%. Hitting the bad condition depends a lot on TCP tuning, and drivers at play. One of Meta's workloads hits it reliably under following conditions: - default rcvbuf of 125k - sender MTU 1500, receiver MTU 5000 - driver settles on scaling_ratio of 78 for the config above. Initial rcvq_space gets calculated as TCP_INIT_CWND * tp->advmss (10 * 5k = 50k). Once we find out the true scaling ratio and MSS we clamp the windows to 38k. 
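Plugging the numbers from this workload into the usual tcp_win_from_space() scaling (space * scaling_ratio / 256, stated here as an assumption for the sketch) makes the trap visible:

```
/* Illustrative arithmetic only, using the values quoted above. */
int rcvq_space = 10 * 5000;		/* TCP_INIT_CWND * advmss      = 50000 */
int win_clamp  = 125000 * 78 / 256;	/* rcvbuf * scaling_ratio >> 8 = 38085 */

/*
 * win_clamp (~38k) is now smaller than rcvq_space (50k), so
 * tcp_rcv_space_adjust() never measures enough space to grow
 * sk_rcvbuf; the fix also clamps rcvq_space down to the new window.
 */
```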
Triggering the condition also depends on the message sequence of this workload. I can't repro the problem with simple iperf or TCP_RR-style tests. Fixes: a2cbb1603943 ("tcp: Update window clamping condition") Reviewed-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20250217232905.3162187-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eb82e01da9110..98b8cc7403920 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -243,9 +243,15 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) do_div(val, skb->truesize); tcp_sk(sk)->scaling_ratio = val ? val : 1; - if (old_ratio != tcp_sk(sk)->scaling_ratio) - WRITE_ONCE(tcp_sk(sk)->window_clamp, - tcp_win_from_space(sk, sk->sk_rcvbuf)); + if (old_ratio != tcp_sk(sk)->scaling_ratio) { + struct tcp_sock *tp = tcp_sk(sk); + + val = tcp_win_from_space(sk, sk->sk_rcvbuf); + tcp_set_window_clamp(sk, val); + + if (tp->window_clamp < tp->rcvq_space.space) + tp->rcvq_space.space = tp->window_clamp; + } } icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, tcp_sk(sk)->advmss); -- GitLab From 5644c6b50ffee0a56c1e01430a8c88e34decb120 Mon Sep 17 00:00:00 2001 From: Yan Zhai Date: Sun, 9 Feb 2025 23:22:35 -0800 Subject: [PATCH 744/989] bpf: skip non exist keys in generic_map_lookup_batch The generic_map_lookup_batch currently returns EINTR if it fails with ENOENT and retries several times on bpf_map_copy_value. The next batch would start from the same location, presuming it's a transient issue. This is incorrect if a map can actually have "holes", i.e. "get_next_key" can return a key that does not point to a valid value. At least the array of maps type may contain such holes legitly. Right now these holes show up, generic batch lookup cannot proceed any more. It will always fail with EINTR errors. Rather, do not retry in generic_map_lookup_batch. If it finds a non existing element, skip to the next key. This simple solution comes with a price that transient errors may not be recovered, and the iteration might cycle back to the first key under parallel deletion. For example, Hou Tao pointed out a following scenario: For LPM trie map: (1) ->map_get_next_key(map, prev_key, key) returns a valid key (2) bpf_map_copy_value() return -ENOMENT It means the key must be deleted concurrently. (3) goto next_key It swaps the prev_key and key (4) ->map_get_next_key(map, prev_key, key) again prev_key points to a non-existing key, for LPM trie it will treat just like prev_key=NULL case, the returned key will be duplicated. With the retry logic, the iteration can continue to the key next to the deleted one. But if we directly skip to the next key, the iteration loop would restart from the first key for the lpm_trie type. However, not all races may be recovered. For example, if current key is deleted after instead of before bpf_map_copy_value, or if the prev_key also gets deleted, then the loop will still restart from the first key for lpm_tire anyway. For generic lookup it might be better to stay simple, i.e. just skip to the next key. To guarantee that the output keys are not duplicated, it is better to implement map type specific batch operations, which can properly lock the trie and synchronize with concurrent mutators. 
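For illustration, a minimal user-space sketch of the "holes" case (assumptions: libbpf's bpf_map_create()/bpf_map_update_elem()/bpf_map_lookup_batch() APIs; error handling omitted; this is not part of the patch):

	#include <bpf/bpf.h>

	/* Populate only slots 0 and 2 of a 4-entry array of maps, leaving
	 * legitimate holes that the generic batch lookup now skips over. */
	static int batch_walk_with_holes(void)
	{
		LIBBPF_OPTS(bpf_map_create_opts, opts);
		__u32 keys[4], vals[4], next, count = 4, key, fdval;
		int inner_fd, outer_fd;

		inner_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "inner",
					  sizeof(__u32), sizeof(__u32), 1, NULL);
		opts.inner_map_fd = inner_fd;
		outer_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY_OF_MAPS, "outer",
					  sizeof(__u32), sizeof(__u32), 4, &opts);

		fdval = inner_fd;
		key = 0;
		bpf_map_update_elem(outer_fd, &key, &fdval, BPF_ANY);
		key = 2;
		bpf_map_update_elem(outer_fd, &key, &fdval, BPF_ANY);

		/* With the fix this reports the existing entries (count is
		 * updated, -ENOENT marks the end of the map) instead of
		 * failing the whole walk with -EINTR. */
		return bpf_map_lookup_batch(outer_fd, NULL, &next,
					    keys, vals, &count, NULL);
	}
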
Fixes: cb4d03ab499d ("bpf: Add generic support for lookup batch op") Closes: https://lore.kernel.org/bpf/Z6JXtA1M5jAZx8xD@debian.debian/ Signed-off-by: Yan Zhai Acked-by: Hou Tao Link: https://lore.kernel.org/r/85618439eea75930630685c467ccefeac0942e2b.1739171594.git.yan@cloudflare.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 14d6e99459d32..548ec1c46b787 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1977,8 +1977,6 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file, return err; } -#define MAP_LOOKUP_RETRIES 3 - int generic_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -1988,8 +1986,8 @@ int generic_map_lookup_batch(struct bpf_map *map, void __user *values = u64_to_user_ptr(attr->batch.values); void __user *keys = u64_to_user_ptr(attr->batch.keys); void *buf, *buf_prevkey, *prev_key, *key, *value; - int err, retry = MAP_LOOKUP_RETRIES; u32 value_size, cp, max_count; + int err; if (attr->batch.elem_flags & ~BPF_F_LOCK) return -EINVAL; @@ -2035,14 +2033,8 @@ int generic_map_lookup_batch(struct bpf_map *map, err = bpf_map_copy_value(map, key, value, attr->batch.elem_flags); - if (err == -ENOENT) { - if (retry) { - retry--; - continue; - } - err = -EINTR; - break; - } + if (err == -ENOENT) + goto next_key; if (err) goto free_buf; @@ -2057,12 +2049,12 @@ int generic_map_lookup_batch(struct bpf_map *map, goto free_buf; } + cp++; +next_key: if (!prev_key) prev_key = buf_prevkey; swap(prev_key, key); - retry = MAP_LOOKUP_RETRIES; - cp++; cond_resched(); } -- GitLab From d66b7739176d513b81db8b18e8677e30f1b67574 Mon Sep 17 00:00:00 2001 From: Yan Zhai Date: Sun, 9 Feb 2025 23:22:39 -0800 Subject: [PATCH 745/989] selftests: bpf: test batch lookup on array of maps with holes Iterating through array of maps may encounter non existing keys. The batch operation should not fail on when this happens. 
Signed-off-by: Yan Zhai Acked-by: Hou Tao Link: https://lore.kernel.org/r/9007237b9606dc2ee44465a4447fe46e13f3bea6.1739171594.git.yan@cloudflare.com Signed-off-by: Alexei Starovoitov --- .../bpf/map_tests/map_in_map_batch_ops.c | 62 +++++++++++++------ 1 file changed, 44 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/bpf/map_tests/map_in_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/map_in_map_batch_ops.c index 66191ae9863c1..79c3ccadb9622 100644 --- a/tools/testing/selftests/bpf/map_tests/map_in_map_batch_ops.c +++ b/tools/testing/selftests/bpf/map_tests/map_in_map_batch_ops.c @@ -120,11 +120,12 @@ static void validate_fetch_results(int outer_map_fd, static void fetch_and_validate(int outer_map_fd, struct bpf_map_batch_opts *opts, - __u32 batch_size, bool delete_entries) + __u32 batch_size, bool delete_entries, + bool has_holes) { - __u32 *fetched_keys, *fetched_values, total_fetched = 0; - __u32 batch_key = 0, fetch_count, step_size; - int err, max_entries = OUTER_MAP_ENTRIES; + int err, max_entries = OUTER_MAP_ENTRIES - !!has_holes; + __u32 *fetched_keys, *fetched_values, total_fetched = 0, i; + __u32 batch_key = 0, fetch_count, step_size = batch_size; __u32 value_size = sizeof(__u32); /* Total entries needs to be fetched */ @@ -134,9 +135,8 @@ static void fetch_and_validate(int outer_map_fd, "Memory allocation failed for fetched_keys or fetched_values", "error=%s\n", strerror(errno)); - for (step_size = batch_size; - step_size <= max_entries; - step_size += batch_size) { + /* hash map may not always return full batch */ + for (i = 0; i < OUTER_MAP_ENTRIES; i++) { fetch_count = step_size; err = delete_entries ? bpf_map_lookup_and_delete_batch(outer_map_fd, @@ -155,6 +155,7 @@ static void fetch_and_validate(int outer_map_fd, if (err && errno == ENOSPC) { /* Fetch again with higher batch size */ total_fetched = 0; + step_size += batch_size; continue; } @@ -184,18 +185,19 @@ static void fetch_and_validate(int outer_map_fd, } static void _map_in_map_batch_ops(enum bpf_map_type outer_map_type, - enum bpf_map_type inner_map_type) + enum bpf_map_type inner_map_type, + bool has_holes) { + __u32 max_entries = OUTER_MAP_ENTRIES - !!has_holes; __u32 *outer_map_keys, *inner_map_fds; - __u32 max_entries = OUTER_MAP_ENTRIES; LIBBPF_OPTS(bpf_map_batch_opts, opts); __u32 value_size = sizeof(__u32); int batch_size[2] = {5, 10}; __u32 map_index, op_index; int outer_map_fd, ret; - outer_map_keys = calloc(max_entries, value_size); - inner_map_fds = calloc(max_entries, value_size); + outer_map_keys = calloc(OUTER_MAP_ENTRIES, value_size); + inner_map_fds = calloc(OUTER_MAP_ENTRIES, value_size); CHECK((!outer_map_keys || !inner_map_fds), "Memory allocation failed for outer_map_keys or inner_map_fds", "error=%s\n", strerror(errno)); @@ -209,6 +211,24 @@ static void _map_in_map_batch_ops(enum bpf_map_type outer_map_type, ((outer_map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) ? 9 : 1000) - map_index; + /* This condition is only meaningful for array of maps. + * + * max_entries == OUTER_MAP_ENTRIES - 1 if it is true. Say + * max_entries is short for n, then outer_map_keys looks like: + * + * [n, n-1, ... 2, 1] + * + * We change it to + * + * [n, n-1, ... 2, 0] + * + * So it will leave key 1 as a hole. It will serve to test the + * correctness when batch on an array: a "non-exist" key might be + * actually allocated and returned from key iteration. 
+ */ + if (has_holes) + outer_map_keys[max_entries - 1]--; + /* batch operation - map_update */ ret = bpf_map_update_batch(outer_map_fd, outer_map_keys, inner_map_fds, &max_entries, &opts); @@ -219,15 +239,17 @@ static void _map_in_map_batch_ops(enum bpf_map_type outer_map_type, /* batch operation - map_lookup */ for (op_index = 0; op_index < 2; ++op_index) fetch_and_validate(outer_map_fd, &opts, - batch_size[op_index], false); + batch_size[op_index], false, + has_holes); /* batch operation - map_lookup_delete */ if (outer_map_type == BPF_MAP_TYPE_HASH_OF_MAPS) fetch_and_validate(outer_map_fd, &opts, - max_entries, true /*delete*/); + max_entries, true /*delete*/, + has_holes); /* close all map fds */ - for (map_index = 0; map_index < max_entries; map_index++) + for (map_index = 0; map_index < OUTER_MAP_ENTRIES; map_index++) close(inner_map_fds[map_index]); close(outer_map_fd); @@ -237,16 +259,20 @@ static void _map_in_map_batch_ops(enum bpf_map_type outer_map_type, void test_map_in_map_batch_ops_array(void) { - _map_in_map_batch_ops(BPF_MAP_TYPE_ARRAY_OF_MAPS, BPF_MAP_TYPE_ARRAY); + _map_in_map_batch_ops(BPF_MAP_TYPE_ARRAY_OF_MAPS, BPF_MAP_TYPE_ARRAY, false); printf("%s:PASS with inner ARRAY map\n", __func__); - _map_in_map_batch_ops(BPF_MAP_TYPE_ARRAY_OF_MAPS, BPF_MAP_TYPE_HASH); + _map_in_map_batch_ops(BPF_MAP_TYPE_ARRAY_OF_MAPS, BPF_MAP_TYPE_HASH, false); printf("%s:PASS with inner HASH map\n", __func__); + _map_in_map_batch_ops(BPF_MAP_TYPE_ARRAY_OF_MAPS, BPF_MAP_TYPE_ARRAY, true); + printf("%s:PASS with inner ARRAY map with holes\n", __func__); + _map_in_map_batch_ops(BPF_MAP_TYPE_ARRAY_OF_MAPS, BPF_MAP_TYPE_HASH, true); + printf("%s:PASS with inner HASH map with holes\n", __func__); } void test_map_in_map_batch_ops_hash(void) { - _map_in_map_batch_ops(BPF_MAP_TYPE_HASH_OF_MAPS, BPF_MAP_TYPE_ARRAY); + _map_in_map_batch_ops(BPF_MAP_TYPE_HASH_OF_MAPS, BPF_MAP_TYPE_ARRAY, false); printf("%s:PASS with inner ARRAY map\n", __func__); - _map_in_map_batch_ops(BPF_MAP_TYPE_HASH_OF_MAPS, BPF_MAP_TYPE_HASH); + _map_in_map_batch_ops(BPF_MAP_TYPE_HASH_OF_MAPS, BPF_MAP_TYPE_HASH, false); printf("%s:PASS with inner HASH map\n", __func__); } -- GitLab From 4fa382be430421e1445f9c95c4dc9b7e0949ae8a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 14 Feb 2025 14:43:44 -0800 Subject: [PATCH 746/989] scsi: ufs: core: Fix ufshcd_is_ufs_dev_busy() and ufshcd_eh_timed_out() ufshcd_is_ufs_dev_busy(), ufshcd_print_host_state() and ufshcd_eh_timed_out() are used in both modes (legacy mode and MCQ mode). hba->outstanding_reqs only represents the outstanding requests in legacy mode. Hence, change hba->outstanding_reqs into scsi_host_busy(hba->host) in these functions. Fixes: eacb139b77ff ("scsi: ufs: core: mcq: Enable multi-circular queue") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20250214224352.3025151-1-bvanassche@acm.org Reviewed-by: Peter Wang Signed-off-by: Martin K. 
Petersen --- drivers/ufs/core/ufshcd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 1893a7ad95316..f9303e66bb798 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -266,7 +266,7 @@ static bool ufshcd_has_pending_tasks(struct ufs_hba *hba) static bool ufshcd_is_ufs_dev_busy(struct ufs_hba *hba) { - return hba->outstanding_reqs || ufshcd_has_pending_tasks(hba); + return scsi_host_busy(hba->host) || ufshcd_has_pending_tasks(hba); } static const struct ufs_dev_quirk ufs_fixups[] = { @@ -628,8 +628,8 @@ static void ufshcd_print_host_state(struct ufs_hba *hba) const struct scsi_device *sdev_ufs = hba->ufs_device_wlun; dev_err(hba->dev, "UFS Host state=%d\n", hba->ufshcd_state); - dev_err(hba->dev, "outstanding reqs=0x%lx tasks=0x%lx\n", - hba->outstanding_reqs, hba->outstanding_tasks); + dev_err(hba->dev, "%d outstanding reqs, tasks=0x%lx\n", + scsi_host_busy(hba->host), hba->outstanding_tasks); dev_err(hba->dev, "saved_err=0x%x, saved_uic_err=0x%x\n", hba->saved_err, hba->saved_uic_err); dev_err(hba->dev, "Device power mode=%d, UIC link state=%d\n", @@ -8882,7 +8882,7 @@ static enum scsi_timeout_action ufshcd_eh_timed_out(struct scsi_cmnd *scmd) dev_info(hba->dev, "%s() finished; outstanding_tasks = %#lx.\n", __func__, hba->outstanding_tasks); - return hba->outstanding_reqs ? SCSI_EH_RESET_TIMER : SCSI_EH_DONE; + return scsi_host_busy(hba->host) ? SCSI_EH_RESET_TIMER : SCSI_EH_DONE; } static const struct attribute_group *ufshcd_driver_groups[] = { -- GitLab From 415cadd505464d9a11ff5e0f6e0329c127849da5 Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Fri, 14 Feb 2025 14:43:59 -0800 Subject: [PATCH 747/989] gve: set xdp redirect target only when it is available Before this patch the NETDEV_XDP_ACT_NDO_XMIT XDP feature flag is set by default as part of driver initialization, and is never cleared. However, this flag differs from others in that it is used as an indicator for whether the driver is ready to perform the ndo_xdp_xmit operation as part of an XDP_REDIRECT. Kernel helpers xdp_features_(set|clear)_redirect_target exist to convey this meaning. This patch ensures that the netdev is only reported as a redirect target when XDP queues exist to forward traffic. 
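The same pattern, reduced to a driver-agnostic sketch (illustrative only, not the gve code, which follows below; the second argument of xdp_features_set_redirect_target() indicates scatter-gather support for redirected frames):

	/* Advertise NETDEV_XDP_ACT_NDO_XMIT only while XDP TX queues exist. */
	static void example_update_xdp_redirect_target(struct net_device *dev,
						       bool have_xdp_tx_queues)
	{
		if (have_xdp_tx_queues)
			xdp_features_set_redirect_target(dev, false);
		else
			xdp_features_clear_redirect_target(dev);
	}
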
Fixes: 39a7f4aa3e4a ("gve: Add XDP REDIRECT support for GQI-QPL format") Cc: stable@vger.kernel.org Reviewed-by: Praveen Kaligineedi Reviewed-by: Jeroen de Borst Signed-off-by: Joshua Washington Link: https://patch.msgid.link/20250214224417.1237818-1-joshwash@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve.h | 10 ++++++++++ drivers/net/ethernet/google/gve/gve_main.c | 6 +++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index 8167cc5fb0df1..78d2a19593d18 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -1116,6 +1116,16 @@ static inline u32 gve_xdp_tx_start_queue_id(struct gve_priv *priv) return gve_xdp_tx_queue_id(priv, 0); } +static inline bool gve_supports_xdp_xmit(struct gve_priv *priv) +{ + switch (priv->queue_format) { + case GVE_GQI_QPL_FORMAT: + return true; + default: + return false; + } +} + /* gqi napi handler defined in gve_main.c */ int gve_napi_poll(struct napi_struct *napi, int budget); diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 533e659b15b31..92237fb0b60c1 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1903,6 +1903,8 @@ static void gve_turndown(struct gve_priv *priv) /* Stop tx queues */ netif_tx_disable(priv->dev); + xdp_features_clear_redirect_target(priv->dev); + gve_clear_napi_enabled(priv); gve_clear_report_stats(priv); @@ -1972,6 +1974,9 @@ static void gve_turnup(struct gve_priv *priv) napi_schedule(&block->napi); } + if (priv->num_xdp_queues && gve_supports_xdp_xmit(priv)) + xdp_features_set_redirect_target(priv->dev, false); + gve_set_napi_enabled(priv); } @@ -2246,7 +2251,6 @@ static void gve_set_netdev_xdp_features(struct gve_priv *priv) if (priv->queue_format == GVE_GQI_QPL_FORMAT) { xdp_features = NETDEV_XDP_ACT_BASIC; xdp_features |= NETDEV_XDP_ACT_REDIRECT; - xdp_features |= NETDEV_XDP_ACT_NDO_XMIT; xdp_features |= NETDEV_XDP_ACT_XSK_ZEROCOPY; } else { xdp_features = 0; -- GitLab From 2f56be7f52ece7fc8c16a58ca9683f0a73e288e1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 15 Feb 2025 08:26:46 -0800 Subject: [PATCH 748/989] MAINTAINERS: trim the GVE entry We requested in the past that GVE patches coming out of Google should be submitted only by GVE maintainers. There were too many patches posted which didn't follow the subsystem guidance. Recently Joshua was added to maintainers, but even tho he was asked to follow the netdev "FAQ" in the past [1] he does not follow the local customs. It is not reasonable for a person who hasn't read the maintainer entry for the subsystem to be a driver maintainer. We can re-add once Joshua does some on-list reviews to prove the fluency with the upstream process. 
Link: https://lore.kernel.org/20240610172720.073d5912@kernel.org # [1] Link: https://patch.msgid.link/20250215162646.2446559-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 1405ebe703a8f..0bfcbe6a74ea7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9834,7 +9834,6 @@ F: drivers/input/touchscreen/goodix* GOOGLE ETHERNET DRIVERS M: Jeroen de Borst -M: Joshua Washington M: Harshitha Ramamurthy L: netdev@vger.kernel.org S: Maintained -- GitLab From f6093c5ec74d5cc495f89bd359253d9c738d04d9 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Mon, 17 Feb 2025 14:48:11 +0100 Subject: [PATCH 749/989] net: pse-pd: pd692x0: Fix power limit retrieval Fix incorrect data offset read in the pd692x0_pi_get_pw_limit callback. The issue was previously unnoticed as it was only used by the regulator API and not thoroughly tested, since the PSE is mainly controlled via ethtool. The function became actively used by ethtool after commit 3e9dbfec4998 ("net: pse-pd: Split ethtool_get_status into multiple callbacks"), which led to the discovery of this issue. Fix it by using the correct data offset. Fixes: a87e699c9d33 ("net: pse-pd: pd692x0: Enhance with new current limit and voltage read callbacks") Signed-off-by: Kory Maincent Link: https://patch.msgid.link/20250217134812.1925345-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/pse-pd/pd692x0.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/pse-pd/pd692x0.c b/drivers/net/pse-pd/pd692x0.c index fc9e23927b3b7..7d60a714ca536 100644 --- a/drivers/net/pse-pd/pd692x0.c +++ b/drivers/net/pse-pd/pd692x0.c @@ -1047,7 +1047,7 @@ static int pd692x0_pi_get_pw_limit(struct pse_controller_dev *pcdev, if (ret < 0) return ret; - return pd692x0_pi_get_pw_from_table(buf.data[2], buf.data[3]); + return pd692x0_pi_get_pw_from_table(buf.data[0], buf.data[1]); } static int pd692x0_pi_set_pw_limit(struct pse_controller_dev *pcdev, -- GitLab From e57a6320215c3967f51ab0edeff87db2095440e4 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Feb 2025 11:11:27 -0800 Subject: [PATCH 750/989] net: Add net_passive_inc() and net_passive_dec(). net_drop_ns() is NULL when CONFIG_NET_NS is disabled. The next patch introduces a function that increments and decrements net->passive. As a prep, let's rename and export net_free() to net_passive_dec() and add net_passive_inc(). 
Suggested-by: Eric Dumazet Link: https://lore.kernel.org/netdev/CANn89i+oUCt2VGvrbrweniTendZFEh+nwS=uonc004-aPkWy-Q@mail.gmail.com/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250217191129.19967-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/net_namespace.h | 11 +++++++++++ net/core/net_namespace.c | 8 ++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 7ba1402ca7796..f467a66abc6b1 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -297,6 +297,7 @@ static inline int check_net(const struct net *net) } void net_drop_ns(void *); +void net_passive_dec(struct net *net); #else @@ -326,8 +327,18 @@ static inline int check_net(const struct net *net) } #define net_drop_ns NULL + +static inline void net_passive_dec(struct net *net) +{ + refcount_dec(&net->passive); +} #endif +static inline void net_passive_inc(struct net *net) +{ + refcount_inc(&net->passive); +} + /* Returns true if the netns initialization is completed successfully */ static inline bool net_initialized(const struct net *net) { diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index cb39a12b2f829..4303f2a492624 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -464,7 +464,7 @@ static void net_complete_free(void) } -static void net_free(struct net *net) +void net_passive_dec(struct net *net) { if (refcount_dec_and_test(&net->passive)) { kfree(rcu_access_pointer(net->gen)); @@ -482,7 +482,7 @@ void net_drop_ns(void *p) struct net *net = (struct net *)p; if (net) - net_free(net); + net_passive_dec(net); } struct net *copy_net_ns(unsigned long flags, @@ -523,7 +523,7 @@ struct net *copy_net_ns(unsigned long flags, key_remove_domain(net->key_domain); #endif put_user_ns(user_ns); - net_free(net); + net_passive_dec(net); dec_ucounts: dec_net_namespaces(ucounts); return ERR_PTR(rv); @@ -672,7 +672,7 @@ static void cleanup_net(struct work_struct *work) key_remove_domain(net->key_domain); #endif put_user_ns(net->user_ns); - net_free(net); + net_passive_dec(net); } cleanup_net_task = NULL; } -- GitLab From 65161fb544aada499c912b6010a8f7d8e04f6130 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Feb 2025 11:11:28 -0800 Subject: [PATCH 751/989] net: Fix dev_net(dev) race in unregister_netdevice_notifier_dev_net(). After the cited commit, dev_net(dev) is fetched before holding RTNL and passed to __unregister_netdevice_notifier_net(). However, dev_net(dev) might be different after holding RTNL. In the reported case [0], while removing a VF device, its netns was being dismantled and the VF was moved to init_net. So the following sequence is basically illegal when dev was fetched without lookup: net = dev_net(dev); rtnl_net_lock(net); Let's use a new helper rtnl_net_dev_lock() to fix the race. It fetches dev_net_rcu(dev), bumps its net->passive, and checks if dev_net_rcu(dev) is changed after rtnl_net_lock(). 
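In caller terms the difference is roughly the following (simplified sketch, not the actual driver or core code; the new helpers live inside net/core/dev.c):

	/* Racy: the netns looked up before the lock may no longer be dev's. */
	static void unsafe_pattern(struct net_device *dev)
	{
		struct net *net = dev_net(dev);

		rtnl_net_lock(net);	/* dev may have moved to init_net by now */
		/* ... touch per-netns state ... */
		rtnl_net_unlock(net);
	}

	/* Safe: re-check dev_net_rcu(dev) under the lock and retry, holding a
	 * passive reference so the struct net cannot be freed meanwhile. */
	static void safe_pattern(struct net_device *dev)
	{
		rtnl_net_dev_lock(dev);
		/* ... dev_net(dev) is now stable ... */
		rtnl_net_dev_unlock(dev);
	}
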
[0]: BUG: KASAN: slab-use-after-free in notifier_call_chain (kernel/notifier.c:75 (discriminator 2)) Read of size 8 at addr ffff88810cefb4c8 by task test-bridge-lag/21127 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:123) print_report (mm/kasan/report.c:379 mm/kasan/report.c:489) kasan_report (mm/kasan/report.c:604) notifier_call_chain (kernel/notifier.c:75 (discriminator 2)) call_netdevice_notifiers_info (net/core/dev.c:2011) unregister_netdevice_many_notify (net/core/dev.c:11551) unregister_netdevice_queue (net/core/dev.c:11487) unregister_netdev (net/core/dev.c:11635) mlx5e_remove (drivers/net/ethernet/mellanox/mlx5/core/en_main.c:6552 drivers/net/ethernet/mellanox/mlx5/core/en_main.c:6579) mlx5_core auxiliary_bus_remove (drivers/base/auxiliary.c:230) device_release_driver_internal (drivers/base/dd.c:1275 drivers/base/dd.c:1296) bus_remove_device (./include/linux/kobject.h:193 drivers/base/base.h:73 drivers/base/bus.c:583) device_del (drivers/base/power/power.h:142 drivers/base/core.c:3855) mlx5_rescan_drivers_locked (./include/linux/auxiliary_bus.h:241 drivers/net/ethernet/mellanox/mlx5/core/dev.c:333 drivers/net/ethernet/mellanox/mlx5/core/dev.c:535 drivers/net/ethernet/mellanox/mlx5/core/dev.c:549) mlx5_core mlx5_unregister_device (drivers/net/ethernet/mellanox/mlx5/core/dev.c:468) mlx5_core mlx5_uninit_one (./include/linux/instrumented.h:68 ./include/asm-generic/bitops/instrumented-non-atomic.h:141 drivers/net/ethernet/mellanox/mlx5/core/main.c:1563) mlx5_core remove_one (drivers/net/ethernet/mellanox/mlx5/core/main.c:965 drivers/net/ethernet/mellanox/mlx5/core/main.c:2019) mlx5_core pci_device_remove (./include/linux/pm_runtime.h:129 drivers/pci/pci-driver.c:475) device_release_driver_internal (drivers/base/dd.c:1275 drivers/base/dd.c:1296) unbind_store (drivers/base/bus.c:245) kernfs_fop_write_iter (fs/kernfs/file.c:338) vfs_write (fs/read_write.c:587 (discriminator 1) fs/read_write.c:679 (discriminator 1)) ksys_write (fs/read_write.c:732) do_syscall_64 (arch/x86/entry/common.c:52 (discriminator 1) arch/x86/entry/common.c:83 (discriminator 1)) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) RIP: 0033:0x7f6a4d5018b7 Fixes: 7fb1073300a2 ("net: Hold rtnl_net_lock() in (un)?register_netdevice_notifier_dev_net().") Reported-by: Yael Chemla Closes: https://lore.kernel.org/netdev/146eabfe-123c-4970-901e-e961b4c09bc3@nvidia.com/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250217191129.19967-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 48 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index b91658e8aedb4..19e268568282a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2070,6 +2070,42 @@ static void __move_netdevice_notifier_net(struct net *src_net, __register_netdevice_notifier_net(dst_net, nb, true); } +static void rtnl_net_dev_lock(struct net_device *dev) +{ + bool again; + + do { + struct net *net; + + again = false; + + /* netns might be being dismantled. */ + rcu_read_lock(); + net = dev_net_rcu(dev); + net_passive_inc(net); + rcu_read_unlock(); + + rtnl_net_lock(net); + +#ifdef CONFIG_NET_NS + /* dev might have been moved to another netns. 
*/ + if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) { + rtnl_net_unlock(net); + net_passive_dec(net); + again = true; + } +#endif + } while (again); +} + +static void rtnl_net_dev_unlock(struct net_device *dev) +{ + struct net *net = dev_net(dev); + + rtnl_net_unlock(net); + net_passive_dec(net); +} + int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) @@ -2077,6 +2113,11 @@ int register_netdevice_notifier_dev_net(struct net_device *dev, struct net *net = dev_net(dev); int err; + /* rtnl_net_lock() assumes dev is not yet published by + * register_netdevice(). + */ + DEBUG_NET_WARN_ON_ONCE(!list_empty(&dev->dev_list)); + rtnl_net_lock(net); err = __register_netdevice_notifier_net(net, nb, false); if (!err) { @@ -2093,13 +2134,12 @@ int unregister_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) { - struct net *net = dev_net(dev); int err; - rtnl_net_lock(net); + rtnl_net_dev_lock(dev); list_del(&nn->list); - err = __unregister_netdevice_notifier_net(net, nb); - rtnl_net_unlock(net); + err = __unregister_netdevice_notifier_net(dev_net(dev), nb); + rtnl_net_dev_unlock(dev); return err; } -- GitLab From d4c6bfc83936cb61fac99e9891c406fbdd40f964 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Feb 2025 11:11:29 -0800 Subject: [PATCH 752/989] dev: Use rtnl_net_dev_lock() in unregister_netdev(). The following sequence is basically illegal when dev was fetched without lookup because dev_net(dev) might be different after holding rtnl_net_lock(): net = dev_net(dev); rtnl_net_lock(net); Let's use rtnl_net_dev_lock() in unregister_netdev(). Note that there is no real bug in unregister_netdev() for now because RTNL protects the scope even if dev_net(dev) is changed before/after RTNL. Fixes: 00fb9823939e ("dev: Hold per-netns RTNL in (un)?register_netdev().") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250217191129.19967-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 19e268568282a..fafd2f4b5d5d7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11920,11 +11920,9 @@ EXPORT_SYMBOL(unregister_netdevice_many); */ void unregister_netdev(struct net_device *dev) { - struct net *net = dev_net(dev); - - rtnl_net_lock(net); + rtnl_net_dev_lock(dev); unregister_netdevice(dev); - rtnl_net_unlock(net); + rtnl_net_dev_unlock(dev); } EXPORT_SYMBOL(unregister_netdev); -- GitLab From eff2eb592efd73f00590d578c3d6021f604df62c Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Wed, 19 Feb 2025 17:48:07 +1100 Subject: [PATCH 753/989] cxl: Fix cross-reference in documentation and add deprecation warning commit 5731d41af924 ("cxl: Deprecate driver") labelled the cxl driver as deprecated and moved the ABI documentation to the obsolete/ subdirectory, but didn't update cxl.rst, causing a warning once ff7ff6eb4f809 ("docs: media: Allow creating cross-references for RC ABI") was merged. Fix the cross-reference, and also add a deprecation warning. 
Fixes: 5731d41af924 ("cxl: Deprecate driver") Reported-by: Bagas Sanjaya Signed-off-by: Andrew Donnellan Acked-by: Bagas Sanjaya Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250219064807.175107-1-ajd@linux.ibm.com --- Documentation/arch/powerpc/cxl.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/arch/powerpc/cxl.rst b/Documentation/arch/powerpc/cxl.rst index d2d77057610e4..778adda740d24 100644 --- a/Documentation/arch/powerpc/cxl.rst +++ b/Documentation/arch/powerpc/cxl.rst @@ -18,6 +18,7 @@ Introduction both access system memory directly and with the same effective addresses. + **This driver is deprecated and will be removed in a future release.** Hardware overview ================= @@ -453,7 +454,7 @@ Sysfs Class A cxl sysfs class is added under /sys/class/cxl to facilitate enumeration and tuning of the accelerators. Its layout is - described in Documentation/ABI/testing/sysfs-class-cxl + described in Documentation/ABI/obsolete/sysfs-class-cxl Udev rules -- GitLab From 8821f36333e27c8355d4a730649923f938e1e4b9 Mon Sep 17 00:00:00 2001 From: Friedrich Vock Date: Mon, 27 Jan 2025 16:27:52 +0100 Subject: [PATCH 754/989] cgroup/dmem: Don't open-code css_for_each_descendant_pre The current implementation has a bug: If the current css doesn't contain any pool that is a descendant of the "pool" (i.e. when found_descendant == false), then "pool" will point to some unrelated pool. If the current css has a child, we'll overwrite parent_pool with this unrelated pool on the next iteration. Since we can just check whether a pool refers to the same region to determine whether or not it's related, all the additional pool tracking is unnecessary, so just switch to using css_for_each_descendant_pre for traversal. Fixes: b168ed458dde ("kernel/cgroup: Add "dmem" memory accounting cgroup") Signed-off-by: Friedrich Vock Reviewed-by: Maarten Lankhorst Link: https://patchwork.freedesktop.org/patch/msgid/20250127152754.21325-1-friedrich.vock@gmx.de Signed-off-by: Maarten Lankhorst --- kernel/cgroup/dmem.c | 50 ++++++++++---------------------------------- 1 file changed, 11 insertions(+), 39 deletions(-) diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c index fbe34299673d3..10b63433f0573 100644 --- a/kernel/cgroup/dmem.c +++ b/kernel/cgroup/dmem.c @@ -220,60 +220,32 @@ dmem_cgroup_calculate_protection(struct dmem_cgroup_pool_state *limit_pool, struct dmem_cgroup_pool_state *test_pool) { struct page_counter *climit; - struct cgroup_subsys_state *css, *next_css; + struct cgroup_subsys_state *css; struct dmemcg_state *dmemcg_iter; - struct dmem_cgroup_pool_state *pool, *parent_pool; - bool found_descendant; + struct dmem_cgroup_pool_state *pool, *found_pool; climit = &limit_pool->cnt; rcu_read_lock(); - parent_pool = pool = limit_pool; - css = &limit_pool->cs->css; - /* - * This logic is roughly equivalent to css_foreach_descendant_pre, - * except we also track the parent pool to find out which pool we need - * to calculate protection values for. - * - * We can stop the traversal once we find test_pool among the - * descendants since we don't really care about any others. 
- */ - while (pool != test_pool) { - next_css = css_next_child(NULL, css); - if (next_css) { - parent_pool = pool; - } else { - while (css != &limit_pool->cs->css) { - next_css = css_next_child(css, css->parent); - if (next_css) - break; - css = css->parent; - parent_pool = pool_parent(parent_pool); - } - /* - * We can only hit this when test_pool is not a - * descendant of limit_pool. - */ - if (WARN_ON_ONCE(css == &limit_pool->cs->css)) - break; - } - css = next_css; - - found_descendant = false; + css_for_each_descendant_pre(css, &limit_pool->cs->css) { dmemcg_iter = container_of(css, struct dmemcg_state, css); + found_pool = NULL; list_for_each_entry_rcu(pool, &dmemcg_iter->pools, css_node) { - if (pool_parent(pool) == parent_pool) { - found_descendant = true; + if (pool->region == limit_pool->region) { + found_pool = pool; break; } } - if (!found_descendant) + if (!found_pool) continue; page_counter_calculate_protection( - climit, &pool->cnt, true); + climit, &found_pool->cnt, true); + + if (found_pool == test_pool) + break; } rcu_read_unlock(); } -- GitLab From 3dbc0215e3c502a9f3221576da0fdc9847fb9721 Mon Sep 17 00:00:00 2001 From: Aaron Kling Date: Tue, 18 Feb 2025 03:28:03 -0600 Subject: [PATCH 755/989] drm/nouveau/pmu: Fix gp10b firmware guard Most kernel configs enable multiple Tegra SoC generations, causing this typo to go unnoticed. But in the case where a kernel config is strictly for Tegra186, this is a problem. Fixes: 989863d7cbe5 ("drm/nouveau/pmu: select implementation based on available firmware") Signed-off-by: Aaron Kling Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20250218-nouveau-gm10b-guard-v2-1-a4de71500d48@gmail.com --- drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp10b.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp10b.c b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp10b.c index a6f410ba60bc9..d393bc540f862 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp10b.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/pmu/gp10b.c @@ -75,7 +75,7 @@ gp10b_pmu_acr = { .bootstrap_multiple_falcons = gp10b_pmu_acr_bootstrap_multiple_falcons, }; -#if IS_ENABLED(CONFIG_ARCH_TEGRA_210_SOC) +#if IS_ENABLED(CONFIG_ARCH_TEGRA_186_SOC) MODULE_FIRMWARE("nvidia/gp10b/pmu/desc.bin"); MODULE_FIRMWARE("nvidia/gp10b/pmu/image.bin"); MODULE_FIRMWARE("nvidia/gp10b/pmu/sig.bin"); -- GitLab From 6586788f0a8d0f3b33b1383885575f5b5f7b9dad Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sat, 15 Feb 2025 08:37:53 +0100 Subject: [PATCH 756/989] MAINTAINERS: Remove myself I was pondering with myself for a while if I should just make it official that I'm not really involved in the kernel community anymore, neither as a reviewer, nor as a maintainer. Most of the time I simply excused myself with "if something urgent comes up, I can chime in and help out". Lyude and Danilo are doing a wonderful job and I've put all my trust into them. However, there is one thing I can't stand and it's hurting me the most. I'm convinced, no, my core believe is, that inclusivity and respect, working with others as equals, no power plays involved, is how we should work together within the Free and Open Source community. I can understand maintainers needing to learn, being concerned on technical points. Everybody deserves the time to understand and learn. It is my true belief that most people are capable of change eventually. 
I truly believe this community can change from within, however this doesn't mean it's going to be a smooth process. The moment I made up my mind about this was reading the following words written by a maintainer within the kernel community: "we are the thin blue line" This isn't okay. This isn't creating an inclusive environment. This isn't okay with the current political situation especially in the US. A maintainer speaking those words can't be kept. No matter how important or critical or relevant they are. They need to be removed until they learn. Learn what those words mean for a lot of marginalized people. Learn about what horrors it evokes in their minds. I can't in good faith remain to be part of a project and its community where those words are tolerated. Those words are not technical, they are a political statement. Even if unintentionally, such words carry power, they carry meanings one needs to be aware of. They do cause an immense amount of harm. I wish the best of luck for everybody to continue to try to work from within. You got my full support and I won't hold it against anybody trying to improve the community, it's a thankless job, it's a lot of work. People will continue to burn out. I got burned out enough by myself caring about the bits I maintained, but eventually I had to realize my limits. The obligation I felt was eating me from inside. It stopped being fun at some point and I reached a point where I simply couldn't continue the work I was so motivated doing as I've did in the early days. Please respect my wishes and put this statement as is into the tree. Leaving anything out destroys its entire meaning. Respectfully Karol Signed-off-by: Karol Herbst Acked-by: Masami Hiramatsu (Google) Acked-by: Steven Rostedt (Google) Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20250215073753.1217002-2-kherbst@redhat.com --- MAINTAINERS | 2 -- 1 file changed, 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index fd4f528bc4145..18ade2ea4f3c4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7354,7 +7354,6 @@ F: Documentation/devicetree/bindings/display/panel/novatek,nt36672a.yaml F: drivers/gpu/drm/panel/panel-novatek-nt36672a.c DRM DRIVER FOR NVIDIA GEFORCE/QUADRO GPUS -M: Karol Herbst M: Lyude Paul M: Danilo Krummrich L: dri-devel@lists.freedesktop.org @@ -23819,7 +23818,6 @@ F: tools/testing/selftests/ftrace/ TRACING MMIO ACCESSES (MMIOTRACE) M: Steven Rostedt M: Masami Hiramatsu -R: Karol Herbst R: Pekka Paalanen L: linux-kernel@vger.kernel.org L: nouveau@lists.freedesktop.org -- GitLab From 60255f3704fde70ed3c4d62f919aa4b46f841f70 Mon Sep 17 00:00:00 2001 From: Niravkumar L Rabara Date: Sun, 16 Feb 2025 11:15:36 +0800 Subject: [PATCH 757/989] mtd: rawnand: cadence: fix unchecked dereference Add NULL check before variable dereference to fix static checker warning. 
Fixes: d76d22b5096c ("mtd: rawnand: cadence: use dma_map_resource for sdma address") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/e448a22c-bada-448d-9167-7af71305130d@stanley.mountain/ Signed-off-by: Niravkumar L Rabara Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/cadence-nand-controller.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/nand/raw/cadence-nand-controller.c b/drivers/mtd/nand/raw/cadence-nand-controller.c index 0b2db4173e723..6667eea955977 100644 --- a/drivers/mtd/nand/raw/cadence-nand-controller.c +++ b/drivers/mtd/nand/raw/cadence-nand-controller.c @@ -2972,8 +2972,10 @@ static int cadence_nand_init(struct cdns_nand_ctrl *cdns_ctrl) static void cadence_nand_remove(struct cdns_nand_ctrl *cdns_ctrl) { cadence_nand_chips_cleanup(cdns_ctrl); - dma_unmap_resource(cdns_ctrl->dmac->device->dev, cdns_ctrl->io.iova_dma, - cdns_ctrl->io.size, DMA_BIDIRECTIONAL, 0); + if (cdns_ctrl->dmac) + dma_unmap_resource(cdns_ctrl->dmac->device->dev, + cdns_ctrl->io.iova_dma, cdns_ctrl->io.size, + DMA_BIDIRECTIONAL, 0); cadence_nand_irq_cleanup(cdns_ctrl->irq, cdns_ctrl); kfree(cdns_ctrl->buf); dma_free_coherent(cdns_ctrl->dev, sizeof(struct cadence_nand_cdma_desc), -- GitLab From fb3331f53e3cb1f1505f918f4f33bb0a3a231e4f Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Tue, 18 Feb 2025 20:34:43 -0700 Subject: [PATCH 758/989] io_uring/rsrc: remove unused constants IO_NODE_ALLOC_CACHE_MAX has been unused since commit fbbb8e991d86 ("io_uring/rsrc: get rid of io_rsrc_node allocation cache") removed the rsrc_node_cache. IO_RSRC_TAG_TABLE_SHIFT and IO_RSRC_TAG_TABLE_MASK have been unused since commit 7029acd8a950 ("io_uring/rsrc: get rid of per-ring io_rsrc_node list") removed the separate tag table for registered nodes. Signed-off-by: Caleb Sander Mateos Reviewed-by: Li Zetao Link: https://lore.kernel.org/r/20250219033444.2020136-1-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/rsrc.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 190f7ee45de93..89ea0135a1a0d 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -4,12 +4,6 @@ #include -#define IO_NODE_ALLOC_CACHE_MAX 32 - -#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) -#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) -#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) - enum { IORING_RSRC_FILE = 0, IORING_RSRC_BUFFER = 1, -- GitLab From 7330195e6018ece3e886177ffbc9349a0b6585e6 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 11 Feb 2025 20:51:25 +1030 Subject: [PATCH 759/989] smb: client, common: Avoid multiple -Wflex-array-member-not-at-end warnings -Wflex-array-member-not-at-end was introduced in GCC-14, and we are getting ready to enable it, globally. So, in order to avoid ending up with flexible-array members in the middle of other structs, we use the `__struct_group()` helper to separate the flexible arrays from the rest of the members in the flexible structures. We then use the newly created tagged `struct smb2_file_link_info_hdr` and `struct smb2_file_rename_info_hdr` to replace the type of the objects causing trouble: `rename_info` and `link_info` in `struct smb2_compound_vars`. We also want to ensure that when new members need to be added to the flexible structures, they are always included within the newly created tagged structs. For this, we use `static_assert()`. 
This ensures that the memory layout for both the flexible structure and the new tagged struct is the same after any changes. So, with these changes, fix 86 of the following warnings: fs/smb/client/cifsglob.h:2335:36: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] fs/smb/client/cifsglob.h:2334:38: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] Signed-off-by: Gustavo A. R. Silva Acked-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 4 ++-- fs/smb/common/smb2pdu.h | 30 ++++++++++++++++++++---------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 4bdd6a43e5215..bc06b8ae2ebd4 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -2324,8 +2324,8 @@ struct smb2_compound_vars { struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; struct kvec close_iov; - struct smb2_file_rename_info rename_info; - struct smb2_file_link_info link_info; + struct smb2_file_rename_info_hdr rename_info; + struct smb2_file_link_info_hdr link_info; struct kvec ea_iov; }; diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index 3336df2ea5d4a..c7a0efda44036 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -1707,23 +1707,33 @@ struct smb2_file_internal_info { } __packed; /* level 6 Query */ struct smb2_file_rename_info { /* encoding of request for level 10 */ - __u8 ReplaceIfExists; /* 1 = replace existing target with new */ - /* 0 = fail if target already exists */ - __u8 Reserved[7]; - __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ - __le32 FileNameLength; + /* New members MUST be added within the struct_group() macro below. */ + __struct_group(smb2_file_rename_info_hdr, __hdr, __packed, + __u8 ReplaceIfExists; /* 1 = replace existing target with new */ + /* 0 = fail if target already exists */ + __u8 Reserved[7]; + __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ + __le32 FileNameLength; + ); char FileName[]; /* New name to be assigned */ /* padding - overall struct size must be >= 24 so filename + pad >= 6 */ } __packed; /* level 10 Set */ +static_assert(offsetof(struct smb2_file_rename_info, FileName) == sizeof(struct smb2_file_rename_info_hdr), + "struct member likely outside of __struct_group()"); struct smb2_file_link_info { /* encoding of request for level 11 */ - __u8 ReplaceIfExists; /* 1 = replace existing link with new */ - /* 0 = fail if link already exists */ - __u8 Reserved[7]; - __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ - __le32 FileNameLength; + /* New members MUST be added within the struct_group() macro below. */ + __struct_group(smb2_file_link_info_hdr, __hdr, __packed, + __u8 ReplaceIfExists; /* 1 = replace existing link with new */ + /* 0 = fail if link already exists */ + __u8 Reserved[7]; + __u64 RootDirectory; /* MBZ for network operations (why says spec?) 
*/ + __le32 FileNameLength; + ); char FileName[]; /* Name to be assigned to new link */ } __packed; /* level 11 Set */ +static_assert(offsetof(struct smb2_file_link_info, FileName) == sizeof(struct smb2_file_link_info_hdr), + "struct member likely outside of __struct_group()"); /* * This level 18, although with struct with same name is different from cifs -- GitLab From 9df23801c83d3e12b4c09be39d37d2be385e52f9 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 16 Feb 2025 22:17:54 -0600 Subject: [PATCH 760/989] smb311: failure to open files of length 1040 when mounting with SMB3.1.1 POSIX extensions If a file size has bits 0x410 = ATTR_DIRECTORY | ATTR_REPARSE set then during queryinfo (stat) the file is regarded as a directory and subsequent opens can fail. A simple test example is trying to open any file 1040 bytes long when mounting with "posix" (SMB3.1.1 POSIX/Linux Extensions). The cause of this bug is that Attributes field in smb2_file_all_info struct occupies the same place that EndOfFile field in smb311_posix_qinfo, and sometimes the latter struct is incorrectly processed as if it was the first one. Reported-by: Oleh Nykyforchyn Tested-by: Oleh Nykyforchyn Acked-by: Paulo Alcantara (Red Hat) Cc: stable@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 1 + fs/smb/client/reparse.h | 28 ++++++++++++++++++++++------ fs/smb/client/smb2inode.c | 4 ++++ fs/smb/client/smb2ops.c | 3 ++- 4 files changed, 29 insertions(+), 7 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index bc06b8ae2ebd4..cddeb2adbf4af 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -253,6 +253,7 @@ struct cifs_cred { struct cifs_open_info_data { bool adjust_tz; bool reparse_point; + bool contains_posix_file_info; struct { /* ioctl response buffer */ struct { diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h index 5a753fec7e2c2..c0be5ab45a78a 100644 --- a/fs/smb/client/reparse.h +++ b/fs/smb/client/reparse.h @@ -99,14 +99,30 @@ static inline bool reparse_inode_match(struct inode *inode, static inline bool cifs_open_data_reparse(struct cifs_open_info_data *data) { - struct smb2_file_all_info *fi = &data->fi; - u32 attrs = le32_to_cpu(fi->Attributes); + u32 attrs; bool ret; - ret = data->reparse_point || (attrs & ATTR_REPARSE); - if (ret) - attrs |= ATTR_REPARSE; - fi->Attributes = cpu_to_le32(attrs); + if (data->contains_posix_file_info) { + struct smb311_posix_qinfo *fi = &data->posix_fi; + + attrs = le32_to_cpu(fi->DosAttributes); + if (data->reparse_point) { + attrs |= ATTR_REPARSE; + fi->DosAttributes = cpu_to_le32(attrs); + } + + } else { + struct smb2_file_all_info *fi = &data->fi; + + attrs = le32_to_cpu(fi->Attributes); + if (data->reparse_point) { + attrs |= ATTR_REPARSE; + fi->Attributes = cpu_to_le32(attrs); + } + } + + ret = attrs & ATTR_REPARSE; + return ret; } diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 5dfb30b0a852c..826b57a5a2a8d 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -650,6 +650,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, switch (cmds[i]) { case SMB2_OP_QUERY_INFO: idata = in_iov[i].iov_base; + idata->contains_posix_file_info = false; if (rc == 0 && cfile && cfile->symlink_target) { idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); if (!idata->symlink_target) @@ -673,6 +674,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, break; case SMB2_OP_POSIX_QUERY_INFO: 
idata = in_iov[i].iov_base; + idata->contains_posix_file_info = true; if (rc == 0 && cfile && cfile->symlink_target) { idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); if (!idata->symlink_target) @@ -770,6 +772,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, idata = in_iov[i].iov_base; idata->reparse.io.iov = *iov; idata->reparse.io.buftype = resp_buftype[i + 1]; + idata->contains_posix_file_info = false; /* BB VERIFY */ rbuf = reparse_buf_ptr(iov); if (IS_ERR(rbuf)) { rc = PTR_ERR(rbuf); @@ -791,6 +794,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, case SMB2_OP_QUERY_WSL_EA: if (!rc) { idata = in_iov[i].iov_base; + idata->contains_posix_file_info = false; qi_rsp = rsp_iov[i + 1].iov_base; data[0] = (u8 *)qi_rsp + le16_to_cpu(qi_rsp->OutputBufferOffset); size[0] = le32_to_cpu(qi_rsp->OutputBufferLength); diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index ec36bed54b0b9..23e0c8be7fb52 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -1001,6 +1001,7 @@ static int smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, if (!data->symlink_target) return -ENOMEM; } + data->contains_posix_file_info = false; return SMB2_query_info(xid, tcon, fid->persistent_fid, fid->volatile_fid, &data->fi); } @@ -5146,7 +5147,7 @@ int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, FILE_CREATE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL, ACL_NO_MODE); oparms.fid = &fid; - + idata.contains_posix_file_info = false; rc = server->ops->open(xid, &oparms, &oplock, &idata); if (rc) goto out; -- GitLab From cad3fc0a4c8cef07b07ceddc137f582267577250 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Wed, 18 Sep 2024 00:16:05 +0200 Subject: [PATCH 761/989] cifs: Throw -EOPNOTSUPP error on unsupported reparse point type from parse_reparse_point() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This would help to track and detect by caller if the reparse point type was processed or not. Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/reparse.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 0a5a52a8a7dd1..2b9e9885dc425 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -1088,13 +1088,12 @@ int parse_reparse_point(struct reparse_data_buffer *buf, le32_to_cpu(buf->ReparseTag)); return -EIO; } - break; + return 0; default: cifs_tcon_dbg(VFS | ONCE, "unhandled reparse tag: 0x%08x\n", le32_to_cpu(buf->ReparseTag)); - break; + return -EOPNOTSUPP; } - return 0; } int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, -- GitLab From b587fd128660d48cd2122f870f720ff8e2b4abb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Wed, 18 Sep 2024 00:28:25 +0200 Subject: [PATCH 762/989] cifs: Treat unhandled directory name surrogate reparse points as mount directory nodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the reparse point was not handled (indicated by the -EOPNOTSUPP from ops->parse_reparse_point() call) but reparse tag is of type name surrogate directory type, then treat is as a new mount point. Name surrogate reparse point represents another named entity in the system. From SMB client point of view, this another entity is resolved on the SMB server, and server serves its content automatically. 
Therefore from Linux client point of view, this name surrogate reparse point of directory type crosses mount point. Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/inode.c | 13 +++++++++++++ fs/smb/common/smbfsctl.h | 3 +++ 2 files changed, 16 insertions(+) diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 3261190e6f903..616149c7f0a54 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1215,6 +1215,19 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, rc = server->ops->parse_reparse_point(cifs_sb, full_path, iov, data); + /* + * If the reparse point was not handled but it is the + * name surrogate which points to directory, then treat + * is as a new mount point. Name surrogate reparse point + * represents another named entity in the system. + */ + if (rc == -EOPNOTSUPP && + IS_REPARSE_TAG_NAME_SURROGATE(data->reparse.tag) && + (le32_to_cpu(data->fi.Attributes) & ATTR_DIRECTORY)) { + rc = 0; + cifs_create_junction_fattr(fattr, sb); + goto out; + } } if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK && !rc) { diff --git a/fs/smb/common/smbfsctl.h b/fs/smb/common/smbfsctl.h index 4b379e84c46b9..3253a18ecb5cb 100644 --- a/fs/smb/common/smbfsctl.h +++ b/fs/smb/common/smbfsctl.h @@ -159,6 +159,9 @@ #define IO_REPARSE_TAG_LX_CHR 0x80000025 #define IO_REPARSE_TAG_LX_BLK 0x80000026 +/* If Name Surrogate Bit is set, the file or directory represents another named entity in the system. */ +#define IS_REPARSE_TAG_NAME_SURROGATE(tag) (!!((tag) & 0x20000000)) + /* fsctl flags */ /* If Flags is set to this value, the request is an FSCTL not ioctl request */ #define SMB2_0_IOCTL_IS_FSCTL 0x00000001 -- GitLab From c84e125fff2615b4d9c259e762596134eddd2f27 Mon Sep 17 00:00:00 2001 From: Vasiliy Kovalev Date: Sat, 15 Feb 2025 00:51:48 +0300 Subject: [PATCH 763/989] ovl: fix UAF in ovl_dentry_update_reval by moving dput() in ovl_link_up The issue was caused by dput(upper) being called before ovl_dentry_update_reval(), while upper->d_flags was still accessed in ovl_dentry_remote(). Move dput(upper) after its last use to prevent use-after-free. BUG: KASAN: slab-use-after-free in ovl_dentry_remote fs/overlayfs/util.c:162 [inline] BUG: KASAN: slab-use-after-free in ovl_dentry_update_reval+0xd2/0xf0 fs/overlayfs/util.c:167 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:114 print_address_description mm/kasan/report.c:377 [inline] print_report+0xc3/0x620 mm/kasan/report.c:488 kasan_report+0xd9/0x110 mm/kasan/report.c:601 ovl_dentry_remote fs/overlayfs/util.c:162 [inline] ovl_dentry_update_reval+0xd2/0xf0 fs/overlayfs/util.c:167 ovl_link_up fs/overlayfs/copy_up.c:610 [inline] ovl_copy_up_one+0x2105/0x3490 fs/overlayfs/copy_up.c:1170 ovl_copy_up_flags+0x18d/0x200 fs/overlayfs/copy_up.c:1223 ovl_rename+0x39e/0x18c0 fs/overlayfs/dir.c:1136 vfs_rename+0xf84/0x20a0 fs/namei.c:4893 ... 
Fixes: b07d5cc93e1b ("ovl: update of dentry revalidate flags after copy up") Reported-by: syzbot+316db8a1191938280eb6@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=316db8a1191938280eb6 Signed-off-by: Vasiliy Kovalev Link: https://lore.kernel.org/r/20250214215148.761147-1-kovalev@altlinux.org Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/overlayfs/copy_up.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 0c28e5fa34077..d7310fcf38881 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -618,7 +618,6 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) err = PTR_ERR(upper); if (!IS_ERR(upper)) { err = ovl_do_link(ofs, ovl_dentry_upper(c->dentry), udir, upper); - dput(upper); if (!err) { /* Restore timestamps on parent (best effort) */ @@ -626,6 +625,7 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) ovl_dentry_set_upper_alias(c->dentry); ovl_dentry_update_reval(c->dentry, upper); } + dput(upper); } inode_unlock(udir); if (err) -- GitLab From 838c17fd077e611b12c78feb0feee1b30ed09b63 Mon Sep 17 00:00:00 2001 From: Su Hui Date: Tue, 11 Feb 2025 09:53:55 +0800 Subject: [PATCH 764/989] accel/amdxdna: Add missing include linux/slab.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When compiling without CONFIG_IA32_EMULATION, there can be some errors: drivers/accel/amdxdna/amdxdna_mailbox.c: In function ‘mailbox_release_msg’: drivers/accel/amdxdna/amdxdna_mailbox.c:197:2: error: implicit declaration of function ‘kfree’. 197 | kfree(mb_msg); | ^~~~~ drivers/accel/amdxdna/amdxdna_mailbox.c: In function ‘xdna_mailbox_send_msg’: drivers/accel/amdxdna/amdxdna_mailbox.c:418:11: error:implicit declaration of function ‘kzalloc’. 418 | mb_msg = kzalloc(sizeof(*mb_msg) + pkg_size, GFP_KERNEL); | ^~~~~~~ Add the missing include. Fixes: b87f920b9344 ("accel/amdxdna: Support hardware mailbox") Signed-off-by: Su Hui Reviewed-by: Lizhi Hou Reviewed-by: Jeffrey Hugo Signed-off-by: Lizhi Hou Link: https://patchwork.freedesktop.org/patch/msgid/20250211015354.3388171-1-suhui@nfschina.com --- drivers/accel/amdxdna/amdxdna_mailbox.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/accel/amdxdna/amdxdna_mailbox.c b/drivers/accel/amdxdna/amdxdna_mailbox.c index 814b16bb1953f..e5301fac13971 100644 --- a/drivers/accel/amdxdna/amdxdna_mailbox.c +++ b/drivers/accel/amdxdna/amdxdna_mailbox.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS -- GitLab From b9ddb3e1a8aa86c61c4a93e27cf66414f5fa7b6e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Feb 2025 12:43:42 -0500 Subject: [PATCH 765/989] bcachefs: Fix fsck directory i_size checking Error handling was wrong, causing unhandled transaction restart errors. check_directory_size() was also inefficient, since keys in multiple snapshots would be iterated over once for every snapshot. Convert it to the same scheme used for i_sectors and subdir count checking. 
Cc: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 78 +++++++++++++++----------------------- fs/bcachefs/sb-downgrade.c | 2 +- 2 files changed, 32 insertions(+), 48 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 53a421ff136d3..9bf316e7b845d 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -823,6 +823,7 @@ struct inode_walker_entry { struct bch_inode_unpacked inode; u32 snapshot; u64 count; + u64 i_size; }; struct inode_walker { @@ -910,8 +911,9 @@ lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_ if (k.k->p.snapshot != i->snapshot && !is_whiteout) { struct inode_walker_entry new = *i; - new.snapshot = k.k->p.snapshot; - new.count = 0; + new.snapshot = k.k->p.snapshot; + new.count = 0; + new.i_size = 0; struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); @@ -1116,37 +1118,6 @@ static int get_snapshot_root_inode(struct btree_trans *trans, return ret; } -static int check_directory_size(struct btree_trans *trans, - struct bch_inode_unpacked *inode_u, - struct bkey_s_c inode_k, bool *write_inode) -{ - struct btree_iter iter; - struct bkey_s_c k; - u64 new_size = 0; - int ret; - - for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents, - SPOS(inode_k.k->p.offset, 0, inode_k.k->p.snapshot), - POS(inode_k.k->p.offset, U64_MAX), - 0, k, ret) { - if (k.k->type != KEY_TYPE_dirent) - continue; - - struct bkey_s_c_dirent dirent = bkey_s_c_to_dirent(k); - struct qstr name = bch2_dirent_get_name(dirent); - - new_size += dirent_occupied_size(&name); - } - bch2_trans_iter_exit(trans, &iter); - - if (!ret && inode_u->bi_size != new_size) { - inode_u->bi_size = new_size; - *write_inode = true; - } - - return ret; -} - static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -1335,16 +1306,6 @@ static int check_inode(struct btree_trans *trans, u.bi_journal_seq = journal_cur_seq(&c->journal); do_update = true; } - - if (S_ISDIR(u.bi_mode)) { - ret = check_directory_size(trans, &u, k, &do_update); - - fsck_err_on(ret, - trans, directory_size_mismatch, - "directory inode %llu:%u with the mismatch directory size", - u.bi_inum, k.k->p.snapshot); - ret = 0; - } do_update: if (do_update) { ret = __bch2_fsck_write_inode(trans, &u); @@ -2017,10 +1978,31 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ return ret; } -static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) +static int check_dir_i_size_notnested(struct btree_trans *trans, struct inode_walker *w) +{ + struct bch_fs *c = trans->c; + int ret = 0; + + darray_for_each(w->inodes, i) + if (fsck_err_on(i->inode.bi_size != i->i_size, + trans, inode_dir_wrong_nlink, + "directory %llu:%u with wrong i_size: got %llu, should be %llu", + w->last_pos.inode, i->snapshot, i->inode.bi_size, i->i_size)) { + i->inode.bi_size = i->i_size; + ret = bch2_fsck_write_inode(trans, &i->inode); + if (ret) + break; + } +fsck_err: + bch_err_fn(c, ret); + return ret; +} + +static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w) { u32 restart_count = trans->restart_count; return check_subdir_count_notnested(trans, w) ?: + check_dir_i_size_notnested(trans, w) ?: trans_was_restarted(trans, restart_count); } @@ -2367,7 +2349,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto out; if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) { - ret = check_subdir_count(trans, dir); + ret = 
check_subdir_dirents_count(trans, dir); if (ret) goto err; } @@ -2457,9 +2439,11 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; - if (d.v->d_type == DT_DIR) - for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) + for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) { + if (d.v->d_type == DT_DIR) i->count++; + i->i_size += bkey_bytes(d.k); + } out: err: fsck_err: diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 14f6b6a5fb383..35e07bc8fbd34 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -92,7 +92,7 @@ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ BCH_FSCK_ERR_accounting_key_junk_at_end) \ x(directory_size, \ - BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ + BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \ BCH_FSCK_ERR_directory_size_mismatch) \ #define DOWNGRADE_TABLE() \ -- GitLab From b9275eabe31e6679ae12c46a4a0a18d622db4570 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Tue, 18 Feb 2025 00:38:27 +0200 Subject: [PATCH 766/989] drm/i915/dp: Fix error handling during 128b/132b link training At the end of a 128b/132b link training sequence, the HW expects the transcoder training pattern to be set to TPS2 and from that to normal mode (disabling the training pattern). Transitioning from TPS1 directly to normal mode leaves the transcoder in a stuck state, resulting in page-flip timeouts later in the modeset sequence. Atm, in case of a failure during link training, the transcoder may still be set to output the TPS1 pattern. Later the transcoder is then set from TPS1 directly to normal mode in intel_dp_stop_link_train(), leading to modeset failures later as described above. Fix this by setting the training pattern to TPS2, if the link training failed at any point. The clue in the specification about the above HW behavior is the explicit mention that TPS2 must be set after the link training sequence (and there isn't a similar requirement specified for the 8b/10b link training), see the Bspec links below. v2: Add bspec aspect/link to the commit log. (Jani) Bspec: 54128, 65448, 68849 Cc: stable@vger.kernel.org # v5.18+ Cc: Jani Nikula Signed-off-by: Imre Deak Acked-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20250217223828.1166093-2-imre.deak@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit 8b4bbaf8ddc1f68f3ee96a706f65fdb1bcd9d355) Signed-off-by: Rodrigo Vivi --- .../gpu/drm/i915/display/intel_dp_link_training.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp_link_training.c b/drivers/gpu/drm/i915/display/intel_dp_link_training.c index 8b1977cfec503..6696a32cdd3e6 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_link_training.c +++ b/drivers/gpu/drm/i915/display/intel_dp_link_training.c @@ -1563,7 +1563,7 @@ intel_dp_128b132b_link_train(struct intel_dp *intel_dp, if (wait_for(intel_dp_128b132b_intra_hop(intel_dp, crtc_state) == 0, 500)) { lt_err(intel_dp, DP_PHY_DPRX, "128b/132b intra-hop not clear\n"); - return false; + goto out; } if (intel_dp_128b132b_lane_eq(intel_dp, crtc_state) && @@ -1575,6 +1575,19 @@ intel_dp_128b132b_link_train(struct intel_dp *intel_dp, passed ? "passed" : "failed", crtc_state->port_clock, crtc_state->lane_count); +out: + /* + * Ensure that the training pattern does get set to TPS2 even in case + * of a failure, as is the case at the end of a passing link training + * and what is expected by the transcoder.
Leaving TPS1 set (and + * disabling the link train mode in DP_TP_CTL later from TPS1 directly) + * would result in a stuck transcoder HW state and flip-done timeouts + * later in the modeset sequence. + */ + if (!passed) + intel_dp_program_link_training_pattern(intel_dp, crtc_state, + DP_PHY_DPRX, DP_TRAINING_PATTERN_2); + return passed; } -- GitLab From 8058b49bf6fff777bf3f47309c7b15dbef2191af Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Tue, 18 Feb 2025 00:38:28 +0200 Subject: [PATCH 767/989] drm/i915/dp: Fix disabling the transcoder function in 128b/132b mode During disabling the transcoder in DP 128b/132b mode (both in case of an MST master transcoder and in case of SST) the transcoder function must be first disabled without changing any other field in the register (in particular leaving the DDI port and mode select fields unchanged) and clearing the DDI port and mode select fields separately, later during the disabling sequences. Fix the sequence accordingly. Bspec: 54128, 65448, 68849 Cc: Jani Nikula Fixes: 79a6734cd56e ("drm/i915/ddi: disable trancoder port select for 128b/132b SST") Signed-off-by: Imre Deak Reviewed-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20250217223828.1166093-3-imre.deak@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit 2ed653c7b843db0670136330480842d76cb65cd8) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_ddi.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c index 2b9240ab547d8..18c66992aa1d8 100644 --- a/drivers/gpu/drm/i915/display/intel_ddi.c +++ b/drivers/gpu/drm/i915/display/intel_ddi.c @@ -658,7 +658,6 @@ void intel_ddi_disable_transcoder_func(const struct intel_crtc_state *crtc_state struct intel_crtc *crtc = to_intel_crtc(crtc_state->uapi.crtc); struct drm_i915_private *dev_priv = to_i915(crtc->base.dev); enum transcoder cpu_transcoder = crtc_state->cpu_transcoder; - bool is_mst = intel_crtc_has_type(crtc_state, INTEL_OUTPUT_DP_MST); u32 ctl; if (DISPLAY_VER(dev_priv) >= 11) @@ -678,8 +677,7 @@ void intel_ddi_disable_transcoder_func(const struct intel_crtc_state *crtc_state TRANS_DDI_PORT_SYNC_MASTER_SELECT_MASK); if (DISPLAY_VER(dev_priv) >= 12) { - if (!intel_dp_mst_is_master_trans(crtc_state) || - (!is_mst && intel_dp_is_uhbr(crtc_state))) { + if (!intel_dp_mst_is_master_trans(crtc_state)) { ctl &= ~(TGL_TRANS_DDI_PORT_MASK | TRANS_DDI_MODE_SELECT_MASK); } @@ -3134,7 +3132,7 @@ static void intel_ddi_post_disable_dp(struct intel_atomic_state *state, intel_dp_set_power(intel_dp, DP_SET_POWER_D3); if (DISPLAY_VER(dev_priv) >= 12) { - if (is_mst) { + if (is_mst || intel_dp_is_uhbr(old_crtc_state)) { enum transcoder cpu_transcoder = old_crtc_state->cpu_transcoder; intel_de_rmw(dev_priv, -- GitLab From 67b0025d19f99fb9fbb8b62e6975553c183f3a16 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 19 Feb 2025 01:33:37 +0000 Subject: [PATCH 768/989] io_uring/rw: forbid multishot async reads At the moment we can't sanely handle queuing an async request from a multishot context, so disable them. It shouldn't matter as pollable files / sockets don't normally do async. Patching it in __io_read() is not the cleanest way, but it's simpler than other options, so let's fix it there and clean up on top.
Cc: stable@vger.kernel.org Reported-by: chase xd Fixes: fc68fcda04910 ("io_uring/rw: add support for IORING_OP_READ_MULTISHOT") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7d51732c125159d17db4fe16f51ec41b936973f8.1739919038.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 7aa1e4c9f64a3..e8efd97fdee5b 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -880,7 +880,15 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(ret)) return ret; - ret = io_iter_do_read(rw, &io->iter); + if (unlikely(req->opcode == IORING_OP_READ_MULTISHOT)) { + void *cb_copy = rw->kiocb.ki_complete; + + rw->kiocb.ki_complete = NULL; + ret = io_iter_do_read(rw, &io->iter); + rw->kiocb.ki_complete = cb_copy; + } else { + ret = io_iter_do_read(rw, &io->iter); + } /* * Some file systems like to return -EOPNOTSUPP for an IOCB_NOWAIT @@ -904,7 +912,8 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) } else if (ret == -EIOCBQUEUED) { return IOU_ISSUE_SKIP_COMPLETE; } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || - (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) { + (req->flags & REQ_F_NOWAIT) || !need_complete_io(req) || + (issue_flags & IO_URING_F_MULTISHOT)) { /* read all, failed, already did sync or don't want to retry */ goto done; } -- GitLab From 4e43133c6f2319d3e205ea986c507b25d9b41e64 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 19 Feb 2025 01:33:38 +0000 Subject: [PATCH 769/989] io_uring/rw: don't directly use ki_complete We want to avoid checking ->ki_complete directly in the io_uring completion path. Fortunately we have only two callbacks, the selection of which depends on the ring constant flags, i.e. IOPOLL, so use that to infer the function.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4eb4bdab8cbcf5bc87083f7047edc81e920ab83c.1739919038.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index e8efd97fdee5b..27ccc82d78436 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -563,8 +563,10 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) smp_store_release(&req->iopoll_completed, 1); } -static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) +static inline void io_rw_done(struct io_kiocb *req, ssize_t ret) { + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + /* IO was queued async, completion will happen later */ if (ret == -EIOCBQUEUED) return; @@ -586,8 +588,10 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) } } - INDIRECT_CALL_2(kiocb->ki_complete, io_complete_rw_iopoll, - io_complete_rw, kiocb, ret); + if (req->ctx->flags & IORING_SETUP_IOPOLL) + io_complete_rw_iopoll(&rw->kiocb, ret); + else + io_complete_rw(&rw->kiocb, ret); } static int kiocb_done(struct io_kiocb *req, ssize_t ret, @@ -598,7 +602,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, if (ret >= 0 && req->flags & REQ_F_CUR_POS) req->file->f_pos = rw->kiocb.ki_pos; - if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) { + if (ret >= 0 && !(req->ctx->flags & IORING_SETUP_IOPOLL)) { __io_complete_rw_common(req, ret); /* * Safe to call io_end from here as we're inline @@ -609,7 +613,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, io_req_rw_cleanup(req, issue_flags); return IOU_OK; } else { - io_rw_done(&rw->kiocb, ret); + io_rw_done(req, ret); } return IOU_ISSUE_SKIP_COMPLETE; -- GitLab From 74f3e875268f1ce2dd01029c29560263212077df Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 19 Feb 2025 01:33:39 +0000 Subject: [PATCH 770/989] io_uring/rw: move ki_complete init into prep Initialise ki_complete during request prep stage, we'll depend on it not being reset during issue in the following patch. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/817624086bd5f0448b08c80623399919fda82f34.1739919038.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 27ccc82d78436..d162565053892 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -23,6 +23,9 @@ #include "poll.h" #include "rw.h" +static void io_complete_rw(struct kiocb *kiocb, long res); +static void io_complete_rw_iopoll(struct kiocb *kiocb, long res); + struct io_rw { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct kiocb kiocb; @@ -289,6 +292,11 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, rw->kiocb.dio_complete = NULL; rw->kiocb.ki_flags = 0; + if (req->ctx->flags & IORING_SETUP_IOPOLL) + rw->kiocb.ki_complete = io_complete_rw_iopoll; + else + rw->kiocb.ki_complete = io_complete_rw; + rw->addr = READ_ONCE(sqe->addr); rw->len = READ_ONCE(sqe->len); rw->flags = READ_ONCE(sqe->rw_flags); @@ -817,10 +825,8 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) if (ctx->flags & IORING_SETUP_IOPOLL) { if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) return -EOPNOTSUPP; - kiocb->private = NULL; kiocb->ki_flags |= IOCB_HIPRI; - kiocb->ki_complete = io_complete_rw_iopoll; req->iopoll_completed = 0; if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) { /* make sure every req only blocks once*/ @@ -830,7 +836,6 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) } else { if (kiocb->ki_flags & IOCB_HIPRI) return -EINVAL; - kiocb->ki_complete = io_complete_rw; } if (req->flags & REQ_F_HAS_METADATA) { -- GitLab From 4614de748e78a295ee9b1f54ca87280b101fbdf0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 19 Feb 2025 01:33:40 +0000 Subject: [PATCH 771/989] io_uring/rw: clean up mshot forced sync mode Move code forcing synchronous execution of multishot read requests out a more generic __io_read(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4ad7b928c776d1ad59addb9fff64ef2d1fc474d5.1739919038.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index d162565053892..9edc6baebd01c 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -889,15 +889,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(ret)) return ret; - if (unlikely(req->opcode == IORING_OP_READ_MULTISHOT)) { - void *cb_copy = rw->kiocb.ki_complete; - - rw->kiocb.ki_complete = NULL; - ret = io_iter_do_read(rw, &io->iter); - rw->kiocb.ki_complete = cb_copy; - } else { - ret = io_iter_do_read(rw, &io->iter); - } + ret = io_iter_do_read(rw, &io->iter); /* * Some file systems like to return -EOPNOTSUPP for an IOCB_NOWAIT @@ -995,6 +987,8 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) if (!io_file_can_poll(req)) return -EBADFD; + /* make it sync, multishot doesn't support async execution */ + rw->kiocb.ki_complete = NULL; ret = __io_read(req, issue_flags); /* -- GitLab From fcf857ee1958e9247298251f7615d0c76f1e9b38 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 1 Feb 2025 14:59:02 -0500 Subject: [PATCH 772/989] NFS: O_DIRECT writes must check and adjust the file length While it is uncommon for delegations to be held while O_DIRECT writes are in progress, it is possible. 
The xfstests generic/647 and generic/729 both end up triggering that state, and end up failing due to the fact that the file size is not adjusted. Reported-by: Chuck Lever Link: https://bugzilla.kernel.org/show_bug.cgi?id=219738 Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/direct.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index f45beea92d034..40e13c9a2873f 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -130,6 +130,20 @@ static void nfs_direct_truncate_request(struct nfs_direct_req *dreq, dreq->count = req_start; } +static void nfs_direct_file_adjust_size_locked(struct inode *inode, + loff_t offset, size_t count) +{ + loff_t newsize = offset + (loff_t)count; + loff_t oldsize = i_size_read(inode); + + if (newsize > oldsize) { + i_size_write(inode, newsize); + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE; + trace_nfs_size_grow(inode, newsize); + nfs_inc_stats(inode, NFSIOS_EXTENDWRITE); + } +} + /** * nfs_swap_rw - NFS address space operation for swap I/O * @iocb: target I/O control block @@ -741,6 +755,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) struct nfs_direct_req *dreq = hdr->dreq; struct nfs_commit_info cinfo; struct nfs_page *req = nfs_list_entry(hdr->pages.next); + struct inode *inode = dreq->inode; int flags = NFS_ODIRECT_DONE; trace_nfs_direct_write_completion(dreq); @@ -762,6 +777,10 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) } spin_unlock(&dreq->lock); + spin_lock(&inode->i_lock); + nfs_direct_file_adjust_size_locked(inode, dreq->io_start, dreq->count); + spin_unlock(&inode->i_lock); + while (!list_empty(&hdr->pages)) { req = nfs_list_entry(hdr->pages.next); -- GitLab From 88025c67fe3c025a0123bc7af50535b97f7af89a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 1 Feb 2025 14:59:03 -0500 Subject: [PATCH 773/989] NFS: Adjust delegated timestamps for O_DIRECT reads and writes Adjust the timestamps if O_DIRECT is being combined with attribute delegations. 
Fixes: e12912d94137 ("NFSv4: Add support for delegated atime and mtime attributes") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/direct.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 40e13c9a2873f..f32f8d7c9122b 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -56,6 +56,7 @@ #include #include +#include "delegation.h" #include "internal.h" #include "iostat.h" #include "pnfs.h" @@ -286,6 +287,8 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) nfs_direct_count_bytes(dreq, hdr); spin_unlock(&dreq->lock); + nfs_update_delegated_atime(dreq->inode); + while (!list_empty(&hdr->pages)) { struct nfs_page *req = nfs_list_entry(hdr->pages.next); struct page *page = req->wb_page; @@ -779,6 +782,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) spin_lock(&inode->i_lock); nfs_direct_file_adjust_size_locked(inode, dreq->io_start, dreq->count); + nfs_update_delegated_mtime_locked(dreq->inode); spin_unlock(&inode->i_lock); while (!list_empty(&hdr->pages)) { -- GitLab From 5bbd6e863b15a85221e49b9bdb2d5d8f0bb91f3d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 1 Feb 2025 15:00:02 -0500 Subject: [PATCH 774/989] SUNRPC: Prevent looping due to rpc_signal_task() races If rpc_signal_task() is called while a task is in an rpc_call_done() callback function, and the latter calls rpc_restart_call(), the task can end up looping due to the RPC_TASK_SIGNALLED flag being set without the tk_rpc_status being set. Removing the redundant mechanism for signalling the task fixes the looping behaviour. Reported-by: Li Lingfeng Fixes: 39494194f93b ("SUNRPC: Fix races with rpc_killall_tasks()") Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/sunrpc/sched.h | 3 +-- include/trace/events/sunrpc.h | 3 +-- net/sunrpc/sched.c | 2 -- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index fec1e8a1570c3..eac57914dcf32 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -158,7 +158,6 @@ enum { RPC_TASK_NEED_XMIT, RPC_TASK_NEED_RECV, RPC_TASK_MSG_PIN_WAIT, - RPC_TASK_SIGNALLED, }; #define rpc_test_and_set_running(t) \ @@ -171,7 +170,7 @@ enum { #define RPC_IS_ACTIVATED(t) test_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate) -#define RPC_SIGNALLED(t) test_bit(RPC_TASK_SIGNALLED, &(t)->tk_runstate) +#define RPC_SIGNALLED(t) (READ_ONCE(task->tk_rpc_status) == -ERESTARTSYS) /* * Task priorities. 
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index b13dc275ef4a7..851841336ee65 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -360,8 +360,7 @@ TRACE_EVENT(rpc_request, { (1UL << RPC_TASK_ACTIVE), "ACTIVE" }, \ { (1UL << RPC_TASK_NEED_XMIT), "NEED_XMIT" }, \ { (1UL << RPC_TASK_NEED_RECV), "NEED_RECV" }, \ - { (1UL << RPC_TASK_MSG_PIN_WAIT), "MSG_PIN_WAIT" }, \ - { (1UL << RPC_TASK_SIGNALLED), "SIGNALLED" }) + { (1UL << RPC_TASK_MSG_PIN_WAIT), "MSG_PIN_WAIT" }) DECLARE_EVENT_CLASS(rpc_task_running, diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index cef623ea15060..9b45fbdc90cab 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -864,8 +864,6 @@ void rpc_signal_task(struct rpc_task *task) if (!rpc_task_set_rpc_status(task, -ERESTARTSYS)) return; trace_rpc_task_signalled(task, task->tk_action); - set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate); - smp_mb__after_atomic(); queue = READ_ONCE(task->tk_waitqueue); if (queue) rpc_wake_up_queued_task(queue, task); -- GitLab From 8f8df955f078e1a023ee55161935000a67651f38 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 1 Feb 2025 15:00:09 -0500 Subject: [PATCH 775/989] NFSv4: Fix a deadlock when recovering state on a sillyrenamed file If the file is sillyrenamed, and slated for delete on close, it is possible for a server reboot to trigger an open reclaim, which can again race with the application call to close(). When that happens, the call to put_nfs_open_context() can trigger a synchronous delegreturn call which deadlocks because it is not marked as privileged. Instead, ensure that the call to nfs4_inode_return_delegation_on_close() catches the delegreturn, and schedules it asynchronously. Reported-by: Li Lingfeng Fixes: adb4b42d19ae ("Return the delegation when deleting sillyrenamed files") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/delegation.c | 37 +++++++++++++++++++++++++++++++++++++ fs/nfs/delegation.h | 1 + fs/nfs/nfs4proc.c | 3 +++ 3 files changed, 41 insertions(+) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 035ba52742a50..4db912f562305 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -780,6 +780,43 @@ int nfs4_inode_return_delegation(struct inode *inode) return 0; } +/** + * nfs4_inode_set_return_delegation_on_close - asynchronously return a delegation + * @inode: inode to process + * + * This routine is called to request that the delegation be returned as soon + * as the file is closed. If the file is already closed, the delegation is + * immediately returned.
+ */ +void nfs4_inode_set_return_delegation_on_close(struct inode *inode) +{ + struct nfs_delegation *delegation; + struct nfs_delegation *ret = NULL; + + if (!inode) + return; + rcu_read_lock(); + delegation = nfs4_get_valid_delegation(inode); + if (!delegation) + goto out; + spin_lock(&delegation->lock); + if (!delegation->inode) + goto out_unlock; + if (list_empty(&NFS_I(inode)->open_files) && + !test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { + /* Refcount matched in nfs_end_delegation_return() */ + ret = nfs_get_delegation(delegation); + } else + set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); +out_unlock: + spin_unlock(&delegation->lock); + if (ret) + nfs_clear_verifier_delegated(inode); +out: + rcu_read_unlock(); + nfs_end_delegation_return(inode, ret, 0); +} + /** * nfs4_inode_return_delegation_on_close - asynchronously return a delegation * @inode: inode to process diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 71524d34ed207..8ff5ab9c5c256 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -49,6 +49,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred, unsigned long pagemod_limit, u32 deleg_type); int nfs4_inode_return_delegation(struct inode *inode); void nfs4_inode_return_delegation_on_close(struct inode *inode); +void nfs4_inode_set_return_delegation_on_close(struct inode *inode); int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); void nfs_inode_evict_delegation(struct inode *inode); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index df9669d4ded7f..c25ecdb76d304 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3906,8 +3906,11 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) { + struct dentry *dentry = ctx->dentry; if (ctx->state == NULL) return; + if (dentry->d_flags & DCACHE_NFSFS_RENAMED) + nfs4_inode_set_return_delegation_on_close(d_inode(dentry)); if (is_sync) nfs4_close_sync(ctx->state, _nfs4_ctx_to_openmode(ctx)); else -- GitLab From 7a2f6f7687c5f7083a35317cddec5ad9fa491443 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Tue, 11 Feb 2025 12:31:57 -0500 Subject: [PATCH 776/989] SUNRPC: Handle -ETIMEDOUT return from tlshd If the TLS handshake attempt returns -ETIMEDOUT, we currently translate that error into -EACCES. This becomes problematic for cases where the RPC layer is attempting to re-connect in paths that don't resonably handle -EACCES, for example: writeback. The RPC layer can handle -ETIMEDOUT quite well, however - so if the handshake returns this error let's just pass it along. Fixes: 75eb6af7acdf ("SUNRPC: Add a TCP-with-TLS RPC transport class") Signed-off-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- net/sunrpc/xprtsock.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c60936d8cef71..6b80b2aaf7639 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2581,7 +2581,15 @@ static void xs_tls_handshake_done(void *data, int status, key_serial_t peerid) struct sock_xprt *lower_transport = container_of(lower_xprt, struct sock_xprt, xprt); - lower_transport->xprt_err = status ? 
-EACCES : 0; + switch (status) { + case 0: + case -EACCES: + case -ETIMEDOUT: + lower_transport->xprt_err = status; + break; + default: + lower_transport->xprt_err = -EACCES; + } complete(&lower_transport->handshake_done); xprt_put(lower_xprt); } -- GitLab From 102c51c50db88aedd00a318b7708ad60dbec2e95 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 14 Feb 2025 13:37:24 +0000 Subject: [PATCH 777/989] KVM: arm64: Fix tcr_el2 initialisation in hVHE mode When not running in VHE mode, cpu_prepare_hyp_mode() computes the value of TCR_EL2 using the host's TCR_EL1 settings as a starting point. For nVHE, this amounts to masking out everything apart from the TG0, SH0, ORGN0, IRGN0 and T0SZ fields before setting the RES1 bits, shifting the IPS field down to the PS field and setting DS if LPA2 is enabled. Unfortunately, for hVHE, things go slightly wonky: EPD1 is correctly set to disable walks via TTBR1_EL2 but then the T1SZ and IPS fields are corrupted when we mistakenly attempt to initialise the PS and DS fields in their E2H=0 positions. Furthermore, many fields are retained from TCR_EL1 which should not be propagated to TCR_EL2. Notably, this means we can end up with A1 set despite not initialising TTBR1_EL2 at all. This has been shown to cause unexpected translation faults at EL2 with pKVM due to TLB invalidation not taking effect when running with a non-zero ASID. Fix the TCR_EL2 initialisation code to set PS and DS only when E2H=0, masking out HD, HA and A1 when E2H=1. Cc: Marc Zyngier Cc: Oliver Upton Fixes: ad744e8cb346 ("arm64: Allow arm64_sw.hvhe on command line") Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20250214133724.13179-1-will@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_arm.h | 2 +- arch/arm64/kvm/arm.c | 15 +++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 8d94a6c0ed5c4..c2417a424b98d 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -119,7 +119,7 @@ #define TCR_EL2_IRGN0_MASK TCR_IRGN0_MASK #define TCR_EL2_T0SZ_MASK 0x3f #define TCR_EL2_MASK (TCR_EL2_TG0_MASK | TCR_EL2_SH0_MASK | \ - TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK | TCR_EL2_T0SZ_MASK) + TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK) /* VTCR_EL2 Registers bits */ #define VTCR_EL2_DS TCR_EL2_DS diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index b8e55a441282f..bc7a37cea2420 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1980,7 +1980,7 @@ static int kvm_init_vector_slots(void) static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) { struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); - unsigned long tcr, ips; + unsigned long tcr; /* * Calculate the raw per-cpu offset without a translation from the @@ -1994,19 +1994,18 @@ static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) params->mair_el2 = read_sysreg(mair_el1); tcr = read_sysreg(tcr_el1); - ips = FIELD_GET(TCR_IPS_MASK, tcr); if (cpus_have_final_cap(ARM64_KVM_HVHE)) { + tcr &= ~(TCR_HD | TCR_HA | TCR_A1 | TCR_T0SZ_MASK); tcr |= TCR_EPD1_MASK; } else { + unsigned long ips = FIELD_GET(TCR_IPS_MASK, tcr); + tcr &= TCR_EL2_MASK; - tcr |= TCR_EL2_RES1; + tcr |= TCR_EL2_RES1 | FIELD_PREP(TCR_EL2_PS_MASK, ips); + if (lpa2_is_enabled()) + tcr |= TCR_EL2_DS; } - tcr &= ~TCR_T0SZ_MASK; tcr |= TCR_T0SZ(hyp_va_bits); - tcr &= ~TCR_EL2_PS_MASK; - tcr |= FIELD_PREP(TCR_EL2_PS_MASK, ips); - if (lpa2_is_enabled()) - tcr |= 
TCR_EL2_DS; params->tcr_el2 = tcr; params->pgd_pa = kvm_mmu_get_httbr(); -- GitLab From 4fd509c10f9687f54752fbcaf83f520c93fc1f18 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 19 Feb 2025 13:45:02 -0500 Subject: [PATCH 778/989] bcachefs: Fix bch2_indirect_extent_missing_error() We had some error handling confusion here; -BCH_ERR_missing_indirect_extent is thrown by trans_trigger_reflink_p_segment(); at this point we haven't decide whether we're generating an error. Signed-off-by: Kent Overstreet --- fs/bcachefs/reflink.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 376fd0a6e868c..441e648f28b51 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -172,7 +172,7 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans, bool should_commit) { if (REFLINK_P_ERROR(p.v)) - return -BCH_ERR_missing_indirect_extent; + return 0; struct bch_fs *c = trans->c; u64 live_start = REFLINK_P_IDX(p.v); @@ -259,8 +259,6 @@ struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans, return k; if (unlikely(!bkey_extent_is_reflink_data(k.k))) { - bch2_trans_iter_exit(trans, iter); - unsigned size = min((u64) k.k->size, REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) - reflink_offset); @@ -268,14 +266,16 @@ struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans, int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset, k.k->p.offset, should_commit); - if (ret) + if (ret) { + bch2_trans_iter_exit(trans, iter); return bkey_s_c_err(ret); + } } else if (unlikely(REFLINK_P_ERROR(p.v))) { - bch2_trans_iter_exit(trans, iter); - int ret = bch2_indirect_extent_not_missing(trans, p, should_commit); - if (ret) + if (ret) { + bch2_trans_iter_exit(trans, iter); return bkey_s_c_err(ret); + } } *offset_into_extent = reflink_offset - bkey_start_offset(k.k); @@ -300,7 +300,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, if (ret) return ret; - if (bkey_deleted(k.k)) { + if (!bkey_refcount_c(k)) { if (!(flags & BTREE_TRIGGER_overwrite)) ret = -BCH_ERR_missing_indirect_extent; goto next; @@ -381,8 +381,6 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, not_found: if (flags & BTREE_TRIGGER_check_repair) { ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false); - if (ret == -BCH_ERR_missing_indirect_extent) - ret = 0; if (ret) goto err; } -- GitLab From b04974f759ac7574d8556deb7c602a8d01a0dcc6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 19 Feb 2025 15:40:03 -0500 Subject: [PATCH 779/989] bcachefs: Fix srcu lock warning in btree_update_nodes_written() We don't want to be holding the srcu lock while waiting on btree write completions - easily fixed. Reported-by: Janpieter Sollie Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index f4aeadbe53c1a..e4e7c804625e0 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -681,9 +681,11 @@ static void btree_update_nodes_written(struct btree_update *as) b = as->old_nodes[i]; + bch2_trans_begin(trans); btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); seq = b->data ? 
b->data->keys.seq : 0; six_unlock_read(&b->c.lock); + bch2_trans_unlock_long(trans); if (seq == as->old_nodes_seq[i]) wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, -- GitLab From 4ccacf86491d33d2486b62d4d44864d7101b299d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Feb 2025 12:37:04 -0800 Subject: [PATCH 780/989] gtp: Suppress list corruption splat in gtp_net_exit_batch_rtnl(). Brad Spengler reported the list_del() corruption splat in gtp_net_exit_batch_rtnl(). [0] Commit eb28fd76c0a0 ("gtp: Destroy device along with udp socket's netns dismantle.") added the for_each_netdev() loop in gtp_net_exit_batch_rtnl() to destroy devices in each netns as done in geneve and ip tunnels. However, this could trigger ->dellink() twice for the same device during ->exit_batch_rtnl(). Say we have two netns A & B and gtp device B that resides in netns B but whose UDP socket is in netns A. 1. cleanup_net() processes netns A and then B. 2. gtp_net_exit_batch_rtnl() finds the device B while iterating netns A's gn->gtp_dev_list and calls ->dellink(). [ device B is not yet unlinked from netns B as unregister_netdevice_many() has not been called. ] 3. gtp_net_exit_batch_rtnl() finds the device B while iterating netns B's for_each_netdev() and calls ->dellink(). gtp_dellink() cleans up the device's hash table, unlinks the dev from gn->gtp_dev_list, and calls unregister_netdevice_queue(). Basically, calling gtp_dellink() multiple times is fine unless CONFIG_DEBUG_LIST is enabled. Let's remove for_each_netdev() in gtp_net_exit_batch_rtnl() and delegate the destruction to default_device_exit_batch() as done in bareudp. [0]: list_del corruption, ffff8880aaa62c00->next (autoslab_size_M_dev_P_net_core_dev_11127_8_1328_8_S_4096_A_64_n_139+0xc00/0x1000 [slab object]) is LIST_POISON1 (ffffffffffffff02) (prev is 0xffffffffffffff04) kernel BUG at lib/list_debug.c:58! 
Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 1 UID: 0 PID: 1804 Comm: kworker/u8:7 Tainted: G T 6.12.13-grsec-full-20250211091339 #1 Tainted: [T]=RANDSTRUCT Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Workqueue: netns cleanup_net RIP: 0010:[] __list_del_entry_valid_or_report+0x141/0x200 lib/list_debug.c:58 Code: c2 76 91 31 c0 e8 9f b1 f7 fc 0f 0b 4d 89 f0 48 c7 c1 02 ff ff ff 48 89 ea 48 89 ee 48 c7 c7 e0 c2 76 91 31 c0 e8 7f b1 f7 fc <0f> 0b 4d 89 e8 48 c7 c1 04 ff ff ff 48 89 ea 48 89 ee 48 c7 c7 60 RSP: 0018:fffffe8040b4fbd0 EFLAGS: 00010283 RAX: 00000000000000cc RBX: dffffc0000000000 RCX: ffffffff818c4054 RDX: ffffffff84947381 RSI: ffffffff818d1512 RDI: 0000000000000000 RBP: ffff8880aaa62c00 R08: 0000000000000001 R09: fffffbd008169f32 R10: fffffe8040b4f997 R11: 0000000000000001 R12: a1988d84f24943e4 R13: ffffffffffffff02 R14: ffffffffffffff04 R15: ffff8880aaa62c08 RBX: kasan shadow of 0x0 RCX: __wake_up_klogd.part.0+0x74/0xe0 kernel/printk/printk.c:4554 RDX: __list_del_entry_valid_or_report+0x141/0x200 lib/list_debug.c:58 RSI: vprintk+0x72/0x100 kernel/printk/printk_safe.c:71 RBP: autoslab_size_M_dev_P_net_core_dev_11127_8_1328_8_S_4096_A_64_n_139+0xc00/0x1000 [slab object] RSP: process kstack fffffe8040b4fbd0+0x7bd0/0x8000 [kworker/u8:7+netns 1804 ] R09: kasan shadow of process kstack fffffe8040b4f990+0x7990/0x8000 [kworker/u8:7+netns 1804 ] R10: process kstack fffffe8040b4f997+0x7997/0x8000 [kworker/u8:7+netns 1804 ] R15: autoslab_size_M_dev_P_net_core_dev_11127_8_1328_8_S_4096_A_64_n_139+0xc08/0x1000 [slab object] FS: 0000000000000000(0000) GS:ffff888116000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000748f5372c000 CR3: 0000000015408000 CR4: 00000000003406f0 shadow CR4: 00000000003406f0 Stack: 0000000000000000 ffffffff8a0c35e7 ffffffff8a0c3603 ffff8880aaa62c00 ffff8880aaa62c00 0000000000000004 ffff88811145311c 0000000000000005 0000000000000001 ffff8880aaa62000 fffffe8040b4fd40 ffffffff8a0c360d Call Trace: [] __list_del_entry_valid include/linux/list.h:131 [inline] fffffe8040b4fc28 [] __list_del_entry include/linux/list.h:248 [inline] fffffe8040b4fc28 [] list_del include/linux/list.h:262 [inline] fffffe8040b4fc28 [] gtp_dellink+0x16d/0x360 drivers/net/gtp.c:1557 fffffe8040b4fc28 [] gtp_net_exit_batch_rtnl+0x124/0x2c0 drivers/net/gtp.c:2495 fffffe8040b4fc88 [] cleanup_net+0x5a4/0xbe0 net/core/net_namespace.c:635 fffffe8040b4fcd0 [] process_one_work+0xbd7/0x2160 kernel/workqueue.c:3326 fffffe8040b4fd88 [] process_scheduled_works kernel/workqueue.c:3407 [inline] fffffe8040b4fec0 [] worker_thread+0x6b5/0xfa0 kernel/workqueue.c:3488 fffffe8040b4fec0 [] kthread+0x360/0x4c0 kernel/kthread.c:397 fffffe8040b4ff78 [] ret_from_fork+0x74/0xe0 arch/x86/kernel/process.c:172 fffffe8040b4ffb8 [] ret_from_fork_asm+0x29/0xc0 arch/x86/entry/entry_64.S:399 fffffe8040b4ffe8 Modules linked in: Fixes: eb28fd76c0a0 ("gtp: Destroy device along with udp socket's netns dismantle.") Reported-by: Brad Spengler Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250217203705.40342-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/gtp.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index d64740bf44ed3..b7b46c5e6399a 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -2481,11 +2481,6 @@ static void __net_exit gtp_net_exit_batch_rtnl(struct list_head *net_list, list_for_each_entry(net, net_list, exit_list) { struct gtp_net *gn = net_generic(net, 
gtp_net_id); struct gtp_dev *gtp, *gtp_next; - struct net_device *dev; - - for_each_netdev(net, dev) - if (dev->rtnl_link_ops == >p_link_ops) - gtp_dellink(dev, dev_to_kill); list_for_each_entry_safe(gtp, gtp_next, &gn->gtp_dev_list, list) gtp_dellink(gtp->dev, dev_to_kill); -- GitLab From 62fab6eef61f245dc8797e3a6a5b890ef40e8628 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Feb 2025 12:37:05 -0800 Subject: [PATCH 781/989] geneve: Suppress list corruption splat in geneve_destroy_tunnels(). As explained in the previous patch, iterating for_each_netdev() and gn->geneve_list during ->exit_batch_rtnl() could trigger ->dellink() twice for the same device. If CONFIG_DEBUG_LIST is enabled, we will see a list_del() corruption splat in the 2nd call of geneve_dellink(). Let's remove for_each_netdev() in geneve_destroy_tunnels() and delegate that part to default_device_exit_batch(). Fixes: 9593172d93b9 ("geneve: Fix use-after-free in geneve_find_dev().") Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250217203705.40342-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/geneve.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index a1f674539965d..dbb3960126ee7 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1902,14 +1902,7 @@ static void geneve_destroy_tunnels(struct net *net, struct list_head *head) { struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_dev *geneve, *next; - struct net_device *dev, *aux; - /* gather any geneve devices that were moved into this ns */ - for_each_netdev_safe(net, dev, aux) - if (dev->rtnl_link_ops == &geneve_link_ops) - geneve_dellink(dev, head); - - /* now gather any other geneve devices that were created in this ns */ list_for_each_entry_safe(geneve, next, &gn->geneve_list, next) geneve_dellink(geneve->dev, head); } -- GitLab From 3e5796862c692ea608d96f0a1437f9290f44953a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 17 Feb 2025 20:32:07 -0800 Subject: [PATCH 782/989] flow_dissector: Fix handling of mixed port and port-range keys This patch fixes a bug in TC flower filter where rules combining a specific destination port with a source port range weren't working correctly. The specific case was when users tried to configure rules like: tc filter add dev ens38 ingress protocol ip flower ip_proto udp \ dst_port 5000 src_port 2000-3000 action drop The root cause was in the flow dissector code. While both FLOW_DISSECTOR_KEY_PORTS and FLOW_DISSECTOR_KEY_PORTS_RANGE flags were being set correctly in the classifier, the __skb_flow_dissect_ports() function was only populating one of them: whichever came first in the enum check. This meant that when the code needed both a specific port and a port range, one of them would be left as 0, causing the filter to not match packets as expected. Fix it by removing the either/or logic and instead checking and populating both key types independently when they're in use. 
Fixes: 8ffb055beae5 ("cls_flower: Fix the behavior using port ranges with hw-offload") Reported-by: Qiang Zhang Closes: https://lore.kernel.org/netdev/CAPx+-5uvFxkhkz4=j_Xuwkezjn9U6kzKTD5jz4tZ9msSJ0fOJA@mail.gmail.com/ Cc: Yoshiki Komachi Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250218043210.732959-2-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- net/core/flow_dissector.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 5db41bf2ed93e..c33af3ef0b790 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -853,23 +853,30 @@ __skb_flow_dissect_ports(const struct sk_buff *skb, void *target_container, const void *data, int nhoff, u8 ip_proto, int hlen) { - enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX; - struct flow_dissector_key_ports *key_ports; + struct flow_dissector_key_ports_range *key_ports_range = NULL; + struct flow_dissector_key_ports *key_ports = NULL; + __be32 ports; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) - dissector_ports = FLOW_DISSECTOR_KEY_PORTS; - else if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS_RANGE)) - dissector_ports = FLOW_DISSECTOR_KEY_PORTS_RANGE; + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); - if (dissector_ports == FLOW_DISSECTOR_KEY_MAX) + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE)) + key_ports_range = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE, + target_container); + + if (!key_ports && !key_ports_range) return; - key_ports = skb_flow_dissector_target(flow_dissector, - dissector_ports, - target_container); - key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, - data, hlen); + ports = __skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen); + + if (key_ports) + key_ports->ports = ports; + + if (key_ports_range) + key_ports_range->tp.ports = ports; } static void -- GitLab From dfc1580f960bf70bdaacda8f3d644e3e58160f9d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 17 Feb 2025 20:32:08 -0800 Subject: [PATCH 783/989] selftests/net/forwarding: Add a test case for tc-flower of mixed port and port-range After this patch: # ./tc_flower_port_range.sh TEST: Port range matching - IPv4 UDP [ OK ] TEST: Port range matching - IPv4 TCP [ OK ] TEST: Port range matching - IPv6 UDP [ OK ] TEST: Port range matching - IPv6 TCP [ OK ] TEST: Port range matching - IPv4 UDP Drop [ OK ] Cc: Qiang Zhang Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Link: https://patch.msgid.link/20250218043210.732959-3-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- .../net/forwarding/tc_flower_port_range.sh | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/tc_flower_port_range.sh b/tools/testing/selftests/net/forwarding/tc_flower_port_range.sh index 3885a2a91f7d8..baed5e380dae4 100755 --- a/tools/testing/selftests/net/forwarding/tc_flower_port_range.sh +++ b/tools/testing/selftests/net/forwarding/tc_flower_port_range.sh @@ -20,6 +20,7 @@ ALL_TESTS=" test_port_range_ipv4_tcp test_port_range_ipv6_udp test_port_range_ipv6_tcp + test_port_range_ipv4_udp_drop " NUM_NETIFS=4 @@ -194,6 +195,51 @@ test_port_range_ipv6_tcp() __test_port_range $proto $ip_proto $sip $dip 
$mode "$name" } +test_port_range_ipv4_udp_drop() +{ + local proto=ipv4 + local ip_proto=udp + local sip=192.0.2.1 + local dip=192.0.2.2 + local mode="-4" + local name="IPv4 UDP Drop" + local dmac=$(mac_get $h2) + local smac=$(mac_get $h1) + local sport_min=2000 + local sport_max=3000 + local sport_mid=$((sport_min + (sport_max - sport_min) / 2)) + local dport=5000 + + RET=0 + + tc filter add dev $swp1 ingress protocol $proto handle 101 pref 1 \ + flower src_ip $sip dst_ip $dip ip_proto $ip_proto \ + src_port $sport_min-$sport_max \ + dst_port $dport \ + action drop + + # Test ports outside range - should pass + $MZ $mode $h1 -c 1 -q -p 100 -a $smac -b $dmac -A $sip -B $dip \ + -t $ip_proto "sp=$((sport_min - 1)),dp=$dport" + $MZ $mode $h1 -c 1 -q -p 100 -a $smac -b $dmac -A $sip -B $dip \ + -t $ip_proto "sp=$((sport_max + 1)),dp=$dport" + + # Test ports inside range - should be dropped + $MZ $mode $h1 -c 1 -q -p 100 -a $smac -b $dmac -A $sip -B $dip \ + -t $ip_proto "sp=$sport_min,dp=$dport" + $MZ $mode $h1 -c 1 -q -p 100 -a $smac -b $dmac -A $sip -B $dip \ + -t $ip_proto "sp=$sport_mid,dp=$dport" + $MZ $mode $h1 -c 1 -q -p 100 -a $smac -b $dmac -A $sip -B $dip \ + -t $ip_proto "sp=$sport_max,dp=$dport" + + tc_check_packets "dev $swp1 ingress" 101 3 + check_err $? "Filter did not drop the expected number of packets" + + tc filter del dev $swp1 ingress protocol $proto pref 1 handle 101 flower + + log_test "Port range matching - $name" +} + setup_prepare() { h1=${NETIFS[p1]} -- GitLab From 69ab34f705fbfabcace64b5d53bb7a4450fac875 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 17 Feb 2025 20:32:09 -0800 Subject: [PATCH 784/989] flow_dissector: Fix port range key handling in BPF conversion Fix how port range keys are handled in __skb_flow_bpf_to_target() by: - Separating PORTS and PORTS_RANGE key handling - Using correct key_ports_range structure for range keys - Properly initializing both key types independently This ensures port range information is correctly stored in its dedicated structure rather than incorrectly using the regular ports key structure. 
Fixes: 59fb9b62fb6c ("flow_dissector: Fix to use new variables for port ranges in bpf hook") Reported-by: Qiang Zhang Closes: https://lore.kernel.org/netdev/CAPx+-5uvFxkhkz4=j_Xuwkezjn9U6kzKTD5jz4tZ9msSJ0fOJA@mail.gmail.com/ Cc: Yoshiki Komachi Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Link: https://patch.msgid.link/20250218043210.732959-4-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- net/core/flow_dissector.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index c33af3ef0b790..9cd8de6bebb54 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -931,6 +931,7 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, struct flow_dissector *flow_dissector, void *target_container) { + struct flow_dissector_key_ports_range *key_ports_range = NULL; struct flow_dissector_key_ports *key_ports = NULL; struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; @@ -975,20 +976,21 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } - if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); - else if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS_RANGE)) - key_ports = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS_RANGE, - target_container); - - if (key_ports) { key_ports->src = flow_keys->sport; key_ports->dst = flow_keys->dport; } + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE)) { + key_ports_range = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE, + target_container); + key_ports_range->tp.src = flow_keys->sport; + key_ports_range->tp.dst = flow_keys->dport; + } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL)) { -- GitLab From 15de6ba95dbe98af7eb71e644205a37c2f1a9aea Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 17 Feb 2025 20:32:10 -0800 Subject: [PATCH 785/989] selftests/bpf: Add a specific dst port matching After this patch: #102/1 flow_dissector_classification/ipv4:OK #102/2 flow_dissector_classification/ipv4_continue_dissect:OK #102/3 flow_dissector_classification/ipip:OK #102/4 flow_dissector_classification/gre:OK #102/5 flow_dissector_classification/port_range:OK #102/6 flow_dissector_classification/ipv6:OK #102 flow_dissector_classification:OK Summary: 1/6 PASSED, 0 SKIPPED, 0 FAILED Cc: Daniel Borkmann Cc: Andrii Nakryiko Signed-off-by: Cong Wang Link: https://patch.msgid.link/20250218043210.732959-5-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- .../bpf/prog_tests/flow_dissector_classification.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector_classification.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector_classification.c index 3729fbfd30846..80b153d3ddecf 100644 --- a/tools/testing/selftests/bpf/prog_tests/flow_dissector_classification.c +++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector_classification.c @@ -542,8 +542,12 @@ static void detach_program(struct bpf_flow *skel, int prog_fd) static int set_port_drop(int pf, bool multi_port) { + char dst_port[16]; + + snprintf(dst_port, sizeof(dst_port), "%d", 
CFG_PORT_INNER); + SYS(fail, "tc qdisc add dev lo ingress"); - SYS(fail_delete_qdisc, "tc filter add %s %s %s %s %s %s %s %s %s %s", + SYS(fail_delete_qdisc, "tc filter add %s %s %s %s %s %s %s %s %s %s %s %s", "dev lo", "parent FFFF:", "protocol", pf == PF_INET6 ? "ipv6" : "ip", @@ -551,6 +555,7 @@ static int set_port_drop(int pf, bool multi_port) "flower", "ip_proto udp", "src_port", multi_port ? "8-10" : "9", + "dst_port", dst_port, "action drop"); return 0; -- GitLab From 606572eb22c1786a3957d24307f5760bb058ca19 Mon Sep 17 00:00:00 2001 From: Yu-Chun Lin Date: Tue, 18 Feb 2025 16:12:16 +0800 Subject: [PATCH 786/989] sctp: Fix undefined behavior in left shift operation According to the C11 standard (ISO/IEC 9899:2011, 6.5.7): "If E1 has a signed type and E1 x 2^E2 is not representable in the result type, the behavior is undefined." Shifting 1 << 31 causes signed integer overflow, which leads to undefined behavior. Fix this by explicitly using '1U << 31' to ensure the shift operates on an unsigned type, avoiding undefined behavior. Signed-off-by: Yu-Chun Lin Link: https://patch.msgid.link/20250218081217.3468369-1-eleanor15x@gmail.com Signed-off-by: Jakub Kicinski --- net/sctp/stream.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/stream.c b/net/sctp/stream.c index c241cc552e8d5..bfcff6d6a4386 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -735,7 +735,7 @@ struct sctp_chunk *sctp_process_strreset_tsnreq( * value SHOULD be the smallest TSN not acknowledged by the * receiver of the request plus 2^31. */ - init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + (1 << 31); + init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + (1U << 31); sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, init_tsn, GFP_ATOMIC); -- GitLab From 4b5a28b38c4a0106c64416a1b2042405166b26ce Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 18 Feb 2025 05:49:30 -0800 Subject: [PATCH 787/989] net: Add non-RCU dev_getbyhwaddr() helper Add dedicated helper for finding devices by hardware address when holding rtnl_lock, similar to existing dev_getbyhwaddr_rcu(). This prevents PROVE_LOCKING warnings when rtnl_lock is held but RCU read lock is not. Extract common address comparison logic into dev_addr_cmp(). 
The context about this change could be found in the following discussion: Link: https://lore.kernel.org/all/20250206-scarlet-ermine-of-improvement-1fcac5@leitao/ Cc: kuniyu@amazon.com Cc: ushankar@purestorage.com Suggested-by: Eric Dumazet Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250218-arm_fix_selftest-v5-1-d3d6892db9e1@debian.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 37 ++++++++++++++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c0a86afb85daa..94b7d4eca0030 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3275,6 +3275,8 @@ static inline struct net_device *first_net_device_rcu(struct net *net) } int netdev_boot_setup_check(struct net_device *dev); +struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, + const char *hwaddr); struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, const char *hwaddr); struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type); diff --git a/net/core/dev.c b/net/core/dev.c index fafd2f4b5d5d7..72459dd02f384 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1121,6 +1121,12 @@ int netdev_get_name(struct net *net, char *name, int ifindex) return ret; } +static bool dev_addr_cmp(struct net_device *dev, unsigned short type, + const char *ha) +{ + return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len); +} + /** * dev_getbyhwaddr_rcu - find a device by its hardware address * @net: the applicable net namespace @@ -1129,7 +1135,7 @@ int netdev_get_name(struct net *net, char *name, int ifindex) * * Search for an interface by MAC address. Returns NULL if the device * is not found or a pointer to the device. - * The caller must hold RCU or RTNL. + * The caller must hold RCU. * The returned device has not had its ref count increased * and the caller must therefore be careful about locking * @@ -1141,14 +1147,39 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, struct net_device *dev; for_each_netdev_rcu(net, dev) - if (dev->type == type && - !memcmp(dev->dev_addr, ha, dev->addr_len)) + if (dev_addr_cmp(dev, type, ha)) return dev; return NULL; } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); +/** + * dev_getbyhwaddr() - find a device by its hardware address + * @net: the applicable net namespace + * @type: media type of device + * @ha: hardware address + * + * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold + * rtnl_lock. + * + * Context: rtnl_lock() must be held. + * Return: pointer to the net_device, or NULL if not found + */ +struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, + const char *ha) +{ + struct net_device *dev; + + ASSERT_RTNL(); + for_each_netdev(net, dev) + if (dev_addr_cmp(dev, type, ha)) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_getbyhwaddr); + struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev, *ret = NULL; -- GitLab From 4eae0ee0f1e6256d0b0b9dd6e72f1d9cf8f72e08 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 18 Feb 2025 05:49:31 -0800 Subject: [PATCH 788/989] arp: switch to dev_getbyhwaddr() in arp_req_set_public() The arp_req_set_public() function is called with the rtnl lock held, which provides enough synchronization protection. This makes the RCU variant of dev_getbyhwaddr() unnecessary. 
Switch to using the simpler dev_getbyhwaddr() function since we already have the required rtnl locking. This change helps maintain consistency in the networking code by using the appropriate helper function for the existing locking context. Since we're not holding the RCU read lock in arp_req_set_public() existing code could trigger false positive locking warnings. Fixes: 941666c2e3e0 ("net: RCU conversion of dev_getbyhwaddr() and arp_ioctl()") Suggested-by: Kuniyuki Iwashima Reviewed-by: Kuniyuki Iwashima Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20250218-arm_fix_selftest-v5-2-d3d6892db9e1@debian.org Signed-off-by: Jakub Kicinski --- net/ipv4/arp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index f23a1ec6694cb..814300eee39de 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1077,7 +1077,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; if (!dev && (r->arp_flags & ATF_COM)) { - dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family, + dev = dev_getbyhwaddr(net, r->arp_ha.sa_family, r->arp_ha.sa_data); if (!dev) return -ENODEV; -- GitLab From 3d8c6f26893d55fab218ad086719de1fc9bb86ba Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 10 Feb 2025 13:31:11 +0200 Subject: [PATCH 789/989] RDMA/mlx5: Fix implicit ODP hang on parent deregistration Fix the destroy_unused_implicit_child_mr() to prevent hanging during parent deregistration as of below [1]. Upon entering destroy_unused_implicit_child_mr(), the reference count for the implicit MR parent is incremented using: refcount_inc_not_zero(). A corresponding decrement must be performed if free_implicit_child_mr_work() is not called. The code has been updated to properly manage the reference count that was incremented. [1] INFO: task python3:2157 blocked for more than 120 seconds. Not tainted 6.12.0-rc7+ #1633 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:python3 state:D stack:0 pid:2157 tgid:2157 ppid:1685 flags:0x00000000 Call Trace: __schedule+0x420/0xd30 schedule+0x47/0x130 __mlx5_ib_dereg_mr+0x379/0x5d0 [mlx5_ib] ? __pfx_autoremove_wake_function+0x10/0x10 ib_dereg_mr_user+0x5f/0x120 [ib_core] ? lock_release+0xc6/0x280 destroy_hw_idr_uobject+0x1d/0x60 [ib_uverbs] uverbs_destroy_uobject+0x58/0x1d0 [ib_uverbs] uobj_destroy+0x3f/0x70 [ib_uverbs] ib_uverbs_cmd_verbs+0x3e4/0xbb0 [ib_uverbs] ? __pfx_uverbs_destroy_def_handler+0x10/0x10 [ib_uverbs] ? lock_acquire+0xc1/0x2f0 ? ib_uverbs_ioctl+0xcb/0x170 [ib_uverbs] ? ib_uverbs_ioctl+0x116/0x170 [ib_uverbs] ? lock_release+0xc6/0x280 ib_uverbs_ioctl+0xe7/0x170 [ib_uverbs] ? ib_uverbs_ioctl+0xcb/0x170 [ib_uverbs] __x64_sys_ioctl+0x1b0/0xa70 ? 
kmem_cache_free+0x221/0x400 do_syscall_64+0x6b/0x140 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f20f21f017b RSP: 002b:00007ffcfc4a77c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007ffcfc4a78d8 RCX: 00007f20f21f017b RDX: 00007ffcfc4a78c0 RSI: 00000000c0181b01 RDI: 0000000000000003 RBP: 00007ffcfc4a78a0 R08: 000056147d125190 R09: 00007f20f1f14c60 R10: 0000000000000001 R11: 0000000000000246 R12: 00007ffcfc4a7890 R13: 000000000000001c R14: 000056147d100fc0 R15: 00007f20e365c9d0 Fixes: d3d930411ce3 ("RDMA/mlx5: Fix implicit ODP use after free") Signed-off-by: Yishai Hadas Reviewed-by: Artemy Kovalyov Link: https://patch.msgid.link/80f2fcd19952dfa7d9981d93fd6359b4471f8278.1739186929.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/odp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index f1e23583e6c08..e77c9280c07e4 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -242,6 +242,7 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr) if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_KERNEL) != mr) { xa_unlock(&imr->implicit_children); + mlx5r_deref_odp_mkey(&imr->mmkey); return; } -- GitLab From c534ffda781f44a1c6ac25ef6e0e444da38ca8af Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Mon, 10 Feb 2025 13:32:39 +0200 Subject: [PATCH 790/989] RDMA/mlx5: Fix AH static rate parsing Previously static rate wasn't translated according to our PRM but simply used the 4 lower bytes. Correctly translate static rate value passed in AH creation attribute according to our PRM expected values. In addition change 800GB mapping to zero, which is the PRM specified value. 
Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters") Signed-off-by: Patrisious Haddad Reviewed-by: Maor Gottlieb Link: https://patch.msgid.link/18ef4cc5396caf80728341eb74738cd777596f60.1739187089.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/ah.c | 3 ++- drivers/infiniband/hw/mlx5/qp.c | 6 +++--- drivers/infiniband/hw/mlx5/qp.h | 1 + 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c index 505bc47fd575d..99036afb3aef0 100644 --- a/drivers/infiniband/hw/mlx5/ah.c +++ b/drivers/infiniband/hw/mlx5/ah.c @@ -67,7 +67,8 @@ static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, ah->av.tclass = grh->traffic_class; } - ah->av.stat_rate_sl = (rdma_ah_get_static_rate(ah_attr) << 4); + ah->av.stat_rate_sl = + (mlx5r_ib_rate(dev, rdma_ah_get_static_rate(ah_attr)) << 4); if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { if (init_attr->xmit_slave) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 08d22db8dca91..88724d15705d4 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3447,11 +3447,11 @@ static int ib_to_mlx5_rate_map(u8 rate) return 0; } -static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) +int mlx5r_ib_rate(struct mlx5_ib_dev *dev, u8 rate) { u32 stat_rate_support; - if (rate == IB_RATE_PORT_CURRENT) + if (rate == IB_RATE_PORT_CURRENT || rate == IB_RATE_800_GBPS) return 0; if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_800_GBPS) @@ -3596,7 +3596,7 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, sizeof(grh->dgid.raw)); } - err = ib_rate_to_mlx5(dev, rdma_ah_get_static_rate(ah)); + err = mlx5r_ib_rate(dev, rdma_ah_get_static_rate(ah)); if (err < 0) return err; MLX5_SET(ads, path, stat_rate, err); diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h index b6ee7c3ee1ca1..2530e7730635f 100644 --- a/drivers/infiniband/hw/mlx5/qp.h +++ b/drivers/infiniband/hw/mlx5/qp.h @@ -56,4 +56,5 @@ int mlx5_core_xrcd_dealloc(struct mlx5_ib_dev *dev, u32 xrcdn); int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter); int mlx5_ib_qp_event_init(void); void mlx5_ib_qp_event_cleanup(void); +int mlx5r_ib_rate(struct mlx5_ib_dev *dev, u8 rate); #endif /* _MLX5_IB_QP_H */ -- GitLab From a370295367b55662a32a4be92565fe72a5aa79bb Mon Sep 17 00:00:00 2001 From: Nick Hu Date: Mon, 17 Feb 2025 13:58:42 +0800 Subject: [PATCH 791/989] net: axienet: Set mac_managed_pm The external PHY will undergo a soft reset twice during the resume process when it wake up from suspend. The first reset occurs when the axienet driver calls phylink_of_phy_connect(), and the second occurs when mdio_bus_phy_resume() invokes phy_init_hw(). The second soft reset of the external PHY does not reinitialize the internal PHY, which causes issues with the internal PHY, resulting in the PHY link being down. To prevent this, setting the mac_managed_pm flag skips the mdio_bus_phy_resume() function. 
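For reference, this works because phylib's bus PM callbacks return early when the MAC claims PHY power management; a simplified sketch of the resume side (not the exact phylib code):

    static int mdio_bus_phy_resume(struct device *dev)
    {
        struct phy_device *phydev = to_phy_device(dev);

        /* The MAC driver restores the PHY itself (here via
         * phylink_of_phy_connect()), so skip the second reset. */
        if (phydev->mac_managed_pm)
            return 0;

        /* Otherwise phylib re-initialises the PHY, which is the
         * phy_init_hw() call mentioned above. */
        return phy_init_hw(phydev);
    }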
Fixes: a129b41fe0a8 ("Revert "net: phy: dp83867: perform soft reset and retain established link"") Signed-off-by: Nick Hu Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250217055843.19799-1-nick.hu@sifive.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 9e7fa012e4fad..f33178f90c42e 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -2897,6 +2897,7 @@ static int axienet_probe(struct platform_device *pdev) lp->phylink_config.dev = &ndev->dev; lp->phylink_config.type = PHYLINK_NETDEV; + lp->phylink_config.mac_managed_pm = true; lp->phylink_config.mac_capabilities = MAC_SYM_PAUSE | MAC_ASYM_PAUSE | MAC_10FD | MAC_100FD | MAC_1000FD; -- GitLab From 9b6412e6979f6f9e0632075f8f008937b5cd4efd Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 17 Feb 2025 11:23:35 +0100 Subject: [PATCH 792/989] tcp: drop secpath at the same time as we currently drop dst Xiumei reported hitting the WARN in xfrm6_tunnel_net_exit while running tests that boil down to: - create a pair of netns - run a basic TCP test over ipcomp6 - delete the pair of netns The xfrm_state found on spi_byaddr was not deleted at the time we delete the netns, because we still have a reference on it. This lingering reference comes from a secpath (which holds a ref on the xfrm_state), which is still attached to an skb. This skb is not leaked, it ends up on sk_receive_queue and then gets defer-free'd by skb_attempt_defer_free. The problem happens when we defer freeing an skb (push it on one CPU's defer_list), and don't flush that list before the netns is deleted. In that case, we still have a reference on the xfrm_state that we don't expect at this point. We already drop the skb's dst in the TCP receive path when it's no longer needed, so let's also drop the secpath. At this point, tcp_filter has already called into the LSM hooks that may require the secpath, so it should not be needed anymore. However, in some of those places, the MPTCP extension has just been attached to the skb, so we cannot simply drop all extensions. 
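For reference, this is why the fix resets only the security path instead of dropping every extension: secpath_reset() removes just the SEC_PATH extension, which in turn puts the xfrm_state references the skb was pinning, while e.g. a freshly attached MPTCP extension survives. A simplified sketch of its effect (the real helper lives in include/net/xfrm.h):

    static inline void secpath_reset(struct sk_buff *skb)
    {
    #ifdef CONFIG_XFRM
        /* drops the sec_path and, with it, its xfrm_state references */
        skb_ext_del(skb, SKB_EXT_SEC_PATH);
    #endif
    }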
Fixes: 68822bdf76f1 ("net: generalize skb freeing deferral to per-cpu lists") Reported-by: Xiumei Mu Signed-off-by: Sabrina Dubroca Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/5055ba8f8f72bdcb602faa299faca73c280b7735.1739743613.git.sd@queasysnail.net Signed-off-by: Paolo Abeni --- include/net/tcp.h | 14 ++++++++++++++ net/ipv4/tcp_fastopen.c | 4 ++-- net/ipv4/tcp_input.c | 8 ++++---- net/ipv4/tcp_ipv4.c | 2 +- 4 files changed, 21 insertions(+), 7 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 5b2b04835688f..930cda5b5eb98 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -683,6 +684,19 @@ void tcp_fin(struct sock *sk); void tcp_check_space(struct sock *sk); void tcp_sack_compress_send_ack(struct sock *sk); +static inline void tcp_cleanup_skb(struct sk_buff *skb) +{ + skb_dst_drop(skb); + secpath_reset(skb); +} + +static inline void tcp_add_receive_queue(struct sock *sk, struct sk_buff *skb) +{ + DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); + DEBUG_NET_WARN_ON_ONCE(secpath_exists(skb)); + __skb_queue_tail(&sk->sk_receive_queue, skb); +} + /* tcp_timer.c */ void tcp_init_xmit_timers(struct sock *); static inline void tcp_clear_xmit_timers(struct sock *sk) diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 0f523cbfe329e..32b28fc21b63c 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -178,7 +178,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) if (!skb) return; - skb_dst_drop(skb); + tcp_cleanup_skb(skb); /* segs_in has been initialized to 1 in tcp_create_openreq_child(). * Hence, reset segs_in to 0 before calling tcp_segs_in() * to avoid double counting. Also, tcp_segs_in() expects @@ -195,7 +195,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN; tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - __skb_queue_tail(&sk->sk_receive_queue, skb); + tcp_add_receive_queue(sk, skb); tp->syn_data_acked = 1; /* u64_stats_update_begin(&tp->syncp) not needed here, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 98b8cc7403920..0cbf81bf3d451 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4976,7 +4976,7 @@ static void tcp_ofo_queue(struct sock *sk) tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) - __skb_queue_tail(&sk->sk_receive_queue, skb); + tcp_add_receive_queue(sk, skb); else kfree_skb_partial(skb, fragstolen); @@ -5168,7 +5168,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, skb, fragstolen)) ? 
1 : 0; tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { - __skb_queue_tail(&sk->sk_receive_queue, skb); + tcp_add_receive_queue(sk, skb); skb_set_owner_r(skb, sk); } return eaten; @@ -5251,7 +5251,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) __kfree_skb(skb); return; } - skb_dst_drop(skb); + tcp_cleanup_skb(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); reason = SKB_DROP_REASON_NOT_SPECIFIED; @@ -6232,7 +6232,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ - skb_dst_drop(skb); + tcp_cleanup_skb(skb); __skb_pull(skb, tcp_header_len); eaten = tcp_queue_rcv(sk, skb, &fragstolen); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index cc2b5194a18d2..2632844d2c356 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2027,7 +2027,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, */ skb_condense(skb); - skb_dst_drop(skb); + tcp_cleanup_skb(skb); if (unlikely(tcp_checksum_complete(skb))) { bh_unlock_sock(sk); -- GitLab From 878e7b11736e062514e58f3b445ff343e6705537 Mon Sep 17 00:00:00 2001 From: Haoxiang Li Date: Tue, 18 Feb 2025 11:04:09 +0800 Subject: [PATCH 793/989] nfp: bpf: Add check for nfp_app_ctrl_msg_alloc() Add check for the return value of nfp_app_ctrl_msg_alloc() in nfp_bpf_cmsg_alloc() to prevent null pointer dereference. Fixes: ff3d43f7568c ("nfp: bpf: implement helpers for FW map ops") Cc: stable@vger.kernel.org Signed-off-by: Haoxiang Li Link: https://patch.msgid.link/20250218030409.2425798-1-haoxiang_li2024@163.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/netronome/nfp/bpf/cmsg.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c index 2ec62c8d86e1c..59486fe2ad18c 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c @@ -20,6 +20,8 @@ nfp_bpf_cmsg_alloc(struct nfp_app_bpf *bpf, unsigned int size) struct sk_buff *skb; skb = nfp_app_ctrl_msg_alloc(bpf->app, size, GFP_KERNEL); + if (!skb) + return NULL; skb_put(skb, size); return skb; -- GitLab From 14ad6ed30a10afbe91b0749d6378285f4225d482 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 18 Feb 2025 19:29:39 +0100 Subject: [PATCH 794/989] net: allow small head cache usage with large MAX_SKB_FRAGS values Sabrina reported the following splat: WARNING: CPU: 0 PID: 1 at net/core/dev.c:6935 netif_napi_add_weight_locked+0x8f2/0xba0 Modules linked in: CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.14.0-rc1-net-00092-g011b03359038 #996 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014 RIP: 0010:netif_napi_add_weight_locked+0x8f2/0xba0 Code: e8 c3 e6 6a fe 48 83 c4 28 5b 5d 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc c7 44 24 10 ff ff ff ff e9 8f fb ff ff e8 9e e6 6a fe <0f> 0b e9 d3 fe ff ff e8 92 e6 6a fe 48 8b 04 24 be ff ff ff ff 48 RSP: 0000:ffffc9000001fc60 EFLAGS: 00010293 RAX: 0000000000000000 RBX: ffff88806ce48128 RCX: 1ffff11001664b9e RDX: ffff888008f00040 RSI: ffffffff8317ca42 RDI: ffff88800b325cb6 RBP: ffff88800b325c40 R08: 0000000000000001 R09: ffffed100167502c R10: ffff88800b3a8163 R11: 0000000000000000 R12: ffff88800ac1c168 R13: ffff88800ac1c168 R14: ffff88800ac1c168 R15: 0000000000000007 FS: 0000000000000000(0000) GS:ffff88806ce00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff888008201000 CR3: 
0000000004c94001 CR4: 0000000000370ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: gro_cells_init+0x1ba/0x270 xfrm_input_init+0x4b/0x2a0 xfrm_init+0x38/0x50 ip_rt_init+0x2d7/0x350 ip_init+0xf/0x20 inet_init+0x406/0x590 do_one_initcall+0x9d/0x2e0 do_initcalls+0x23b/0x280 kernel_init_freeable+0x445/0x490 kernel_init+0x20/0x1d0 ret_from_fork+0x46/0x80 ret_from_fork_asm+0x1a/0x30 irq event stamp: 584330 hardirqs last enabled at (584338): [] __up_console_sem+0x77/0xb0 hardirqs last disabled at (584345): [] __up_console_sem+0x5c/0xb0 softirqs last enabled at (583242): [] netlink_insert+0x14d/0x470 softirqs last disabled at (583754): [] netif_napi_add_weight_locked+0x77d/0xba0 on kernel built with MAX_SKB_FRAGS=45, where SKB_WITH_OVERHEAD(1024) is smaller than GRO_MAX_HEAD. Such built additionally contains the revert of the single page frag cache so that napi_get_frags() ends up using the page frag allocator, triggering the splat. Note that the underlying issue is independent from the mentioned revert; address it ensuring that the small head cache will fit either TCP and GRO allocation and updating napi_alloc_skb() and __netdev_alloc_skb() to select kmalloc() usage for any allocation fitting such cache. Reported-by: Sabrina Dubroca Suggested-by: Eric Dumazet Fixes: 3948b05950fd ("net: introduce a config option to tweak MAX_SKB_FRAGS") Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/gro.h | 3 +++ net/core/gro.c | 3 --- net/core/skbuff.c | 10 +++++++--- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/net/gro.h b/include/net/gro.h index b9b58c1f8d190..7b548f91754bf 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -11,6 +11,9 @@ #include #include +/* This should be increased if a protocol with a bigger head is added. */ +#define GRO_MAX_HEAD (MAX_HEADER + 128) + struct napi_gro_cb { union { struct { diff --git a/net/core/gro.c b/net/core/gro.c index d1f44084e978f..78b320b631744 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -7,9 +7,6 @@ #define MAX_GRO_SKBS 8 -/* This should be increased if a protocol with a bigger head is added. */ -#define GRO_MAX_HEAD (MAX_HEADER + 128) - static DEFINE_SPINLOCK(offload_lock); /** diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a441613a1e6c1..f5a6d50570c4f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include #include @@ -95,7 +96,9 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init; #endif -#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER) +#define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN) +#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \ + GRO_MAX_HEAD_PAD)) /* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two. * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique @@ -736,7 +739,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. 
*/ - if (len <= SKB_WITH_OVERHEAD(1024) || + if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); @@ -816,7 +819,8 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) * When the small frag allocator is available, prefer it over kmalloc * for small fragments */ - if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || + if ((!NAPI_HAS_SMALL_PAGE_FRAG && + len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, -- GitLab From 6bc7e4eb0499562ccd291712fd7be0d1a5aad00a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 18 Feb 2025 19:29:40 +0100 Subject: [PATCH 795/989] Revert "net: skb: introduce and use a single page frag cache" After the previous commit is finally safe to revert commit dbae2b062824 ("net: skb: introduce and use a single page frag cache"): do it here. The intended goal of such change was to counter a performance regression introduced by commit 3226b158e67c ("net: avoid 32 x truesize under-estimation for tiny skbs"). Unfortunately, the blamed commit introduces another regression for the virtio_net driver. Such a driver calls napi_alloc_skb() with a tiny size, so that the whole head frag could fit a 512-byte block. The single page frag cache uses a 1K fragment for such allocation, and the additional overhead, under small UDP packets flood, makes the page allocator a bottleneck. Thanks to commit bf9f1baa279f ("net: add dedicated kmem_cache for typical/small skb->head"), this revert does not re-introduce the original regression. Actually, in the relevant test on top of this revert, I measure a small but noticeable positive delta, just above noise level. The revert itself required some additional mangling due to recent updates in the affected code. Suggested-by: Eric Dumazet Fixes: dbae2b062824 ("net: skb: introduce and use a single page frag cache") Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 1 - net/core/dev.c | 17 +++++++ net/core/skbuff.c | 104 ++------------------------------------ 3 files changed, 22 insertions(+), 100 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 94b7d4eca0030..ab550a89b9bfa 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4117,7 +4117,6 @@ void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); -void napi_get_frags_check(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); static inline void napi_free_frags(struct napi_struct *napi) diff --git a/net/core/dev.c b/net/core/dev.c index 72459dd02f384..1b252e9459fdb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6991,6 +6991,23 @@ netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) list_add_rcu(&napi->dev_list, higher); /* adds after higher */ } +/* Double check that napi_get_frags() allocates skbs with + * skb->head being backed by slab, not a page fragment. + * This is to make sure bug fixed in 3226b158e67c + * ("net: avoid 32 x truesize under-estimation for tiny skbs") + * does not accidentally come back. 
+ */ +static void napi_get_frags_check(struct napi_struct *napi) +{ + struct sk_buff *skb; + + local_bh_disable(); + skb = napi_get_frags(napi); + WARN_ON_ONCE(skb && skb->head_frag); + napi_free_frags(napi); + local_bh_enable(); +} + void netif_napi_add_weight_locked(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f5a6d50570c4f..7b03b64fdcb27 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -223,67 +223,9 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) #define NAPI_SKB_CACHE_BULK 16 #define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) -#if PAGE_SIZE == SZ_4K - -#define NAPI_HAS_SMALL_PAGE_FRAG 1 -#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc) - -/* specialized page frag allocator using a single order 0 page - * and slicing it into 1K sized fragment. Constrained to systems - * with a very limited amount of 1K fragments fitting a single - * page - to avoid excessive truesize underestimation - */ - -struct page_frag_1k { - void *va; - u16 offset; - bool pfmemalloc; -}; - -static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp) -{ - struct page *page; - int offset; - - offset = nc->offset - SZ_1K; - if (likely(offset >= 0)) - goto use_frag; - - page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); - if (!page) - return NULL; - - nc->va = page_address(page); - nc->pfmemalloc = page_is_pfmemalloc(page); - offset = PAGE_SIZE - SZ_1K; - page_ref_add(page, offset / SZ_1K); - -use_frag: - nc->offset = offset; - return nc->va + offset; -} -#else - -/* the small page is actually unused in this build; add dummy helpers - * to please the compiler and avoid later preprocessor's conditionals - */ -#define NAPI_HAS_SMALL_PAGE_FRAG 0 -#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false - -struct page_frag_1k { -}; - -static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) -{ - return NULL; -} - -#endif - struct napi_alloc_cache { local_lock_t bh_lock; struct page_frag_cache page; - struct page_frag_1k page_small; unsigned int skb_count; void *skb_cache[NAPI_SKB_CACHE_SIZE]; }; @@ -293,23 +235,6 @@ static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; -/* Double check that napi_get_frags() allocates skbs with - * skb->head being backed by slab, not a page fragment. - * This is to make sure bug fixed in 3226b158e67c - * ("net: avoid 32 x truesize under-estimation for tiny skbs") - * does not accidentally come back. - */ -void napi_get_frags_check(struct napi_struct *napi) -{ - struct sk_buff *skb; - - local_bh_disable(); - skb = napi_get_frags(napi); - WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); - napi_free_frags(napi); - local_bh_enable(); -} - void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); @@ -816,11 +741,8 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. 
- * When the small frag allocator is available, prefer it over kmalloc - * for small fragments */ - if ((!NAPI_HAS_SMALL_PAGE_FRAG && - len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)) || + if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, @@ -830,32 +752,16 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) goto skb_success; } + len = SKB_HEAD_ALIGN(len); + if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc = this_cpu_ptr(&napi_alloc_cache); - if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { - /* we are artificially inflating the allocation size, but - * that is not as bad as it may look like, as: - * - 'len' less than GRO_MAX_HEAD makes little sense - * - On most systems, larger 'len' values lead to fragment - * size above 512 bytes - * - kmalloc would use the kmalloc-1k slab for such values - * - Builds with smaller GRO_MAX_HEAD will very likely do - * little networking, as that implies no WiFi and no - * tunnels support, and 32 bits arches. - */ - len = SZ_1K; - data = page_frag_alloc_1k(&nc->page_small, gfp_mask); - pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small); - } else { - len = SKB_HEAD_ALIGN(len); - - data = page_frag_alloc(&nc->page, len, gfp_mask); - pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); - } + data = page_frag_alloc(&nc->page, len, gfp_mask); + pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); local_unlock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!data)) -- GitLab From 96fa9ec477ff60bed87e1441fd43e003179f3253 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 19 Feb 2025 15:43:56 +0100 Subject: [PATCH 796/989] gpiolib: don't bail out if get_direction() fails in gpiochip_add_data() Since commit 9d846b1aebbe ("gpiolib: check the return value of gpio_chip::get_direction()") we check the return value of the get_direction() callback as per its API contract. Some drivers have been observed to fail to register now as they may call get_direction() in gpiochip_add_data() in contexts where it has always silently failed. Until we audit all drivers, replace the bail-out to a kernel log warning. Fixes: 9d846b1aebbe ("gpiolib: check the return value of gpio_chip::get_direction()") Reported-by: Mark Brown Closes: https://lore.kernel.org/all/Z7VFB1nST6lbmBIo@finisterre.sirena.org.uk/ Reported-by: Marek Szyprowski Closes: https://lore.kernel.org/all/dfe03f88-407e-4ef1-ad30-42db53bbd4e4@samsung.com/ Tested-by: Mark Brown Reviewed-by: Mark Brown Tested-by: Marek Szyprowski Link: https://lore.kernel.org/r/20250219144356.258635-1-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 5529d8b65f6fb..fc19df5a64c2b 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1059,7 +1059,15 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data, if (gc->get_direction && gpiochip_line_is_valid(gc, desc_index)) { ret = gc->get_direction(gc, desc_index); if (ret < 0) - goto err_cleanup_desc_srcu; + /* + * FIXME: Bail-out here once all GPIO drivers + * are updated to not return errors in + * situations that can be considered normal + * operation. 
+ */ + dev_warn(&gdev->dev, + "%s: get_direction failed: %d\n", + __func__, ret); assign_bit(FLAG_IS_OUT, &desc->flags, !ret); } else { -- GitLab From b4c173dfbb6c78568578ff18f9e8822d7bd0e31b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 20 Feb 2025 11:02:58 +0100 Subject: [PATCH 797/989] fuse: don't truncate cached, mutated symlink Fuse allows the value of a symlink to change and this property is exploited by some filesystems (e.g. CVMFS). It has been observed, that sometimes after changing the symlink contents, the value is truncated to the old size. This is caused by fuse_getattr() racing with fuse_reverse_inval_inode(). fuse_reverse_inval_inode() updates the fuse_inode's attr_version, which results in fuse_change_attributes() exiting before updating the cached attributes This is okay, as the cached attributes remain invalid and the next call to fuse_change_attributes() will likely update the inode with the correct values. The reason this causes problems is that cached symlinks will be returned through page_get_link(), which truncates the symlink to inode->i_size. This is correct for filesystems that don't mutate symlinks, but in this case it causes bad behavior. The solution is to just remove this truncation. This can cause a regression in a filesystem that relies on supplying a symlink larger than the file size, but this is unlikely. If that happens we'd need to make this behavior conditional. Reported-by: Laura Promberger Tested-by: Sam Lewis Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20250220100258.793363-1-mszeredi@redhat.com Reviewed-by: Bernd Schubert Signed-off-by: Christian Brauner --- fs/fuse/dir.c | 2 +- fs/namei.c | 24 +++++++++++++++++++----- include/linux/fs.h | 2 ++ 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 198862b086ff7..3805f9b06c9d2 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1636,7 +1636,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, goto out_err; if (fc->cache_symlinks) - return page_get_link(dentry, inode, callback); + return page_get_link_raw(dentry, inode, callback); err = -ECHILD; if (!dentry) diff --git a/fs/namei.c b/fs/namei.c index 3ab9440c5b931..ecb7b95c2ca33 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -5356,10 +5356,9 @@ const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done) EXPORT_SYMBOL(vfs_get_link); /* get the link contents into pagecache */ -const char *page_get_link(struct dentry *dentry, struct inode *inode, - struct delayed_call *callback) +static char *__page_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) { - char *kaddr; struct page *page; struct address_space *mapping = inode->i_mapping; @@ -5378,8 +5377,23 @@ const char *page_get_link(struct dentry *dentry, struct inode *inode, } set_delayed_call(callback, page_put_link, page); BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM); - kaddr = page_address(page); - nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); + return page_address(page); +} + +const char *page_get_link_raw(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) +{ + return __page_get_link(dentry, inode, callback); +} +EXPORT_SYMBOL_GPL(page_get_link_raw); + +const char *page_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) +{ + char *kaddr = __page_get_link(dentry, inode, callback); + + if (!IS_ERR(kaddr)) + nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); return kaddr; } 
diff --git a/include/linux/fs.h b/include/linux/fs.h index 2c3b2f8a621f7..9346adf28f7bc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3452,6 +3452,8 @@ extern const struct file_operations generic_ro_fops; extern int readlink_copy(char __user *, int, const char *, int); extern int page_readlink(struct dentry *, char __user *, int); +extern const char *page_get_link_raw(struct dentry *, struct inode *, + struct delayed_call *); extern const char *page_get_link(struct dentry *, struct inode *, struct delayed_call *); extern void page_put_link(void *); -- GitLab From 782cffeec9ad96daa64ffb2d527b2a052fb02552 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 19 Feb 2025 06:10:05 -0800 Subject: [PATCH 798/989] perf/x86/intel: Fix event constraints for LNC According to the latest event list, update the event constraint tables for Lion Cove core. The general rule (the event codes < 0x90 are restricted to counters 0-3.) has been removed. There is no restriction for most of the performance monitoring events. Fixes: a932aa0e868f ("perf/x86: Add Lunar Lake and Arrow Lake support") Reported-by: Amiri Khalil Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250219141005.2446823-1-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 20 +++++++------------- arch/x86/events/intel/ds.c | 2 +- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index e86333eee2668..cdcebf30468a0 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -397,34 +397,28 @@ static struct event_constraint intel_lnc_event_constraints[] = { METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FETCH_LAT, 6), METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_MEM_BOUND, 7), + INTEL_EVENT_CONSTRAINT(0x20, 0xf), + + INTEL_UEVENT_CONSTRAINT(0x012a, 0xf), + INTEL_UEVENT_CONSTRAINT(0x012b, 0xf), INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), INTEL_UEVENT_CONSTRAINT(0x0175, 0x4), INTEL_EVENT_CONSTRAINT(0x2e, 0x3ff), INTEL_EVENT_CONSTRAINT(0x3c, 0x3ff), - /* - * Generally event codes < 0x90 are restricted to counters 0-3. - * The 0x2E and 0x3C are exception, which has no restriction. - */ - INTEL_EVENT_CONSTRAINT_RANGE(0x01, 0x8f, 0xf), - INTEL_UEVENT_CONSTRAINT(0x01a3, 0xf), - INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), INTEL_UEVENT_CONSTRAINT(0x04a4, 0x1), INTEL_UEVENT_CONSTRAINT(0x08a4, 0x1), INTEL_UEVENT_CONSTRAINT(0x10a4, 0x1), INTEL_UEVENT_CONSTRAINT(0x01b1, 0x8), + INTEL_UEVENT_CONSTRAINT(0x01cd, 0x3fc), INTEL_UEVENT_CONSTRAINT(0x02cd, 0x3), - INTEL_EVENT_CONSTRAINT(0xce, 0x1), INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xdf, 0xf), - /* - * Generally event codes >= 0x90 are likely to have no restrictions. - * The exception are defined as above. 
- */ - INTEL_EVENT_CONSTRAINT_RANGE(0x90, 0xfe, 0x3ff), + + INTEL_UEVENT_CONSTRAINT(0x00e0, 0xf), EVENT_CONSTRAINT_END }; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index c2e2eae7309c3..f122882ef278f 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1199,7 +1199,7 @@ struct event_constraint intel_lnc_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), - INTEL_HYBRID_LDLAT_CONSTRAINT(0x1cd, 0x3ff), + INTEL_HYBRID_LDLAT_CONSTRAINT(0x1cd, 0x3fc), INTEL_HYBRID_STLAT_CONSTRAINT(0x2cd, 0x3), INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ -- GitLab From fa808ed4e199ed17d878eb75b110bda30dd52434 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 19 Feb 2025 14:07:37 -0800 Subject: [PATCH 799/989] KVM: arm64: Ensure a VMID is allocated before programming VTTBR_EL2 Vladimir reports that a race condition to attach a VMID to a stage-2 MMU sometimes results in a vCPU entering the guest with a VMID of 0: | CPU1 | CPU2 | | | | kvm_arch_vcpu_ioctl_run | | vcpu_load <= load VTTBR_EL2 | | kvm_vmid->id = 0 | | | kvm_arch_vcpu_ioctl_run | | vcpu_load <= load VTTBR_EL2 | | with kvm_vmid->id = 0| | kvm_arm_vmid_update <= allocates fresh | | kvm_vmid->id and | | reload VTTBR_EL2 | | | | | kvm_arm_vmid_update <= observes that kvm_vmid->id | | already allocated, | | skips reload VTTBR_EL2 Oh yeah, it's as bad as it looks. Remember that VHE loads the stage-2 MMU eagerly but a VMID only gets attached to the MMU later on in the KVM_RUN loop. Even in the "best case" where VTTBR_EL2 correctly gets reprogrammed before entering the EL1&0 regime, there is a period of time where hardware is configured with VMID 0. That's completely insane. So, rather than decorating the 'late' binding with another hack, just allocate the damn thing up front. Attaching a VMID from vcpu_load() is still rollover safe since (surprise!) it'll always get called after a vCPU was preempted. Excuse me while I go find a brown paper bag. 
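In code terms, the fix boils down to the following ordering in the load path (condensed sketch; the full change is in the diff below):

    void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
    {
        struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;

        /* Allocate/refresh the VMID first ... */
        kvm_arm_vmid_update(&mmu->vmid);

        /* ... so the eager VTTBR_EL2 load done here on VHE never runs
         * the guest with VMID 0. Rollover safety is kept because
         * vcpu_load() always runs again after the vCPU was preempted. */

        /* rest of the load path unchanged */
    }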
Cc: stable@vger.kernel.org Fixes: 934bf871f011 ("KVM: arm64: Load the stage-2 MMU context in kvm_vcpu_load_vhe()") Reported-by: Vladimir Murzin Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20250219220737.130842-1-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 2 +- arch/arm64/kvm/arm.c | 22 ++++++++++------------ arch/arm64/kvm/vmid.c | 11 +++-------- 3 files changed, 14 insertions(+), 21 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 3a7ec98ef1238..d919557af5e50 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1259,7 +1259,7 @@ int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu, extern unsigned int __ro_after_init kvm_arm_vmid_bits; int __init kvm_arm_vmid_alloc_init(void); void __init kvm_arm_vmid_alloc_free(void); -bool kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid); +void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid); void kvm_arm_vmid_clear_active(void); static inline void kvm_arm_pvtime_vcpu_init(struct kvm_vcpu_arch *vcpu_arch) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index bc7a37cea2420..0160b49243511 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -559,6 +559,16 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) mmu = vcpu->arch.hw_mmu; last_ran = this_cpu_ptr(mmu->last_vcpu_ran); + /* + * Ensure a VMID is allocated for the MMU before programming VTTBR_EL2, + * which happens eagerly in VHE. + * + * Also, the VMID allocator only preserves VMIDs that are active at the + * time of rollover, so KVM might need to grab a new VMID for the MMU if + * this is called from kvm_sched_in(). + */ + kvm_arm_vmid_update(&mmu->vmid); + /* * We guarantee that both TLBs and I-cache are private to each * vcpu. If detecting that a vcpu from the same VM has @@ -1138,18 +1148,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) */ preempt_disable(); - /* - * The VMID allocator only tracks active VMIDs per - * physical CPU, and therefore the VMID allocated may not be - * preserved on VMID roll-over if the task was preempted, - * making a thread's VMID inactive. So we need to call - * kvm_arm_vmid_update() in non-premptible context. - */ - if (kvm_arm_vmid_update(&vcpu->arch.hw_mmu->vmid) && - has_vhe()) - __load_stage2(vcpu->arch.hw_mmu, - vcpu->arch.hw_mmu->arch); - kvm_pmu_flush_hwstate(vcpu); local_irq_disable(); diff --git a/arch/arm64/kvm/vmid.c b/arch/arm64/kvm/vmid.c index 806223b7022af..7fe8ba1a2851c 100644 --- a/arch/arm64/kvm/vmid.c +++ b/arch/arm64/kvm/vmid.c @@ -135,11 +135,10 @@ void kvm_arm_vmid_clear_active(void) atomic64_set(this_cpu_ptr(&active_vmids), VMID_ACTIVE_INVALID); } -bool kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid) +void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid) { unsigned long flags; u64 vmid, old_active_vmid; - bool updated = false; vmid = atomic64_read(&kvm_vmid->id); @@ -157,21 +156,17 @@ bool kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid) if (old_active_vmid != 0 && vmid_gen_match(vmid) && 0 != atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_vmids), old_active_vmid, vmid)) - return false; + return; raw_spin_lock_irqsave(&cpu_vmid_lock, flags); /* Check that our VMID belongs to the current generation. 
*/ vmid = atomic64_read(&kvm_vmid->id); - if (!vmid_gen_match(vmid)) { + if (!vmid_gen_match(vmid)) vmid = new_vmid(kvm_vmid); - updated = true; - } atomic64_set(this_cpu_ptr(&active_vmids), vmid); raw_spin_unlock_irqrestore(&cpu_vmid_lock, flags); - - return updated; } /* -- GitLab From f13409bb3f9140dad7256febcb478f0c9600312c Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Fri, 14 Feb 2025 09:02:04 +0100 Subject: [PATCH 800/989] nvme-fc: rely on state transitions to handle connectivity loss It's not possible to call nvme_state_ctrl_state with holding a spin lock, because nvme_state_ctrl_state calls cancel_delayed_work_sync when fastfail is enabled. Instead syncing the ASSOC_FLAG and state transitions using a lock, it's possible to only rely on the state machine transitions. That means nvme_fc_ctrl_connectivity_loss should unconditionally call nvme_reset_ctrl which avoids the read race on the ctrl state variable. Actually, it's not necessary to test in which state the ctrl is, the reset work will only scheduled when the state machine is in LIVE state. In nvme_fc_create_association, the LIVE state can only be entered if it was previously CONNECTING. If this is not possible then the reset handler got triggered. Thus just error out here. Fixes: ee59e3820ca9 ("nvme-fc: do not ignore connectivity loss during connecting") Closes: https://lore.kernel.org/all/denqwui6sl5erqmz2gvrwueyxakl5txzbbiu3fgebryzrfxunm@iwxuthct377m/ Reported-by: Shinichiro Kawasaki Tested-by: Shin'ichiro Kawasaki Reviewed-by: Sagi Grimberg Signed-off-by: Daniel Wagner Signed-off-by: Keith Busch --- drivers/nvme/host/fc.c | 67 ++++-------------------------------------- 1 file changed, 6 insertions(+), 61 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index f4f1866fbd5b8..b9929a5a7f4e3 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -781,61 +781,12 @@ nvme_fc_abort_lsops(struct nvme_fc_rport *rport) static void nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl) { - enum nvme_ctrl_state state; - unsigned long flags; - dev_info(ctrl->ctrl.device, "NVME-FC{%d}: controller connectivity lost. Awaiting " "Reconnect", ctrl->cnum); - spin_lock_irqsave(&ctrl->lock, flags); set_bit(ASSOC_FAILED, &ctrl->flags); - state = nvme_ctrl_state(&ctrl->ctrl); - spin_unlock_irqrestore(&ctrl->lock, flags); - - switch (state) { - case NVME_CTRL_NEW: - case NVME_CTRL_LIVE: - /* - * Schedule a controller reset. The reset will terminate the - * association and schedule the reconnect timer. Reconnects - * will be attempted until either the ctlr_loss_tmo - * (max_retries * connect_delay) expires or the remoteport's - * dev_loss_tmo expires. - */ - if (nvme_reset_ctrl(&ctrl->ctrl)) { - dev_warn(ctrl->ctrl.device, - "NVME-FC{%d}: Couldn't schedule reset.\n", - ctrl->cnum); - nvme_delete_ctrl(&ctrl->ctrl); - } - break; - - case NVME_CTRL_CONNECTING: - /* - * The association has already been terminated and the - * controller is attempting reconnects. No need to do anything - * futher. Reconnects will be attempted until either the - * ctlr_loss_tmo (max_retries * connect_delay) expires or the - * remoteport's dev_loss_tmo expires. - */ - break; - - case NVME_CTRL_RESETTING: - /* - * Controller is already in the process of terminating the - * association. No need to do anything further. The reconnect - * step will kick in naturally after the association is - * terminated. 
- */ - break; - - case NVME_CTRL_DELETING: - case NVME_CTRL_DELETING_NOIO: - default: - /* no action to take - let it delete */ - break; - } + nvme_reset_ctrl(&ctrl->ctrl); } /** @@ -3071,7 +3022,6 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) struct nvmefc_ls_rcv_op *disls = NULL; unsigned long flags; int ret; - bool changed; ++ctrl->ctrl.nr_reconnects; @@ -3177,23 +3127,18 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) else ret = nvme_fc_recreate_io_queues(ctrl); } + if (!ret && test_bit(ASSOC_FAILED, &ctrl->flags)) + ret = -EIO; if (ret) goto out_term_aen_ops; - spin_lock_irqsave(&ctrl->lock, flags); - if (!test_bit(ASSOC_FAILED, &ctrl->flags)) - changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); - else + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE)) { ret = -EIO; - spin_unlock_irqrestore(&ctrl->lock, flags); - - if (ret) goto out_term_aen_ops; + } ctrl->ctrl.nr_reconnects = 0; - - if (changed) - nvme_start_ctrl(&ctrl->ctrl); + nvme_start_ctrl(&ctrl->ctrl); return 0; /* Success */ -- GitLab From d2fe192348f93fe3a0cb1e33e4aba58e646397f4 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Fri, 14 Feb 2025 09:02:03 +0100 Subject: [PATCH 801/989] nvme: only allow entering LIVE from CONNECTING state The fabric transports and also the PCI transport are not entering the LIVE state from NEW or RESETTING. This makes the state machine more restrictive and allows to catch not supported state transitions, e.g. directly switching from RESETTING to LIVE. Reviewed-by: Sagi Grimberg Signed-off-by: Daniel Wagner Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 818d4e49aab51..f028913e2e622 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -564,8 +564,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, switch (new_state) { case NVME_CTRL_LIVE: switch (old_state) { - case NVME_CTRL_NEW: - case NVME_CTRL_RESETTING: case NVME_CTRL_CONNECTING: changed = true; fallthrough; -- GitLab From 860ca5e50f73c2a1cef7eefc9d39d04e275417f7 Mon Sep 17 00:00:00 2001 From: Haoxiang Li Date: Mon, 17 Feb 2025 15:20:38 +0800 Subject: [PATCH 802/989] smb: client: Add check for next_buffer in receive_encrypted_standard() Add check for the return value of cifs_buf_get() and cifs_small_buf_get() in receive_encrypted_standard() to prevent null pointer dereference. Fixes: eec04ea11969 ("smb: client: fix OOB in receive_encrypted_standard()") Cc: stable@vger.kernel.org Signed-off-by: Haoxiang Li Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 23e0c8be7fb52..4dd11eafb69d9 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -4965,6 +4965,10 @@ receive_encrypted_standard(struct TCP_Server_Info *server, next_buffer = (char *)cifs_buf_get(); else next_buffer = (char *)cifs_small_buf_get(); + if (!next_buffer) { + cifs_server_dbg(VFS, "No memory for (large) SMB response\n"); + return -1; + } memcpy(next_buffer, buf + next_cmd, pdu_length - next_cmd); } -- GitLab From bd30e8d7bfa6e528f9e746c940e6f7246c7899d6 Mon Sep 17 00:00:00 2001 From: Hsin-chen Chuang Date: Fri, 14 Feb 2025 19:17:09 +0800 Subject: [PATCH 803/989] Bluetooth: Always allow SCO packets for user channel The SCO packets from Bluetooth raw socket are now rejected because hci_conn_num is left 0. 
This patch allows such the usecase to enable the userspace SCO support. Fixes: b16b327edb4d ("Bluetooth: btusb: add sysfs attribute to control USB alt setting") Signed-off-by: Hsin-chen Chuang Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btusb.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 90966dfbd2781..8149e53fd0a76 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -2102,7 +2102,8 @@ static int btusb_send_frame(struct hci_dev *hdev, struct sk_buff *skb) return submit_or_queue_tx_urb(hdev, urb); case HCI_SCODATA_PKT: - if (hci_conn_num(hdev, SCO_LINK) < 1) + if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && + hci_conn_num(hdev, SCO_LINK) < 1) return -ENODEV; urb = alloc_isoc_urb(hdev, skb); @@ -2576,7 +2577,8 @@ static int btusb_send_frame_intel(struct hci_dev *hdev, struct sk_buff *skb) return submit_or_queue_tx_urb(hdev, urb); case HCI_SCODATA_PKT: - if (hci_conn_num(hdev, SCO_LINK) < 1) + if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && + hci_conn_num(hdev, SCO_LINK) < 1) return -ENODEV; urb = alloc_isoc_urb(hdev, skb); -- GitLab From b25120e1d5f2ebb3db00af557709041f47f7f3d0 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 14 Feb 2025 10:30:25 -0500 Subject: [PATCH 804/989] Bluetooth: L2CAP: Fix L2CAP_ECRED_CONN_RSP response L2CAP_ECRED_CONN_RSP needs to respond DCID in the same order received as SCID but the order is reversed due to use of list_add which actually prepend channels to the list so the response is reversed: > ACL Data RX: Handle 16 flags 0x02 dlen 26 LE L2CAP: Enhanced Credit Connection Request (0x17) ident 2 len 18 PSM: 39 (0x0027) MTU: 256 MPS: 251 Credits: 65535 Source CID: 116 Source CID: 117 Source CID: 118 Source CID: 119 Source CID: 120 < ACL Data TX: Handle 16 flags 0x00 dlen 26 LE L2CAP: Enhanced Credit Connection Response (0x18) ident 2 len 18 MTU: 517 MPS: 247 Credits: 3 Result: Connection successful (0x0000) Destination CID: 68 Destination CID: 67 Destination CID: 66 Destination CID: 65 Destination CID: 64 Also make sure the response don't include channels that are not on BT_CONNECT2 since the chan->ident can be set to the same value as in the following trace: < ACL Data TX: Handle 16 flags 0x00 dlen 12 LE L2CAP: LE Flow Control Credit (0x16) ident 6 len 4 Source CID: 64 Credits: 1 ... 
> ACL Data RX: Handle 16 flags 0x02 dlen 18 LE L2CAP: Enhanced Credit Connection Request (0x17) ident 6 len 10 PSM: 39 (0x0027) MTU: 517 MPS: 251 Credits: 255 Source CID: 70 < ACL Data TX: Handle 16 flags 0x00 dlen 20 LE L2CAP: Enhanced Credit Connection Response (0x18) ident 6 len 12 MTU: 517 MPS: 247 Credits: 3 Result: Connection successful (0x0000) Destination CID: 64 Destination CID: 68 Closes: https://github.com/bluez/bluez/issues/1094 Fixes: 9aa9d9473f15 ("Bluetooth: L2CAP: Fix responding with wrong PDU type") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index fec11e576f310..b22078b679726 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -632,7 +632,8 @@ void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan) test_bit(FLAG_HOLD_HCI_CONN, &chan->flags)) hci_conn_hold(conn->hcon); - list_add(&chan->list, &conn->chan_l); + /* Append to the list since the order matters for ECRED */ + list_add_tail(&chan->list, &conn->chan_l); } void l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan) @@ -3771,7 +3772,11 @@ static void l2cap_ecred_rsp_defer(struct l2cap_chan *chan, void *data) struct l2cap_ecred_conn_rsp *rsp_flex = container_of(&rsp->pdu.rsp, struct l2cap_ecred_conn_rsp, hdr); - if (test_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags)) + /* Check if channel for outgoing connection or if it wasn't deferred + * since in those cases it must be skipped. + */ + if (test_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags) || + !test_and_clear_bit(FLAG_DEFER_SETUP, &chan->flags)) return; /* Reset ident so only one response is sent */ -- GitLab From 511a3444f72efdc51fa923c4b1f5f0abd545fb20 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Thu, 20 Feb 2025 15:07:57 +0100 Subject: [PATCH 805/989] MAINTAINERS: Add entry for DMEM cgroup controller MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cgroups controller is currently maintained through the drm-misc tree, so lets add Maxime Ripard, Natalie Vock and me as specific maintainers for dmem. We keep the cgroup mailing list CC'd on all cgroup specific patches. 
Acked-by: Maxime Ripard Acked-by: Natalie Vock Acked-by: Tejun Heo Acked-by: Johannes Weiner Acked-by: Michal Koutný Link: https://patchwork.freedesktop.org/patch/msgid/20250220140757.16823-1-dev@lankhorst.se Signed-off-by: Maarten Lankhorst --- .mailmap | 1 + MAINTAINERS | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/.mailmap b/.mailmap index 399322897938d..b71ee37f805d3 100644 --- a/.mailmap +++ b/.mailmap @@ -502,6 +502,7 @@ Nadav Amit Nadia Yvette Chambers William Lee Irwin III Naoya Horiguchi Naoya Horiguchi +Natalie Vock Nathan Chancellor Naveen N Rao Naveen N Rao diff --git a/MAINTAINERS b/MAINTAINERS index 18ade2ea4f3c4..473e7814a2925 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5878,6 +5878,17 @@ F: tools/testing/selftests/cgroup/test_cpuset.c F: tools/testing/selftests/cgroup/test_cpuset_prs.sh F: tools/testing/selftests/cgroup/test_cpuset_v1_base.sh +CONTROL GROUP - DEVICE MEMORY CONTROLLER (DMEM) +M: Maarten Lankhorst +M: Maxime Ripard +M: Natalie Vock +L: cgroups@vger.kernel.org +L: dri-devel@lists.freedesktop.org +S: Maintained +T: git https://gitlab.freedesktop.org/drm/misc/kernel.git +F: include/linux/cgroup_dmem.h +F: kernel/cgroup/dmem.c + CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG) M: Johannes Weiner M: Michal Hocko -- GitLab From 992ee3ed6e9fdd0be83a7daa5ff738e3cf86047f Mon Sep 17 00:00:00 2001 From: George Moussalem Date: Wed, 19 Feb 2025 14:09:21 +0100 Subject: [PATCH 806/989] net: phy: qcom: qca807x fix condition for DAC_DSP_BIAS_CURRENT While setting the DAC value, the wrong boolean value is evaluated to set the DSP bias current. So let's correct the conditional statement and use the right boolean value read from the DTS set in the priv. Cc: stable@vger.kernel.org Fixes: d1cb613efbd3 ("net: phy: qcom: add support for QCA807x PHY Family") Signed-off-by: George Moussalem Signed-off-by: Christian Marangi Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250219130923.7216-1-ansuelsmth@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/qcom/qca807x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/qcom/qca807x.c b/drivers/net/phy/qcom/qca807x.c index 3279de857b474..2ad8c2586d643 100644 --- a/drivers/net/phy/qcom/qca807x.c +++ b/drivers/net/phy/qcom/qca807x.c @@ -774,7 +774,7 @@ static int qca807x_config_init(struct phy_device *phydev) control_dac &= ~QCA807X_CONTROL_DAC_MASK; if (!priv->dac_full_amplitude) control_dac |= QCA807X_CONTROL_DAC_DSP_AMPLITUDE; - if (!priv->dac_full_amplitude) + if (!priv->dac_full_bias_current) control_dac |= QCA807X_CONTROL_DAC_DSP_BIAS_CURRENT; if (!priv->dac_disable_bias_current_tweak) control_dac |= QCA807X_CONTROL_DAC_BIAS_CURRENT_TWEAK; -- GitLab From e31e3f6c0ce473f7ce1e70d54ac8e3ed190509f8 Mon Sep 17 00:00:00 2001 From: Haoxiang Li Date: Thu, 20 Feb 2025 16:17:14 +0800 Subject: [PATCH 807/989] soc: loongson: loongson2_guts: Add check for devm_kstrdup() Add check for the return value of devm_kstrdup() in loongson2_guts_probe() to catch potential exception. 
Fixes: b82621ac8450 ("soc: loongson: add GUTS driver for loongson-2 platforms") Cc: stable@vger.kernel.org Signed-off-by: Haoxiang Li Link: https://lore.kernel.org/r/20250220081714.2676828-1-haoxiang_li2024@163.com Signed-off-by: Arnd Bergmann --- drivers/soc/loongson/loongson2_guts.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/soc/loongson/loongson2_guts.c b/drivers/soc/loongson/loongson2_guts.c index ae42e3a9127fc..16913c3ef65ca 100644 --- a/drivers/soc/loongson/loongson2_guts.c +++ b/drivers/soc/loongson/loongson2_guts.c @@ -114,8 +114,11 @@ static int loongson2_guts_probe(struct platform_device *pdev) if (of_property_read_string(root, "model", &machine)) of_property_read_string_index(root, "compatible", 0, &machine); of_node_put(root); - if (machine) + if (machine) { soc_dev_attr.machine = devm_kstrdup(dev, machine, GFP_KERNEL); + if (!soc_dev_attr.machine) + return -ENOMEM; + } svr = loongson2_guts_get_svr(); soc_die = loongson2_soc_die_match(svr, loongson2_soc_die); -- GitLab From 68aaa637162787dc3374080efe03366f70b344f1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Feb 2025 09:17:17 -0500 Subject: [PATCH 808/989] bcachefs: print op->nonce on data update inconsistency "nonce inconstancy" is popping up again, causing us to go emergency read-only. This one looks less serious, i.e. specific to the encryption path and not indicative of a data corruption bug. But we'll need more info to track it down. Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 337494facac64..642fbc60ecab1 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -340,6 +340,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, struct printbuf buf = PRINTBUF; prt_str(&buf, "about to insert invalid key in data update path"); + prt_printf(&buf, "\nop.nonce: %u", m->op.nonce); prt_str(&buf, "\nold: "); bch2_bkey_val_to_text(&buf, c, old); prt_str(&buf, "\nk: "); -- GitLab From c522093b02835f2e897b83e9764e7919edac5d08 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Thu, 20 Feb 2025 18:56:08 +0800 Subject: [PATCH 809/989] bcachefs: Fix memmove when move keys down The fix alone doesn't fix [1], but should be applied before debugging that. [1] https://syzkaller.appspot.com/bug?extid=38a0cbd267eff2d286ff Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index e371e60e3133e..dece27d9db04e 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -996,7 +996,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, } got_good_key: le16_add_cpu(&i->u64s, -next_good_key); - memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); + memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k); set_btree_node_need_rewrite(b); } fsck_err: -- GitLab From b522f180ee2b264b771fcbd0ab67d84cdd9e580d Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 31 Jan 2025 11:07:31 -0800 Subject: [PATCH 810/989] MAINTAINERS: Change maintainer for RDT Due to job transition, I am stepping down as RDT maintainer. Add Tony as a co-maintainer. 
Signed-off-by: Fenghua Yu Signed-off-by: Dave Hansen Acked-by: Reinette Chatre Acked-by: Tony Luck Link: https://lore.kernel.org/all/20250131190731.3981085-1-fenghua.yu%40intel.com --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index feed152470f68..d1cbaeb58143a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19779,7 +19779,7 @@ F: net/rds/ F: tools/testing/selftests/net/rds/ RDT - RESOURCE ALLOCATION -M: Fenghua Yu +M: Tony Luck M: Reinette Chatre L: linux-kernel@vger.kernel.org S: Supported -- GitLab From f06e4bfd010faefa637689d2df2c727dbf6e1d27 Mon Sep 17 00:00:00 2001 From: Qunqin Zhao Date: Wed, 19 Feb 2025 10:07:01 +0800 Subject: [PATCH 811/989] net: stmmac: dwmac-loongson: Add fix_soc_reset() callback Loongson's DWMAC device may take nearly two seconds to complete DMA reset, however, the default waiting time for reset is 200 milliseconds. Therefore, the following error message may appear: [14.427169] dwmac-loongson-pci 0000:00:03.2: Failed to reset the dma Fixes: 803fc61df261 ("net: stmmac: dwmac-loongson: Add Loongson Multi-channels GMAC support") Cc: stable@vger.kernel.org Signed-off-by: Qunqin Zhao Reviewed-by: Huacai Chen Reviewed-by: Jacob Keller Acked-by: Yanteng Si Link: https://patch.msgid.link/20250219020701.15139-1-zhaoqunqin@loongson.cn Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/dwmac-loongson.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c index bfe6e2d631bdf..f5acfb7d4ff65 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c @@ -516,6 +516,19 @@ static int loongson_dwmac_acpi_config(struct pci_dev *pdev, return 0; } +/* Loongson's DWMAC device may take nearly two seconds to complete DMA reset */ +static int loongson_dwmac_fix_reset(void *priv, void __iomem *ioaddr) +{ + u32 value = readl(ioaddr + DMA_BUS_MODE); + + value |= DMA_BUS_MODE_SFT_RESET; + writel(value, ioaddr + DMA_BUS_MODE); + + return readl_poll_timeout(ioaddr + DMA_BUS_MODE, value, + !(value & DMA_BUS_MODE_SFT_RESET), + 10000, 2000000); +} + static int loongson_dwmac_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct plat_stmmacenet_data *plat; @@ -566,6 +579,7 @@ static int loongson_dwmac_probe(struct pci_dev *pdev, const struct pci_device_id plat->bsp_priv = ld; plat->setup = loongson_dwmac_setup; + plat->fix_soc_reset = loongson_dwmac_fix_reset; ld->dev = &pdev->dev; ld->loongson_id = readl(res.addr + GMAC_VERSION) & 0xff; -- GitLab From dce5c4afd035e8090a26e5d776b1682c0e649683 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 17 Feb 2025 10:16:28 +0800 Subject: [PATCH 812/989] scsi: core: Clear driver private data when retrying request After commit 1bad6c4a57ef ("scsi: zero per-cmd private driver data for each MQ I/O"), the xen-scsifront/virtio_scsi/snic drivers all removed code that explicitly zeroed driver-private command data. In combination with commit 464a00c9e0ad ("scsi: core: Kill DRIVER_SENSE"), after virtio_scsi performs a capacity expansion, the first request will return a unit attention to indicate that the capacity has changed. And then the original command is retried. As driver-private command data was not cleared, the request would return UA again and eventually time out and fail. Zero driver-private command data when a request is retried. 
Fixes: f7de50da1479 ("scsi: xen-scsifront: Remove code that zeroes driver-private command data") Fixes: c2bb87318baa ("scsi: virtio_scsi: Remove code that zeroes driver-private command data") Fixes: c3006a926468 ("scsi: snic: Remove code that zeroes driver-private command data") Signed-off-by: Ye Bin Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20250217021628.2929248-1-yebin@huaweicloud.com Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index be0890e4e7062..f1cfe0bb89b20 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1669,13 +1669,6 @@ static blk_status_t scsi_prepare_cmd(struct request *req) if (in_flight) __set_bit(SCMD_STATE_INFLIGHT, &cmd->state); - /* - * Only clear the driver-private command data if the LLD does not supply - * a function to initialize that data. - */ - if (!shost->hostt->init_cmd_priv) - memset(cmd + 1, 0, shost->hostt->cmd_size); - cmd->prot_op = SCSI_PROT_NORMAL; if (blk_rq_bytes(req)) cmd->sc_data_direction = rq_dma_dir(req); @@ -1842,6 +1835,13 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, if (!scsi_host_queue_ready(q, shost, sdev, cmd)) goto out_dec_target_busy; + /* + * Only clear the driver-private command data if the LLD does not supply + * a function to initialize that data. + */ + if (shost->hostt->cmd_size && !shost->hostt->init_cmd_priv) + memset(cmd + 1, 0, shost->hostt->cmd_size); + if (!(req->rq_flags & RQF_DONTPREP)) { ret = scsi_prepare_cmd(req); if (ret != BLK_STS_OK) -- GitLab From fe06b7c07f3fbcce2a2ca6f7b0d543b5699ea00f Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Wed, 19 Feb 2025 16:20:47 +0530 Subject: [PATCH 813/989] scsi: ufs: core: Set default runtime/system PM levels before ufshcd_hba_init() Commit bb9850704c04 ("scsi: ufs: core: Honor runtime/system PM levels if set by host controller drivers") introduced the check for setting default PM levels only if the levels are uninitialized by the host controller drivers. But it missed the fact that the levels could be initialized to 0 (UFS_PM_LVL_0) on purpose by the controller drivers. Even though none of the drivers are doing so now, the logic should be fixed irrespectively. So set the default levels unconditionally before calling ufshcd_hba_init() API which initializes the controller drivers. It ensures that the controller drivers could override the default levels if required. Fixes: bb9850704c04 ("scsi: ufs: core: Honor runtime/system PM levels if set by host controller drivers") Reported-by: Bao D. Nguyen Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20250219105047.49932-1-manivannan.sadhasivam@linaro.org Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index f9303e66bb798..464f13da259aa 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -10431,6 +10431,21 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) */ spin_lock_init(&hba->clk_gating.lock); + /* + * Set the default power management level for runtime and system PM. + * Host controller drivers can override them in their + * 'ufs_hba_variant_ops::init' callback. 
+ * + * Default power saving mode is to keep UFS link in Hibern8 state + * and UFS device in sleep state. + */ + hba->rpm_lvl = ufs_get_desired_pm_lvl_for_dev_link_state( + UFS_SLEEP_PWR_MODE, + UIC_LINK_HIBERN8_STATE); + hba->spm_lvl = ufs_get_desired_pm_lvl_for_dev_link_state( + UFS_SLEEP_PWR_MODE, + UIC_LINK_HIBERN8_STATE); + err = ufshcd_hba_init(hba); if (err) goto out_error; @@ -10544,21 +10559,6 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) goto out_disable; } - /* - * Set the default power management level for runtime and system PM if - * not set by the host controller drivers. - * Default power saving mode is to keep UFS link in Hibern8 state - * and UFS device in sleep state. - */ - if (!hba->rpm_lvl) - hba->rpm_lvl = ufs_get_desired_pm_lvl_for_dev_link_state( - UFS_SLEEP_PWR_MODE, - UIC_LINK_HIBERN8_STATE); - if (!hba->spm_lvl) - hba->spm_lvl = ufs_get_desired_pm_lvl_for_dev_link_state( - UFS_SLEEP_PWR_MODE, - UIC_LINK_HIBERN8_STATE); - INIT_DELAYED_WORK(&hba->rpm_dev_flush_recheck_work, ufshcd_rpm_dev_flush_recheck_work); INIT_DELAYED_WORK(&hba->ufs_rtc_update_work, ufshcd_rtc_work); -- GitLab From f27a95845b01e86d67c8b014b4f41bd3327daa63 Mon Sep 17 00:00:00 2001 From: Arthur Simchaev Date: Thu, 20 Feb 2025 16:20:39 +0200 Subject: [PATCH 814/989] scsi: ufs: core: bsg: Fix crash when arpmb command fails If the device doesn't support arpmb we'll crash due to copying user data in bsg_transport_sg_io_fn(). In the case where ufs_bsg_exec_advanced_rpmb_req() returns an error, do not set the job's reply_len. Memory crash backtrace: 3,1290,531166405,-;ufshcd 0000:00:12.5: ARPMB OP failed: error code -22 4,1308,531166555,-;Call Trace: 4,1309,531166559,-; 4,1310,531166565,-; ? show_regs+0x6d/0x80 4,1311,531166575,-; ? die+0x37/0xa0 4,1312,531166583,-; ? do_trap+0xd4/0xf0 4,1313,531166593,-; ? do_error_trap+0x71/0xb0 4,1314,531166601,-; ? usercopy_abort+0x6c/0x80 4,1315,531166610,-; ? exc_invalid_op+0x52/0x80 4,1316,531166622,-; ? usercopy_abort+0x6c/0x80 4,1317,531166630,-; ? asm_exc_invalid_op+0x1b/0x20 4,1318,531166643,-; ? usercopy_abort+0x6c/0x80 4,1319,531166652,-; __check_heap_object+0xe3/0x120 4,1320,531166661,-; check_heap_object+0x185/0x1d0 4,1321,531166670,-; __check_object_size.part.0+0x72/0x150 4,1322,531166679,-; __check_object_size+0x23/0x30 4,1323,531166688,-; bsg_transport_sg_io_fn+0x314/0x3b0 Fixes: 6ff265fc5ef6 ("scsi: ufs: core: bsg: Add advanced RPMB support in ufs_bsg") Cc: stable@vger.kernel.org Reviewed-by: Bean Huo Signed-off-by: Arthur Simchaev Link: https://lore.kernel.org/r/20250220142039.250992-1-arthur.simchaev@sandisk.com Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufs_bsg.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/ufs/core/ufs_bsg.c b/drivers/ufs/core/ufs_bsg.c index 8d4ad0a3f2cf0..252186124669a 100644 --- a/drivers/ufs/core/ufs_bsg.c +++ b/drivers/ufs/core/ufs_bsg.c @@ -194,10 +194,12 @@ static int ufs_bsg_request(struct bsg_job *job) ufshcd_rpm_put_sync(hba); kfree(buff); bsg_reply->result = ret; - job->reply_len = !rpmb ? sizeof(struct ufs_bsg_reply) : sizeof(struct ufs_rpmb_reply); /* complete the job here only if no error */ - if (ret == 0) + if (ret == 0) { + job->reply_len = rpmb ? 
sizeof(struct ufs_rpmb_reply) : + sizeof(struct ufs_bsg_reply); bsg_job_done(job, ret, bsg_reply->reply_payload_rcv_len); + } return ret; } -- GitLab From 59f37036bb7ab3d554c24abc856aabca01126414 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 15 Feb 2025 11:36:15 +0000 Subject: [PATCH 815/989] btrfs: fix use-after-free on inode when scanning root during em shrinking At btrfs_scan_root() we are accessing the inode's root (and fs_info) in a call to btrfs_fs_closing() after we have scheduled the inode for a delayed iput, and that can result in a use-after-free on the inode in case the cleaner kthread does the iput before we dereference the inode in the call to btrfs_fs_closing(). Fix this by using the fs_info stored already in a local variable instead of doing inode->root->fs_info. Fixes: 102044384056 ("btrfs: make the extent map shrinker run asynchronously as a work queue job") CC: stable@vger.kernel.org # 6.13+ Tested-by: Ivan Shapovalov Link: https://lore.kernel.org/linux-btrfs/0414d690ac5680d0d77dfc930606cdc36e42e12f.camel@intelfx.name/ Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 67ce85ff0ae25..bee1b94a10495 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1222,8 +1222,7 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx fs_info->em_shrinker_last_ino = btrfs_ino(inode); btrfs_add_delayed_iput(inode); - if (ctx->scanned >= ctx->nr_to_scan || - btrfs_fs_closing(inode->root->fs_info)) + if (ctx->scanned >= ctx->nr_to_scan || btrfs_fs_closing(fs_info)) break; cond_resched(); -- GitLab From c6c9c4d56483d941f567eb921434c25fc6086dfa Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 15 Feb 2025 11:04:15 +0000 Subject: [PATCH 816/989] btrfs: skip inodes without loaded extent maps when shrinking extent maps If there are inodes that don't have any loaded extent maps, we end up grabbing a reference on them and later adding a delayed iput, which wakes up the cleaner and makes it do unnecessary work. This is common when for example the inodes were open only to run stat(2) or all their extent maps were already released through the folio release callback (btrfs_release_folio()) or released by a previous run of the shrinker, or directories which never have extent maps. 
Reported-by: Ivan Shapovalov Tested-by: Ivan Shapovalov Link: https://lore.kernel.org/linux-btrfs/0414d690ac5680d0d77dfc930606cdc36e42e12f.camel@intelfx.name/ CC: stable@vger.kernel.org # 6.13+ Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 78 +++++++++++++++++++++++++++++++------------ 1 file changed, 57 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index bee1b94a10495..8c6b85ffd18f6 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1128,6 +1128,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c long nr_dropped = 0; struct rb_node *node; + lockdep_assert_held_write(&tree->lock); + /* * Take the mmap lock so that we serialize with the inode logging phase * of fsync because we may need to set the full sync flag on the inode, @@ -1139,28 +1141,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c * to find new extents, which may not be there yet because ordered * extents haven't completed yet. * - * We also do a try lock because otherwise we could deadlock. This is - * because the shrinker for this filesystem may be invoked while we are - * in a path that is holding the mmap lock in write mode. For example in - * a reflink operation while COWing an extent buffer, when allocating - * pages for a new extent buffer and under memory pressure, the shrinker - * may be invoked, and therefore we would deadlock by attempting to read - * lock the mmap lock while we are holding already a write lock on it. + * We also do a try lock because we don't want to block for too long and + * we are holding the extent map tree's lock in write mode. */ if (!down_read_trylock(&inode->i_mmap_lock)) return 0; - /* - * We want to be fast so if the lock is busy we don't want to spend time - * waiting for it - either some task is about to do IO for the inode or - * we may have another task shrinking extent maps, here in this code, so - * skip this inode. - */ - if (!write_trylock(&tree->lock)) { - up_read(&inode->i_mmap_lock); - return 0; - } - node = rb_first(&tree->root); while (node) { struct rb_node *next = rb_next(node); @@ -1201,12 +1187,61 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c break; node = next; } - write_unlock(&tree->lock); up_read(&inode->i_mmap_lock); return nr_dropped; } +static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root, + u64 min_ino) +{ + struct btrfs_inode *inode; + unsigned long from = min_ino; + + xa_lock(&root->inodes); + while (true) { + struct extent_map_tree *tree; + + inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT); + if (!inode) + break; + + tree = &inode->extent_tree; + + /* + * We want to be fast so if the lock is busy we don't want to + * spend time waiting for it (some task is about to do IO for + * the inode). + */ + if (!write_trylock(&tree->lock)) + goto next; + + /* + * Skip inode if it doesn't have loaded extent maps, so we avoid + * getting a reference and doing an iput later. This includes + * cases like files that were opened for things like stat(2), or + * files with all extent maps previously released through the + * release folio callback (btrfs_release_folio()) or released in + * a previous run, or directories which never have extent maps. 
+ */ + if (RB_EMPTY_ROOT(&tree->root)) { + write_unlock(&tree->lock); + goto next; + } + + if (igrab(&inode->vfs_inode)) + break; + + write_unlock(&tree->lock); +next: + from = btrfs_ino(inode) + 1; + cond_resched_lock(&root->inodes.xa_lock); + } + xa_unlock(&root->inodes); + + return inode; +} + static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -1214,9 +1249,10 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx long nr_dropped = 0; u64 min_ino = fs_info->em_shrinker_last_ino + 1; - inode = btrfs_find_first_inode(root, min_ino); + inode = find_first_inode_to_shrink(root, min_ino); while (inode) { nr_dropped += btrfs_scan_inode(inode, ctx); + write_unlock(&inode->extent_tree.lock); min_ino = btrfs_ino(inode) + 1; fs_info->em_shrinker_last_ino = btrfs_ino(inode); @@ -1227,7 +1263,7 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx cond_resched(); - inode = btrfs_find_first_inode(root, min_ino); + inode = find_first_inode_to_shrink(root, min_ino); } if (inode) { -- GitLab From 15b3b3254d1453a8db038b7d44b311a2d6c71f98 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 15 Feb 2025 11:11:29 +0000 Subject: [PATCH 817/989] btrfs: do regular iput instead of delayed iput during extent map shrinking The extent map shrinker now runs in the system unbound workqueue and no longer in kswapd context so it can directly do an iput() on inodes even if that blocks or needs to acquire any lock (we aren't holding any locks when requesting the delayed iput from the shrinker). So we don't need to add a delayed iput, wake up the cleaner and delegate the iput() to the cleaner, which also adds extra contention on the spinlock that protects the delayed iputs list. Reported-by: Ivan Shapovalov Tested-by: Ivan Shapovalov Link: https://lore.kernel.org/linux-btrfs/0414d690ac5680d0d77dfc930606cdc36e42e12f.camel@intelfx.name/ CC: stable@vger.kernel.org # 6.12+ Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 8c6b85ffd18f6..7f46abbd6311b 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1256,7 +1256,7 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx min_ino = btrfs_ino(inode) + 1; fs_info->em_shrinker_last_ino = btrfs_ino(inode); - btrfs_add_delayed_iput(inode); + iput(&inode->vfs_inode); if (ctx->scanned >= ctx->nr_to_scan || btrfs_fs_closing(fs_info)) break; -- GitLab From b1bf18223a8340cf5d52162d320badcfe07b905d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 17 Feb 2025 20:16:39 +1030 Subject: [PATCH 818/989] btrfs: output an error message if btrfs failed to find the seed fsid [BUG] If btrfs failed to locate the seed device for whatever reason, mounting the sprouted device will fail without any meaning error message: # mkfs.btrfs -f /dev/test/scratch1 # btrfstune -S1 /dev/test/scratch1 # mount /dev/test/scratch1 /mnt/btrfs # btrfs dev add -f /dev/test/scratch2 /mnt/btrfs # umount /mnt/btrfs # btrfs dev scan -u # btrfs mount /dev/test/scratch2 /mnt/btrfs mount: /mnt/btrfs: fsconfig system call failed: No such file or directory. dmesg(1) may have more information after failed mount system call. 
# dmesg -t | tail -n6
BTRFS info (device dm-5): first mount of filesystem 64252ded-5953-4868-b962-cea48f7ac4ea
BTRFS info (device dm-5): using crc32c (crc32c-generic) checksum algorithm
BTRFS info (device dm-5): using free-space-tree
BTRFS error (device dm-5): failed to read chunk tree: -2
BTRFS error (device dm-5): open_ctree failed: -2
[CAUSE] The failure to mount is pretty straightforward: we are just unable to find the seed device and its fsid, caused by `btrfs dev scan -u`. But the lack of any useful info is a problem.
[FIX] Just add an extra error message in open_seed_devices() to indicate the error. Now the error message would look like this:
BTRFS info (device dm-4): first mount of filesystem 7769223d-4db1-4e4c-ac29-0a96f53576ab
BTRFS info (device dm-4): using crc32c (crc32c-generic) checksum algorithm
BTRFS info (device dm-4): using free-space-tree
BTRFS error (device dm-4): failed to find fsid e87c12e6-584b-4e98-8b88-962c33a619ff when attempting to open seed devices
BTRFS error (device dm-4): failed to read chunk tree: -2
BTRFS error (device dm-4): open_ctree failed: -2
Link: https://github.com/kdave/btrfs-progs/issues/959 Reviewed-by: Anand Jain Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba
--- fs/btrfs/volumes.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a594f66daedf0..f6ae76815e4b5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c
@@ -7196,8 +7196,12 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, fs_devices = find_fsid(fsid, NULL); if (!fs_devices) { - if (!btrfs_test_opt(fs_info, DEGRADED)) + if (!btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_err(fs_info, + "failed to find fsid %pU when attempting to open seed devices", + fsid); return ERR_PTR(-ENOENT); + } fs_devices = alloc_fs_devices(fsid); if (IS_ERR(fs_devices))
-- GitLab From efa11fd269c139e29b71ec21bc9c9c0063fde40d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 19 Feb 2025 09:06:33 +1030 Subject: [PATCH 819/989] btrfs: fix data overwriting bug during buffered write when block size < page size
[BUG] When running generic/418 with a btrfs whose block size < page size (subpage cases), it always fails. And the following minimal reproducer is more than enough to trigger it reliably:
workload() { mkfs.btrfs -s 4k -f $dev > /dev/null dmesg -C mount $dev $mnt $fsstree_dir/src/dio-invalidate-cache -r -b 4096 -n 3 -i 1 -f $mnt/diotest ret=$?
umount $mnt stop_trace if [ $ret -ne 0 ]; then fail fi } for (( i = 0; i < 1024; i++)); do echo "=== $i/$runtime ===" workload done [CAUSE] With extra trace printk added to the following functions: - btrfs_buffered_write() * Which folio is touched * The file offset (start) where the buffered write is at * How many bytes are copied * The content of the write (the first 2 bytes) - submit_one_sector() * Which folio is touched * The position inside the folio * The content of the page cache (the first 2 bytes) - pagecache_isize_extended() * The parameters of the function itself * The parameters of the folio_zero_range() Which are enough to show the problem: 22.158114: btrfs_buffered_write: folio pos=0 start=0 copied=4096 content=0x0101 22.158161: submit_one_sector: r/i=5/257 folio=0 pos=0 content=0x0101 22.158609: btrfs_buffered_write: folio pos=0 start=4096 copied=4096 content=0x0101 22.158634: btrfs_buffered_write: folio pos=0 start=8192 copied=4096 content=0x0101 22.158650: pagecache_isize_extended: folio=0 from=4096 to=8192 bsize=4096 zero off=4096 len=8192 22.158682: submit_one_sector: r/i=5/257 folio=0 pos=4096 content=0x0000 22.158686: submit_one_sector: r/i=5/257 folio=0 pos=8192 content=0x0101 The tool dio-invalidate-cache will start 3 threads, each doing a buffered write with 0x01 at offset 0, 4096 and 8192, do a fsync, then do a direct read, and compare the read buffer with the write buffer. Note that all 3 btrfs_buffered_write() are writing the correct 0x01 into the page cache. But at submit_one_sector(), at file offset 4096, the content is zeroed out, by pagecache_isize_extended(). The race happens like this: Thread A is writing into range [4K, 8K). Thread B is writing into range [8K, 12k). Thread A | Thread B -------------------------------------+------------------------------------ btrfs_buffered_write() | btrfs_buffered_write() |- old_isize = 4K; | |- old_isize = 4096; |- btrfs_inode_lock() | | |- write into folio range [4K, 8K) | | |- pagecache_isize_extended() | | | extend isize from 4096 to 8192 | | | no folio_zero_range() called | | |- btrfs_inode_lock() | | | |- btrfs_inode_lock() | |- write into folio range [8K, 12K) | |- pagecache_isize_extended() | | calling folio_zero_range(4K, 8K) | | This is caused by the old_isize is | | grabbed too early, without any | | inode lock. | |- btrfs_inode_unlock() The @old_isize is grabbed without inode lock, causing race between two buffered write threads and making pagecache_isize_extended() to zero range which is still containing cached data. And this is only affecting subpage btrfs, because for regular blocksize == page size case, the function pagecache_isize_extended() will do nothing if the block size >= page size. [FIX] Grab the old i_size while holding the inode lock. This means each buffered write thread will have a stable view of the old inode size, thus avoid the above race. 
CC: stable@vger.kernel.org # 5.15+ Fixes: 5e8b9ef30392 ("btrfs: move pos increment and pagecache extension to btrfs_buffered_write") Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/file.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ed3c0d6546c5d..0b568c8d24cbc 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1090,7 +1090,7 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) u64 lockend; size_t num_written = 0; ssize_t ret; - loff_t old_isize = i_size_read(inode); + loff_t old_isize; unsigned int ilock_flags = 0; const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); @@ -1103,6 +1103,13 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) if (ret < 0) return ret; + /* + * We can only trust the isize with inode lock held, or it can race with + * other buffered writes and cause incorrect call of + * pagecache_isize_extended() to overwrite existing data. + */ + old_isize = i_size_read(inode); + ret = generic_write_checks(iocb, i); if (ret <= 0) goto out; -- GitLab From e9a48ea4d90be251e0d057d41665745caccb0351 Mon Sep 17 00:00:00 2001 From: Stephan Gerhold Date: Tue, 18 Feb 2025 16:59:18 +0100 Subject: [PATCH 820/989] irqchip/qcom-pdc: Workaround hardware register bug on X1E80100 On X1E80100, there is a hardware bug in the register logic of the IRQ_ENABLE_BANK register: While read accesses work on the normal address, all write accesses must be made to a shifted address. Without a workaround for this, the wrong interrupt gets enabled in the PDC and it is impossible to wakeup from deep suspend (CX collapse). This has not caused problems so far, because the deep suspend state was not enabled. A workaround is required now since work is ongoing to fix this. The PDC has multiple "DRV" regions, each one has a size of 0x10000 and provides the same set of registers for a particular client in the system. Linux is one the clients and uses DRV region 2 on X1E. Each "bank" inside the DRV region consists of 32 interrupt pins that can be enabled using the IRQ_ENABLE_BANK register: IRQ_ENABLE_BANK[bank] = base + IRQ_ENABLE_BANK + bank * sizeof(u32) On X1E, this works as intended for read access. 
However, write access to most banks is shifted by 2:
IRQ_ENABLE_BANK_X1E[0] = IRQ_ENABLE_BANK[-2]
IRQ_ENABLE_BANK_X1E[1] = IRQ_ENABLE_BANK[-1]
IRQ_ENABLE_BANK_X1E[2] = IRQ_ENABLE_BANK[0] = IRQ_ENABLE_BANK[2 - 2]
IRQ_ENABLE_BANK_X1E[3] = IRQ_ENABLE_BANK[1] = IRQ_ENABLE_BANK[3 - 2]
IRQ_ENABLE_BANK_X1E[4] = IRQ_ENABLE_BANK[2] = IRQ_ENABLE_BANK[4 - 2]
IRQ_ENABLE_BANK_X1E[5] = IRQ_ENABLE_BANK[5] (this one works as intended)
The negative indexes underflow to banks of the previous DRV/client region:
IRQ_ENABLE_BANK_X1E[drv 2][bank 0] = IRQ_ENABLE_BANK[drv 2][bank -2] = IRQ_ENABLE_BANK[drv 1][bank 5-2] = IRQ_ENABLE_BANK[drv 1][bank 3] = IRQ_ENABLE_BANK[drv 1][bank 0 + 3]
IRQ_ENABLE_BANK_X1E[drv 2][bank 1] = IRQ_ENABLE_BANK[drv 2][bank -1] = IRQ_ENABLE_BANK[drv 1][bank 5-1] = IRQ_ENABLE_BANK[drv 1][bank 4] = IRQ_ENABLE_BANK[drv 1][bank 1 + 3]
Introduce a workaround for the bug by matching the qcom,x1e80100-pdc compatible and applying the offsets as shown above: - Bank 0...1: previous DRV region, bank += 3 - Bank 2...4: our DRV region, bank -= 2 - Bank 5: our DRV region, no fixup required
The PDC node in the device tree only describes the DRV region for the Linux client, but the workaround also requires mapping parts of the previous DRV region to issue writes there. To maintain compatibility with old device trees, obtain the base address of the preceding region by applying the -0x10000 offset. Note that this is also more correct from a conceptual point of view: it does not really make use of the other region; it just issues shifted writes that end up in the registers of the Linux-associated DRV region 2.
Signed-off-by: Stephan Gerhold Signed-off-by: Thomas Gleixner Tested-by: Johan Hovold Link: https://lore.kernel.org/all/20250218-x1e80100-pdc-hw-wa-v2-1-29be4c98e355@linaro.org
--- drivers/irqchip/qcom-pdc.c | 67 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 64 insertions(+), 3 deletions(-)
diff --git a/drivers/irqchip/qcom-pdc.c b/drivers/irqchip/qcom-pdc.c index 74b2f124116e3..52d77546aacb9 100644 --- a/drivers/irqchip/qcom-pdc.c +++ b/drivers/irqchip/qcom-pdc.c
@@ -21,9 +21,11 @@ #include #define PDC_MAX_GPIO_IRQS 256 +#define PDC_DRV_OFFSET 0x10000 /* Valid only on HW version < 3.2 */ #define IRQ_ENABLE_BANK 0x10 +#define IRQ_ENABLE_BANK_MAX (IRQ_ENABLE_BANK + BITS_TO_BYTES(PDC_MAX_GPIO_IRQS)) #define IRQ_i_CFG 0x110 /* Valid only on HW version >= 3.2 */
@@ -46,13 +48,20 @@ struct pdc_pin_region { static DEFINE_RAW_SPINLOCK(pdc_lock); static void __iomem *pdc_base; +static void __iomem *pdc_prev_base; static struct pdc_pin_region *pdc_region; static int pdc_region_cnt; static unsigned int pdc_version; +static bool pdc_x1e_quirk; + +static void pdc_base_reg_write(void __iomem *base, int reg, u32 i, u32 val) +{ + writel_relaxed(val, base + reg + i * sizeof(u32)); +} static void pdc_reg_write(int reg, u32 i, u32 val) { - writel_relaxed(val, pdc_base + reg + i * sizeof(u32)); + pdc_base_reg_write(pdc_base, reg, i, val); } static u32 pdc_reg_read(int reg, u32 i) @@ -60,6 +69,34 @@ static u32 pdc_reg_read(int reg, u32 i) return readl_relaxed(pdc_base + reg + i * sizeof(u32)); } +static void pdc_x1e_irq_enable_write(u32 bank, u32 enable) +{ + void __iomem *base; + + /* Remap the write access to work around a hardware bug on X1E */ + switch (bank) { + case 0 ... 1: + /* Use previous DRV (client) region and shift to bank 3-4 */ + base = pdc_prev_base; + bank += 3; + break; + case 2 ...
4: + /* Use our own region and shift to bank 0-2 */ + base = pdc_base; + bank -= 2; + break; + case 5: + /* No fixup required for bank 5 */ + base = pdc_base; + break; + default: + WARN_ON(1); + return; + } + + pdc_base_reg_write(base, IRQ_ENABLE_BANK, bank, enable); +} + static void __pdc_enable_intr(int pin_out, bool on) { unsigned long enable; @@ -72,7 +109,11 @@ static void __pdc_enable_intr(int pin_out, bool on) enable = pdc_reg_read(IRQ_ENABLE_BANK, index); __assign_bit(mask, &enable, on); - pdc_reg_write(IRQ_ENABLE_BANK, index, enable); + + if (pdc_x1e_quirk) + pdc_x1e_irq_enable_write(index, enable); + else + pdc_reg_write(IRQ_ENABLE_BANK, index, enable); } else { enable = pdc_reg_read(IRQ_i_CFG, pin_out); __assign_bit(IRQ_i_CFG_IRQ_ENABLE, &enable, on); @@ -324,10 +365,29 @@ static int qcom_pdc_init(struct device_node *node, struct device_node *parent) if (res_size > resource_size(&res)) pr_warn("%pOF: invalid reg size, please fix DT\n", node); + /* + * PDC has multiple DRV regions, each one provides the same set of + * registers for a particular client in the system. Due to a hardware + * bug on X1E, some writes to the IRQ_ENABLE_BANK register must be + * issued inside the previous region. This region belongs to + * a different client and is not described in the device tree. Map the + * region with the expected offset to preserve support for old DTs. + */ + if (of_device_is_compatible(node, "qcom,x1e80100-pdc")) { + pdc_prev_base = ioremap(res.start - PDC_DRV_OFFSET, IRQ_ENABLE_BANK_MAX); + if (!pdc_prev_base) { + pr_err("%pOF: unable to map previous PDC DRV region\n", node); + return -ENXIO; + } + + pdc_x1e_quirk = true; + } + pdc_base = ioremap(res.start, res_size); if (!pdc_base) { pr_err("%pOF: unable to map PDC registers\n", node); - return -ENXIO; + ret = -ENXIO; + goto fail; } pdc_version = pdc_reg_read(PDC_VERSION_REG, 0); @@ -363,6 +423,7 @@ static int qcom_pdc_init(struct device_node *node, struct device_node *parent) fail: kfree(pdc_region); iounmap(pdc_base); + iounmap(pdc_prev_base); return ret; } -- GitLab From d252435aca44d647d57b84de5108556f9c97614a Mon Sep 17 00:00:00 2001 From: BillXiang Date: Fri, 21 Feb 2025 18:45:38 +0800 Subject: [PATCH 821/989] riscv: KVM: Remove unnecessary vcpu kick MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the unnecessary kick to the vCPU after writing to the vs_file of IMSIC in kvm_riscv_vcpu_aia_imsic_inject. For vCPUs that are running, writing to the vs_file directly forwards the interrupt as an MSI to them and does not need an extra kick. For vCPUs that are descheduled after emulating WFI, KVM will enable the guest external interrupt for that vCPU in kvm_riscv_aia_wakeon_hgei. This means that writing to the vs_file will cause a guest external interrupt, which will cause KVM to wake up the vCPU in hgei_interrupt to handle the interrupt properly. 
Signed-off-by: BillXiang Reviewed-by: Andrew Jones Reviewed-by: Radim Krčmář Link: https://lore.kernel.org/r/20250221104538.2147-1-xiangwencheng@lanxincomputing.com Signed-off-by: Anup Patel --- arch/riscv/kvm/aia_imsic.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c index a8085cd8215e3..29ef9c2133a93 100644 --- a/arch/riscv/kvm/aia_imsic.c +++ b/arch/riscv/kvm/aia_imsic.c @@ -974,7 +974,6 @@ int kvm_riscv_vcpu_aia_imsic_inject(struct kvm_vcpu *vcpu, if (imsic->vsfile_cpu >= 0) { writel(iid, imsic->vsfile_va + IMSIC_MMIO_SETIPNUM_LE); - kvm_vcpu_kick(vcpu); } else { eix = &imsic->swfile->eix[iid / BITS_PER_TYPE(u64)]; set_bit(iid & (BITS_PER_TYPE(u64) - 1), eix->eip); -- GitLab From 8510edf191d2df0822ea22d6226e4eef87562271 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Tue, 18 Feb 2025 20:02:08 +0800 Subject: [PATCH 822/989] mm/filemap: fix miscalculated file range for filemap_fdatawrite_range_kick() iocb->ki_pos has been updated with the number of written bytes since generic_perform_write(). Besides __filemap_fdatawrite_range() accepts the inclusive end of the data range. Fixes: 1d4457576570 ("mm: call filemap_fdatawrite_range_kick() after IOCB_DONTCACHE issue") Signed-off-by: Jingbo Xu Link: https://lore.kernel.org/r/20250218120209.88093-2-jefflexu@linux.alibaba.com Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/fs.h | 4 ++-- mm/filemap.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 9346adf28f7bc..2788df98080f8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2975,8 +2975,8 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) } else if (iocb->ki_flags & IOCB_DONTCACHE) { struct address_space *mapping = iocb->ki_filp->f_mapping; - filemap_fdatawrite_range_kick(mapping, iocb->ki_pos, - iocb->ki_pos + count); + filemap_fdatawrite_range_kick(mapping, iocb->ki_pos - count, + iocb->ki_pos - 1); } return count; diff --git a/mm/filemap.c b/mm/filemap.c index 804d7365680c1..d4564a79eb353 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -445,7 +445,7 @@ EXPORT_SYMBOL(filemap_fdatawrite_range); * filemap_fdatawrite_range_kick - start writeback on a range * @mapping: target address_space * @start: index to start writeback on - * @end: last (non-inclusive) index for writeback + * @end: last (inclusive) index for writeback * * This is a non-integrity writeback helper, to start writing back folios * for the indicated range. -- GitLab From 927289988068a65ccc168eda881ce60f8712707b Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Tue, 18 Feb 2025 20:02:09 +0800 Subject: [PATCH 823/989] mm/truncate: don't skip dirty page in folio_unmap_invalidate() ... otherwise this is a behavior change for the previous callers of invalidate_complete_folio2(), e.g. the page invalidation routine. 
Fixes: 4a9e23159fd3 ("mm/truncate: add folio_unmap_invalidate() helper") Signed-off-by: Jingbo Xu Link: https://lore.kernel.org/r/20250218120209.88093-3-jefflexu@linux.alibaba.com Signed-off-by: Christian Brauner --- mm/truncate.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/truncate.c b/mm/truncate.c index e2e115adfbc58..76d8fcd89bd00 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -548,8 +548,6 @@ int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - if (folio_test_dirty(folio)) - return 0; if (folio_mapped(folio)) unmap_mapping_folio(folio); BUG_ON(folio_mapped(folio)); -- GitLab From 517120728484df1ab8b71cba8d2cad19f52f18a1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 19 Feb 2025 22:01:24 -0800 Subject: [PATCH 824/989] x86/cpufeatures: Make AVX-VNNI depend on AVX The 'noxsave' boot option disables support for AVX, but support for the AVX-VNNI feature was still declared on CPUs that support it. Fix this. Signed-off-by: Eric Biggers Signed-off-by: Ingo Molnar Cc: Dave Hansen Link: https://lore.kernel.org/r/20250220060124.89622-1-ebiggers@kernel.org --- arch/x86/kernel/cpu/cpuid-deps.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 8bd84114c2d96..df838e3bdbe02 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -45,6 +45,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AES, X86_FEATURE_XMM2 }, { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, { X86_FEATURE_GFNI, X86_FEATURE_XMM2 }, + { X86_FEATURE_AVX_VNNI, X86_FEATURE_AVX }, { X86_FEATURE_FMA, X86_FEATURE_AVX }, { X86_FEATURE_VAES, X86_FEATURE_AVX }, { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX }, -- GitLab From dc0a241ceaf3b7df6f1a7658b020c92682b75bfc Mon Sep 17 00:00:00 2001 From: Michael Jeanson Date: Wed, 19 Feb 2025 15:53:26 -0500 Subject: [PATCH 825/989] rseq: Fix rseq registration with CONFIG_DEBUG_RSEQ With CONFIG_DEBUG_RSEQ=y, at rseq registration the read-only fields are copied from user-space, if this copy fails the syscall returns -EFAULT and the registration should not be activated - but it erroneously is. Move the activation of the registration after the copy of the fields to fix this bug. Fixes: 7d5265ffcd8b ("rseq: Validate read-only fields under DEBUG_RSEQ config") Signed-off-by: Michael Jeanson Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://lore.kernel.org/r/20250219205330.324770-1-mjeanson@efficios.com --- kernel/rseq.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/kernel/rseq.c b/kernel/rseq.c index 442aba29bc4cf..2cb16091ec0ae 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -507,9 +507,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, return -EINVAL; if (!access_ok(rseq, rseq_len)) return -EFAULT; - current->rseq = rseq; - current->rseq_len = rseq_len; - current->rseq_sig = sig; #ifdef CONFIG_DEBUG_RSEQ /* * Initialize the in-kernel rseq fields copy for validation of @@ -521,6 +518,14 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid)) return -EFAULT; #endif + /* + * Activate the registration by setting the rseq area address, length + * and signature in the task struct. 
+ */ + current->rseq = rseq; + current->rseq_len = rseq_len; + current->rseq_sig = sig; + /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields -- GitLab From c9876cdb3ac4dcdf3c710ff02094165982e2a557 Mon Sep 17 00:00:00 2001 From: Brian Ochoa Date: Wed, 19 Feb 2025 10:09:20 -0500 Subject: [PATCH 826/989] docs: arch/x86/sva: Fix two grammar errors under Background and FAQ - Correct "in order" to "in order to" - Append missing quantifier Signed-off-by: Brian Ochoa Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250219150920.445802-1-brianeochoa@gmail.com --- Documentation/arch/x86/sva.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/arch/x86/sva.rst b/Documentation/arch/x86/sva.rst index 33cb050059820..6a759984d4712 100644 --- a/Documentation/arch/x86/sva.rst +++ b/Documentation/arch/x86/sva.rst @@ -25,7 +25,7 @@ to cache translations for virtual addresses. The IOMMU driver uses the mmu_notifier() support to keep the device TLB cache and the CPU cache in sync. When an ATS lookup fails for a virtual address, the device should use the PRI in order to request the virtual address to be paged into the -CPU page tables. The device must use ATS again in order the fetch the +CPU page tables. The device must use ATS again in order to fetch the translation before use. Shared Hardware Workqueues @@ -216,7 +216,7 @@ submitting work and processing completions. Single Root I/O Virtualization (SR-IOV) focuses on providing independent hardware interfaces for virtualizing hardware. Hence, it's required to be -almost fully functional interface to software supporting the traditional +an almost fully functional interface to software supporting the traditional BARs, space for interrupts via MSI-X, its own register layout. Virtual Functions (VFs) are assisted by the Physical Function (PF) driver. -- GitLab From 38b14061947fa546491656e3f5e388d4fedf8dba Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 20 Feb 2025 15:20:10 -0500 Subject: [PATCH 827/989] ftrace: Fix accounting of adding subops to a manager ops Function graph uses a subops and manager ops mechanism to attach to ftrace. The manager ops connects to ftrace and the functions it connects to is defined by a list of subops that it manages. The function hash that defines what the above ops attaches to limits the functions to attach if the hash has any content. If the hash is empty, it means to trace all functions. The creation of the manager ops hash is done by iterating over all the subops hashes. If any of the subops hashes is empty, it means that the manager ops hash must trace all functions as well. The issue is in the creation of the manager ops. When a second subops is attached, a new hash is created by starting it as NULL and adding the subops one at a time. But the NULL ops is mistaken as an empty hash, and once an empty hash is found, it stops the loop of subops and just enables all functions. 
# echo "f:myevent1 kernel_clone" >> /sys/kernel/tracing/dynamic_events # cat /sys/kernel/tracing/enabled_functions kernel_clone (1) tramp: 0xffffffffc0309000 (ftrace_graph_func+0x0/0x60) ->ftrace_graph_func+0x0/0x60 # echo "f:myevent2 schedule_timeout" >> /sys/kernel/tracing/dynamic_events # cat /sys/kernel/tracing/enabled_functions trace_initcall_start_cb (1) tramp: 0xffffffffc0309000 (ftrace_graph_func+0x0/0x60) ->ftrace_graph_func+0x0/0x60 run_init_process (1) tramp: 0xffffffffc0309000 (ftrace_graph_func+0x0/0x60) ->ftrace_graph_func+0x0/0x60 try_to_run_init_process (1) tramp: 0xffffffffc0309000 (ftrace_graph_func+0x0/0x60) ->ftrace_graph_func+0x0/0x60 x86_pmu_show_pmu_cap (1) tramp: 0xffffffffc0309000 (ftrace_graph_func+0x0/0x60) ->ftrace_graph_func+0x0/0x60 cleanup_rapl_pmus (1) tramp: 0xffffffffc0309000 (ftrace_graph_func+0x0/0x60) ->ftrace_graph_func+0x0/0x60 uncore_free_pcibus_map (1) tramp: 0xffffffffc0309000 (ftrace_graph_func+0x0/0x60) ->ftrace_graph_func+0x0/0x60 uncore_types_exit (1) tramp: 0xffffffffc0309000 (ftrace_graph_func+0x0/0x60) ->ftrace_graph_func+0x0/0x60 uncore_pci_exit.part.0 (1) tramp: 0xffffffffc0309000 (ftrace_graph_func+0x0/0x60) ->ftrace_graph_func+0x0/0x60 kvm_shutdown (1) tramp: 0xffffffffc0309000 (ftrace_graph_func+0x0/0x60) ->ftrace_graph_func+0x0/0x60 vmx_dump_msrs (1) tramp: 0xffffffffc0309000 (ftrace_graph_func+0x0/0x60) ->ftrace_graph_func+0x0/0x60 vmx_cleanup_l1d_flush (1) tramp: 0xffffffffc0309000 (ftrace_graph_func+0x0/0x60) ->ftrace_graph_func+0x0/0x60 [..] Fix this by initializing the new hash to NULL and if the hash is NULL do not treat it as an empty hash but instead allocate by copying the content of the first sub ops. Then on subsequent iterations, the new hash will not be NULL, but the content of the previous subops. If that first subops attached to all functions, then new hash may assume that the manager ops also needs to attach to all functions. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Heiko Carstens Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Alexander Gordeev Link: https://lore.kernel.org/20250220202055.060300046@goodmis.org Fixes: 5fccc7552ccbc ("ftrace: Add subops logic to allow one ops to manage many") Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 728ecda6e8d4d..bec54dc272049 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3220,15 +3220,22 @@ static struct ftrace_hash *copy_hash(struct ftrace_hash *src) * The filter_hash updates uses just the append_hash() function * and the notrace_hash does not. 
*/ -static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash) +static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash, + int size_bits) { struct ftrace_func_entry *entry; int size; int i; - /* An empty hash does everything */ - if (ftrace_hash_empty(*hash)) - return 0; + if (*hash) { + /* An empty hash does everything */ + if (ftrace_hash_empty(*hash)) + return 0; + } else { + *hash = alloc_ftrace_hash(size_bits); + if (!*hash) + return -ENOMEM; + } /* If new_hash has everything make hash have everything */ if (ftrace_hash_empty(new_hash)) { @@ -3292,16 +3299,18 @@ static int intersect_hash(struct ftrace_hash **hash, struct ftrace_hash *new_has /* Return a new hash that has a union of all @ops->filter_hash entries */ static struct ftrace_hash *append_hashes(struct ftrace_ops *ops) { - struct ftrace_hash *new_hash; + struct ftrace_hash *new_hash = NULL; struct ftrace_ops *subops; + int size_bits; int ret; - new_hash = alloc_ftrace_hash(ops->func_hash->filter_hash->size_bits); - if (!new_hash) - return NULL; + if (ops->func_hash->filter_hash) + size_bits = ops->func_hash->filter_hash->size_bits; + else + size_bits = FTRACE_HASH_DEFAULT_BITS; list_for_each_entry(subops, &ops->subop_list, list) { - ret = append_hash(&new_hash, subops->func_hash->filter_hash); + ret = append_hash(&new_hash, subops->func_hash->filter_hash, size_bits); if (ret < 0) { free_ftrace_hash(new_hash); return NULL; @@ -3310,7 +3319,8 @@ static struct ftrace_hash *append_hashes(struct ftrace_ops *ops) if (ftrace_hash_empty(new_hash)) break; } - return new_hash; + /* Can't return NULL as that means this failed */ + return new_hash ? : EMPTY_HASH; } /* Make @ops trace evenything except what all its subops do not trace */ @@ -3505,7 +3515,8 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int filter_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->filter_hash); if (!filter_hash) return -ENOMEM; - ret = append_hash(&filter_hash, subops->func_hash->filter_hash); + ret = append_hash(&filter_hash, subops->func_hash->filter_hash, + size_bits); if (ret < 0) { free_ftrace_hash(filter_hash); return ret; -- GitLab From 8eb4b09e0bbd30981305643229fe7640ad41b667 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 20 Feb 2025 15:20:11 -0500 Subject: [PATCH 828/989] ftrace: Do not add duplicate entries in subops manager ops Check if a function is already in the manager ops of a subops. A manager ops contains multiple subops, and if two or more subops are tracing the same function, the manager ops only needs a single entry in its hash. 
Cc: stable@vger.kernel.org Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Alexander Gordeev Link: https://lore.kernel.org/20250220202055.226762894@goodmis.org Fixes: 4f554e955614f ("ftrace: Add ftrace_set_filter_ips function") Tested-by: Heiko Carstens Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bec54dc272049..6b0c25761ccb1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5718,6 +5718,9 @@ __ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return -ENOENT; free_hash_entry(hash, entry); return 0; + } else if (__ftrace_lookup_ip(hash, ip) != NULL) { + /* Already exists */ + return 0; } entry = add_hash_entry(hash, ip); -- GitLab From ded9140622358a154efb3a777025fa7f7ae2c2d9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 20 Feb 2025 15:20:12 -0500 Subject: [PATCH 829/989] fprobe: Always unregister fgraph function from ops When the last fprobe is removed, it calls unregister_ftrace_graph() to remove the graph_ops from function graph. The issue is when it does so, it calls return before removing the function from its graph ops via ftrace_set_filter_ips(). This leaves the last function lingering in the fprobe's fgraph ops and if a probe is added it also enables that last function (even though the callback will just drop it, it does add unneeded overhead to make that call). # echo "f:myevent1 kernel_clone" >> /sys/kernel/tracing/dynamic_events # cat /sys/kernel/tracing/enabled_functions kernel_clone (1) tramp: 0xffffffffc02f3000 (ftrace_graph_func+0x0/0x60) ->ftrace_graph_func+0x0/0x60 # echo "f:myevent2 schedule_timeout" >> /sys/kernel/tracing/dynamic_events # cat /sys/kernel/tracing/enabled_functions kernel_clone (1) tramp: 0xffffffffc02f3000 (ftrace_graph_func+0x0/0x60) ->ftrace_graph_func+0x0/0x60 schedule_timeout (1) tramp: 0xffffffffc02f3000 (ftrace_graph_func+0x0/0x60) ->ftrace_graph_func+0x0/0x60 # > /sys/kernel/tracing/dynamic_events # cat /sys/kernel/tracing/enabled_functions # echo "f:myevent3 kmem_cache_free" >> /sys/kernel/tracing/dynamic_events # cat /sys/kernel/tracing/enabled_functions kmem_cache_free (1) tramp: 0xffffffffc0219000 (ftrace_graph_func+0x0/0x60) ->ftrace_graph_func+0x0/0x60 schedule_timeout (1) tramp: 0xffffffffc0219000 (ftrace_graph_func+0x0/0x60) ->ftrace_graph_func+0x0/0x60 The above enabled a fprobe on kernel_clone, and then on schedule_timeout. The content of the enabled_functions shows the functions that have a callback attached to them. The fprobe attached to those functions properly. Then the fprobes were cleared, and enabled_functions was empty after that. But after adding a fprobe on kmem_cache_free, the enabled_functions shows that the schedule_timeout was attached again. This is because it was still left in the fprobe ops that is used to tell function graph what functions it wants callbacks from. 
Cc: stable@vger.kernel.org Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Alexander Gordeev Link: https://lore.kernel.org/20250220202055.393254452@goodmis.org Fixes: 4346ba1604093 ("fprobe: Rewrite fprobe on function-graph tracer") Tested-by: Heiko Carstens Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fprobe.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 2560b312ad576..62e8f7d56602f 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -403,11 +403,9 @@ static void fprobe_graph_remove_ips(unsigned long *addrs, int num) lockdep_assert_held(&fprobe_mutex); fprobe_graph_active--; - if (!fprobe_graph_active) { - /* Q: should we unregister it ? */ + /* Q: should we unregister it ? */ + if (!fprobe_graph_active) unregister_ftrace_graph(&fprobe_graph_ops); - return; - } ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0); } -- GitLab From ca26554a1498bc905c4a39fb42d55d93f3ae8df2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 20 Feb 2025 15:20:13 -0500 Subject: [PATCH 830/989] fprobe: Fix accounting of when to unregister from function graph When adding a new fprobe, it will update the function hash to the functions the fprobe is attached to and register with function graph to have it call the registered functions. The fprobe_graph_active variable keeps track of the number of fprobes that are using function graph. If two fprobes attach to the same function, it increments the fprobe_graph_active for each of them. But when they are removed, the first fprobe to be removed will see that the function it is attached to is also used by another fprobe and it will not remove that function from function_graph. The logic will skip decrementing the fprobe_graph_active variable. This causes the fprobe_graph_active variable to not go to zero when all fprobes are removed, and in doing so it does not unregister from function graph. As the fgraph ops hash will now be empty, and an empty filter hash means all functions are enabled, this triggers function graph to add a callback to the fprobe infrastructure for every function! 
# echo "f:myevent1 kernel_clone" >> /sys/kernel/tracing/dynamic_events # echo "f:myevent2 kernel_clone%return" >> /sys/kernel/tracing/dynamic_events # cat /sys/kernel/tracing/enabled_functions kernel_clone (1) tramp: 0xffffffffc0024000 (ftrace_graph_func+0x0/0x60) ->ftrace_graph_func+0x0/0x60 # > /sys/kernel/tracing/dynamic_events # cat /sys/kernel/tracing/enabled_functions trace_initcall_start_cb (1) tramp: 0xffffffffc0026000 (function_trace_call+0x0/0x170) ->function_trace_call+0x0/0x170 run_init_process (1) tramp: 0xffffffffc0026000 (function_trace_call+0x0/0x170) ->function_trace_call+0x0/0x170 try_to_run_init_process (1) tramp: 0xffffffffc0026000 (function_trace_call+0x0/0x170) ->function_trace_call+0x0/0x170 x86_pmu_show_pmu_cap (1) tramp: 0xffffffffc0026000 (function_trace_call+0x0/0x170) ->function_trace_call+0x0/0x170 cleanup_rapl_pmus (1) tramp: 0xffffffffc0026000 (function_trace_call+0x0/0x170) ->function_trace_call+0x0/0x170 uncore_free_pcibus_map (1) tramp: 0xffffffffc0026000 (function_trace_call+0x0/0x170) ->function_trace_call+0x0/0x170 uncore_types_exit (1) tramp: 0xffffffffc0026000 (function_trace_call+0x0/0x170) ->function_trace_call+0x0/0x170 uncore_pci_exit.part.0 (1) tramp: 0xffffffffc0026000 (function_trace_call+0x0/0x170) ->function_trace_call+0x0/0x170 kvm_shutdown (1) tramp: 0xffffffffc0026000 (function_trace_call+0x0/0x170) ->function_trace_call+0x0/0x170 vmx_dump_msrs (1) tramp: 0xffffffffc0026000 (function_trace_call+0x0/0x170) ->function_trace_call+0x0/0x170 [..] # cat /sys/kernel/tracing/enabled_functions | wc -l 54702 If a fprobe is being removed and all its functions are also traced by other fprobes, still decrement the fprobe_graph_active counter. Cc: stable@vger.kernel.org Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Alexander Gordeev Link: https://lore.kernel.org/20250220202055.565129766@goodmis.org Fixes: 4346ba1604093 ("fprobe: Rewrite fprobe on function-graph tracer") Closes: https://lore.kernel.org/all/20250217114918.10397-A-hca@linux.ibm.com/ Reported-by: Heiko Carstens Tested-by: Heiko Carstens Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fprobe.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 62e8f7d56602f..33082c4e8154e 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -407,7 +407,8 @@ static void fprobe_graph_remove_ips(unsigned long *addrs, int num) if (!fprobe_graph_active) unregister_ftrace_graph(&fprobe_graph_ops); - ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0); + if (num) + ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0); } static int symbols_cmp(const void *a, const void *b) @@ -677,8 +678,7 @@ int unregister_fprobe(struct fprobe *fp) } del_fprobe_hash(fp); - if (count) - fprobe_graph_remove_ips(addrs, count); + fprobe_graph_remove_ips(addrs, count); kfree_rcu(hlist_array, rcu); fp->hlist_array = NULL; -- GitLab From e85c5e9792b942381ad92ccd0ff745b6d408a91f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 20 Feb 2025 15:20:14 -0500 Subject: [PATCH 831/989] selftests/ftrace: Update fprobe test to check enabled_functions file A few bugs were found in the fprobe accounting logic along with it using the function graph infrastructure. Update the fprobe selftest to catch those bugs in case they or something similar shows up in the future. 
The test now checks the enabled_functions file which shows all the functions attached to ftrace or fgraph. When enabling a fprobe, make sure that its corresponding function is also added to that file. Also add two more fprobes to enable to make sure that the fprobe logic works properly with multiple probes. Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Alexander Gordeev Link: https://lore.kernel.org/20250220202055.733001756@goodmis.org Acked-by: Masami Hiramatsu (Google) Tested-by: Heiko Carstens Signed-off-by: Steven Rostedt (Google) --- .../test.d/dynevent/add_remove_fprobe.tc | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc index dc25bcf4f9e2c..449f9d8be7462 100644 --- a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc +++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc @@ -7,12 +7,38 @@ echo 0 > events/enable echo > dynamic_events PLACE=$FUNCTION_FORK +PLACE2="kmem_cache_free" +PLACE3="schedule_timeout" echo "f:myevent1 $PLACE" >> dynamic_events + +# Make sure the event is attached and is the only one +grep -q $PLACE enabled_functions +cnt=`cat enabled_functions | wc -l` +if [ $cnt -ne 1 ]; then + exit_fail +fi + echo "f:myevent2 $PLACE%return" >> dynamic_events +# It should till be the only attached function +cnt=`cat enabled_functions | wc -l` +if [ $cnt -ne 1 ]; then + exit_fail +fi + +# add another event +echo "f:myevent3 $PLACE2" >> dynamic_events + +grep -q $PLACE2 enabled_functions +cnt=`cat enabled_functions | wc -l` +if [ $cnt -ne 2 ]; then + exit_fail +fi + grep -q myevent1 dynamic_events grep -q myevent2 dynamic_events +grep -q myevent3 dynamic_events test -d events/fprobes/myevent1 test -d events/fprobes/myevent2 @@ -21,6 +47,34 @@ echo "-:myevent2" >> dynamic_events grep -q myevent1 dynamic_events ! grep -q myevent2 dynamic_events +# should still have 2 left +cnt=`cat enabled_functions | wc -l` +if [ $cnt -ne 2 ]; then + exit_fail +fi + echo > dynamic_events +# Should have none left +cnt=`cat enabled_functions | wc -l` +if [ $cnt -ne 0 ]; then + exit_fail +fi + +echo "f:myevent4 $PLACE" >> dynamic_events + +# Should only have one enabled +cnt=`cat enabled_functions | wc -l` +if [ $cnt -ne 1 ]; then + exit_fail +fi + +echo > dynamic_events + +# Should have none left +cnt=`cat enabled_functions | wc -l` +if [ $cnt -ne 0 ]; then + exit_fail +fi + clear_trace -- GitLab From 57b76bedc5c52c66968183b5ef57234894c25ce7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 20 Feb 2025 15:07:49 +0100 Subject: [PATCH 832/989] ftrace: Correct preemption accounting for function tracing. The function tracer should record the preemption level at the point when the function is invoked. If the tracing subsystem decrement the preemption counter it needs to correct this before feeding the data into the trace buffer. This was broken in the commit cited below while shifting the preempt-disabled section. Use tracing_gen_ctx_dec() which properly subtracts one from the preemption counter on a preemptible kernel. 
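As a rough illustration of why the "_dec" variant matters (a simplified sketch, not the in-tree helper; record_preempt_depth() is a made-up name): the callback runs inside the preempt-disabled section taken by the recursion lock, so the count it samples is one higher than what the traced function actually ran with, and that extra disable has to be subtracted before the value is stored.

  static unsigned int record_preempt_depth(void)
  {
          /* One preempt-disable belongs to the tracer itself, not to the
           * traced function; drop it before recording. */
          unsigned int depth = preempt_count() & PREEMPT_MASK;

          return depth ? depth - 1 : 0;
  }

tracing_gen_ctx_dec() performs the same correction when it builds the trace_ctx word on a preemptible kernel.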
Cc: stable@vger.kernel.org Cc: Wander Lairson Costa Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Thomas Gleixner Link: https://lore.kernel.org/20250220140749.pfw8qoNZ@linutronix.de Fixes: ce5e48036c9e7 ("ftrace: disable preemption when recursion locked") Signed-off-by: Sebastian Andrzej Siewior Tested-by: Wander Lairson Costa Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index d358c9935164d..df56f9b760109 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -216,7 +216,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, parent_ip = function_get_true_parent_ip(parent_ip, fregs); - trace_ctx = tracing_gen_ctx(); + trace_ctx = tracing_gen_ctx_dec(); data = this_cpu_ptr(tr->array_buffer.data); if (!atomic_read(&data->disabled)) @@ -321,7 +321,6 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, struct trace_array *tr = op->private; struct trace_array_cpu *data; unsigned int trace_ctx; - unsigned long flags; int bit; if (unlikely(!tr->function_enabled)) @@ -347,8 +346,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, if (is_repeat_check(tr, last_info, ip, parent_ip)) goto out; - local_save_flags(flags); - trace_ctx = tracing_gen_ctx_flags(flags); + trace_ctx = tracing_gen_ctx_dec(); process_repeats(tr, ip, parent_ip, last_info, trace_ctx); trace_function(tr, ip, parent_ip, trace_ctx); -- GitLab From 2fa6a01345b538faa7b0fae8f723bb6977312428 Mon Sep 17 00:00:00 2001 From: Adrian Huang Date: Thu, 20 Feb 2025 11:15:28 +0800 Subject: [PATCH 833/989] tracing: Fix memory leak when reading set_event file kmemleak reports the following memory leak after reading set_event file: # cat /sys/kernel/tracing/set_event # cat /sys/kernel/debug/kmemleak unreferenced object 0xff110001234449e0 (size 16): comm "cat", pid 13645, jiffies 4294981880 hex dump (first 16 bytes): 01 00 00 00 00 00 00 00 a8 71 e7 84 ff ff ff ff .........q...... backtrace (crc c43abbc): __kmalloc_cache_noprof+0x3ca/0x4b0 s_start+0x72/0x2d0 seq_read_iter+0x265/0x1080 seq_read+0x2c9/0x420 vfs_read+0x166/0xc30 ksys_read+0xf4/0x1d0 do_syscall_64+0x79/0x150 entry_SYSCALL_64_after_hwframe+0x76/0x7e The issue can be reproduced regardless of whether set_event is empty or not. Here is an example about the valid content of set_event. # cat /sys/kernel/tracing/set_event sched:sched_process_fork sched:sched_switch sched:sched_wakeup *:*:mod:trace_events_sample The root cause is that s_next() returns NULL when nothing is found. This results in s_stop() attempting to free a NULL pointer because its parameter is NULL. Fix the issue by freeing the memory appropriately when s_next() fails to find anything. 
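The seq_file contract behind the fix, as a minimal sketch (struct my_iter and has_more() are made-up names): whatever ->next() returns is what ->show() and eventually ->stop() receive, so an iterator that finds nothing more must release its own allocation before handing back NULL.

  static void *example_next(struct seq_file *m, void *v, loff_t *pos)
  {
          struct my_iter *iter = v;       /* allocated in ->start() */

          (*pos)++;
          if (has_more(iter))
                  return iter;            /* passed on to ->show() and ->stop() */

          kfree(iter);                    /* ->stop() will only ever see NULL now */
          return NULL;
  }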
Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250220031528.7373-1-ahuang12@lenovo.com Fixes: b355247df104 ("tracing: Cache ":mod:" events for modules not loaded yet") Signed-off-by: Adrian Huang Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4cb275316e51e..513de9ceb80ef 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1591,6 +1591,13 @@ s_next(struct seq_file *m, void *v, loff_t *pos) return iter; #endif + /* + * The iter is allocated in s_start() and passed via the 'v' + * parameter. To stop the iterator, NULL must be returned. But + * the return value is what the 'v' parameter in s_stop() receives + * and frees. Free iter here as it will no longer be used. + */ + kfree(iter); return NULL; } @@ -1667,9 +1674,9 @@ static int s_show(struct seq_file *m, void *v) } #endif -static void s_stop(struct seq_file *m, void *p) +static void s_stop(struct seq_file *m, void *v) { - kfree(p); + kfree(v); t_stop(m, NULL); } -- GitLab From 4ecaa75771a75f2b78a431bf67dea165d19d72a6 Mon Sep 17 00:00:00 2001 From: Yu-Che Cheng Date: Wed, 19 Feb 2025 15:07:48 +0800 Subject: [PATCH 834/989] thermal: gov_power_allocator: Fix incorrect calculation in divvy_up_power() divvy_up_power() should use weighted_req_power instead of req_power to calculate granted_power. Otherwise, granted_power may be unexpected as the denominator total_req_power is a weighted sum. This is a mistake made during the previous refactor. Replace req_power with weighted_req_power in divvy_up_power() calculation. Fixes: 912e97c67cc3 ("thermal: gov_power_allocator: Move memory allocation out of throttle()") Signed-off-by: Yu-Che Cheng Reviewed-by: Lukasz Luba Link: https://patch.msgid.link/20250219-fix-power-allocator-calc-v1-1-48b860291919@chromium.org [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/thermal/gov_power_allocator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c index 3b644de3292e2..3b626db55b2b9 100644 --- a/drivers/thermal/gov_power_allocator.c +++ b/drivers/thermal/gov_power_allocator.c @@ -370,7 +370,7 @@ static void divvy_up_power(struct power_actor *power, int num_actors, for (i = 0; i < num_actors; i++) { struct power_actor *pa = &power[i]; - u64 req_range = (u64)pa->req_power * power_range; + u64 req_range = (u64)pa->weighted_req_power * power_range; pa->granted_power = DIV_ROUND_CLOSEST_ULL(req_range, total_req_power); -- GitLab From c34d999ca3145d9fe858258cc3342ec493f47d2e Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 18 Feb 2025 19:22:44 +0000 Subject: [PATCH 835/989] rxrpc: rxperf: Fix missing decoding of terminal magic cookie The rxperf RPCs seem to have a magic cookie at the end of the request that was failing to be taken account of by the unmarshalling of the request. Fix the rxperf code to expect this. 
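For context, the assumed request layout is roughly as follows (illustrative; only the 4-byte size of the cookie is taken from the code below):

  +------------------+---------------------+----------------+
  | parameter block  | bulk request data   | 4-byte cookie  |
  +------------------+---------------------+----------------+

The unmarshalling state machine used to stop after the bulk data, so the patch adds a state that queues a 4-byte read for the trailing cookie before the request is treated as complete.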
Fixes: 75bfdbf2fca3 ("rxrpc: Implement an in-kernel rxperf server for testing purposes") Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250218192250.296870-2-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/rxperf.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index 7ef93407be830..e848a4777b8c7 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -478,6 +478,18 @@ static int rxperf_deliver_request(struct rxperf_call *call) call->unmarshal++; fallthrough; case 2: + ret = rxperf_extract_data(call, true); + if (ret < 0) + return ret; + + /* Deal with the terminal magic cookie. */ + call->iov_len = 4; + call->kvec[0].iov_len = call->iov_len; + call->kvec[0].iov_base = call->tmp; + iov_iter_kvec(&call->iter, READ, call->kvec, 1, call->iov_len); + call->unmarshal++; + fallthrough; + case 3: ret = rxperf_extract_data(call, false); if (ret < 0) return ret; -- GitLab From 833fefa074444b1e7f7e834cbdce59ce02562ed0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 18 Feb 2025 19:22:45 +0000 Subject: [PATCH 836/989] rxrpc: peer->mtu_lock is redundant The peer->mtu_lock is only used to lock around writes to peer->max_data - and nothing else; further, all such writes take place in the I/O thread and the lock is only ever write-locked and never read-locked. In a couple of places, the write_seqcount_begin() is wrapped in preempt_disable/enable(), but not in all places. This can cause lockdep to complain: WARNING: CPU: 0 PID: 1549 at include/linux/seqlock.h:221 rxrpc_input_ack_trailer+0x305/0x430 ... RIP: 0010:rxrpc_input_ack_trailer+0x305/0x430 Fix this by just getting rid of the lock. Fixes: eeaedc5449d9 ("rxrpc: Implement path-MTU probing using padded PING ACKs (RFC8899)") Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250218192250.296870-3-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/ar-internal.h | 1 - net/rxrpc/input.c | 2 -- net/rxrpc/peer_event.c | 9 +-------- net/rxrpc/peer_object.c | 1 - 4 files changed, 1 insertion(+), 12 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 5e740c4862034..a64a0cab1bf7f 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -360,7 +360,6 @@ struct rxrpc_peer { u8 pmtud_jumbo; /* Max jumbo packets for the MTU */ bool ackr_adv_pmtud; /* T if the peer advertises path-MTU */ unsigned int ackr_max_data; /* Maximum data advertised by peer */ - seqcount_t mtu_lock; /* Lockless MTU access management */ unsigned int if_mtu; /* Local interface MTU (- hdrsize) for this peer */ unsigned int max_data; /* Maximum packet data capacity for this peer */ unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 9047ba13bd31e..24aceb183c2c3 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -810,9 +810,7 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb if (max_mtu < peer->max_data) { trace_rxrpc_pmtud_reduce(peer, sp->hdr.serial, max_mtu, rxrpc_pmtud_reduce_ack); - write_seqcount_begin(&peer->mtu_lock); peer->max_data = max_mtu; - write_seqcount_end(&peer->mtu_lock); } max_data = umin(max_mtu, peer->max_data); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index bc283da9ee402..7f4729234957e 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ 
-130,9 +130,7 @@ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) peer->pmtud_bad = max_data + 1; trace_rxrpc_pmtud_reduce(peer, 0, max_data, rxrpc_pmtud_reduce_icmp); - write_seqcount_begin(&peer->mtu_lock); peer->max_data = max_data; - write_seqcount_end(&peer->mtu_lock); } } @@ -408,13 +406,8 @@ void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t a } max_data = umin(max_data, peer->ackr_max_data); - if (max_data != peer->max_data) { - preempt_disable(); - write_seqcount_begin(&peer->mtu_lock); + if (max_data != peer->max_data) peer->max_data = max_data; - write_seqcount_end(&peer->mtu_lock); - preempt_enable(); - } jumbo = max_data + sizeof(struct rxrpc_jumbo_header); jumbo /= RXRPC_JUMBO_SUBPKTLEN; diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 0fcc87f0409f9..2ddc8ed687429 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -235,7 +235,6 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp, peer->service_conns = RB_ROOT; seqlock_init(&peer->service_conn_lock); spin_lock_init(&peer->lock); - seqcount_init(&peer->mtu_lock); peer->debug_id = atomic_inc_return(&rxrpc_debug_id); peer->recent_srtt_us = UINT_MAX; peer->cong_ssthresh = RXRPC_TX_MAX_WINDOW; -- GitLab From 71f5409176f4ffd460689eb5423a20332d00e342 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 18 Feb 2025 19:22:46 +0000 Subject: [PATCH 837/989] rxrpc: Fix locking issues with the peer record hash rxrpc_new_incoming_peer() can't use spin_lock_bh() whilst its caller has interrupts disabled. WARNING: CPU: 0 PID: 1550 at kernel/softirq.c:369 __local_bh_enable_ip+0x46/0xd0 ... Call Trace: rxrpc_alloc_incoming_call+0x1b0/0x400 rxrpc_new_incoming_call+0x1dd/0x5e0 rxrpc_input_packet+0x84a/0x920 rxrpc_io_thread+0x40d/0xb40 kthread+0x2ec/0x300 ret_from_fork+0x24/0x40 ret_from_fork_asm+0x1a/0x30 irq event stamp: 1811 hardirqs last enabled at (1809): _raw_spin_unlock_irq+0x24/0x50 hardirqs last disabled at (1810): _raw_read_lock_irq+0x17/0x70 softirqs last enabled at (1182): handle_softirqs+0x3ee/0x430 softirqs last disabled at (1811): rxrpc_new_incoming_peer+0x56/0x120 Fix this by using a plain spin_lock() instead. IRQs are held, so softirqs can't happen. 
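A minimal sketch of the rule being applied (struct peer_table and the field names are made up): with interrupts already disabled by the caller, softirqs cannot preempt the section, so a plain spin_lock() is sufficient; the _bh variant is actively wrong here, since re-enabling bottom halves while hardirqs are off is exactly what trips the warning above.

  static void example_new_peer(struct peer_table *t, struct peer_node *p)
  {
          /* Caller (the I/O thread) already holds an irq-disabling lock. */
          spin_lock(&t->hash_lock);               /* plain variant is enough */
          hash_add_rcu(t->hash, &p->hash_link, p->hash_key);
          spin_unlock(&t->hash_lock);
  }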
Fixes: a2ea9a907260 ("rxrpc: Use irq-disabling spinlocks between app and I/O thread") Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250218192250.296870-4-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/peer_object.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 2ddc8ed687429..56e09d161a97f 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -324,10 +324,10 @@ void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer) hash_key = rxrpc_peer_hash_key(local, &peer->srx); rxrpc_init_peer(local, peer, hash_key); - spin_lock_bh(&rxnet->peer_hash_lock); + spin_lock(&rxnet->peer_hash_lock); hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key); list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive_new); - spin_unlock_bh(&rxnet->peer_hash_lock); + spin_unlock(&rxnet->peer_hash_lock); } /* -- GitLab From add117e48df4788a86a21bd0515833c0a6db1ad1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 18 Feb 2025 19:22:47 +0000 Subject: [PATCH 838/989] afs: Fix the server_list to unuse a displaced server rather than putting it When allocating and building an afs_server_list struct object from a VLDB record, we look up each server address to get the server record for it - but a server may have more than one entry in the record and we discard the duplicate pointers. Currently, however, when we discard, we only put a server record, not unuse it - but the lookup got as an active-user count. The active-user count on an afs_server_list object determines its lifetime whereas the refcount keeps the memory backing it around. Failing to reduce the active-user counter prevents the record from being cleaned up and can lead to multiple copied being seen - and pointing to deleted afs_cell objects and other such things. Fix this by switching the incorrect 'put' to an 'unuse' instead. Without this, occasionally, a dead server record can be seen in /proc/net/afs/servers and list corruption may be observed: list_del corruption. prev->next should be ffff888102423e40, but was 0000000000000000. (prev=ffff88810140cd38) Fixes: 977e5f8ed0ab ("afs: Split the usage count on struct afs_server") Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250218192250.296870-5-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- fs/afs/server_list.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c index 7e7e567a7f8a2..d20cd902ef949 100644 --- a/fs/afs/server_list.c +++ b/fs/afs/server_list.c @@ -97,8 +97,8 @@ struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume, break; if (j < slist->nr_servers) { if (slist->servers[j].server == server) { - afs_put_server(volume->cell->net, server, - afs_server_trace_put_slist_isort); + afs_unuse_server(volume->cell->net, server, + afs_server_trace_put_slist_isort); continue; } -- GitLab From 1f0fc3374f3345ff1d150c5c56ac5016e5d3826a Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 18 Feb 2025 19:22:48 +0000 Subject: [PATCH 839/989] afs: Give an afs_server object a ref on the afs_cell object it points to Give an afs_server object a ref on the afs_cell object it points to so that the cell doesn't get deleted before the server record. 
Whilst this is circular (cell -> vol -> server_list -> server -> cell), the ref only pins the memory, not the lifetime as that's controlled by the activity counter. When the volume's activity counter reaches 0, it detaches from the cell and discards its server list; when a cell's activity counter reaches 0, it discards its root volume. At that point, the circularity is cut. Fixes: d2ddc776a458 ("afs: Overhaul volume and server record caching and fileserver rotation") Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250218192250.296870-6-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- fs/afs/server.c | 3 +++ include/trace/events/afs.h | 2 ++ 2 files changed, 5 insertions(+) diff --git a/fs/afs/server.c b/fs/afs/server.c index 038f9d0ae3af8..4504e16b458cc 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -163,6 +163,8 @@ static struct afs_server *afs_install_server(struct afs_cell *cell, rb_insert_color(&server->uuid_rb, &net->fs_servers); hlist_add_head_rcu(&server->proc_link, &net->fs_proc); + afs_get_cell(cell, afs_cell_trace_get_server); + added_dup: write_seqlock(&net->fs_addr_lock); estate = rcu_dereference_protected(server->endpoint_state, @@ -442,6 +444,7 @@ static void afs_server_rcu(struct rcu_head *rcu) atomic_read(&server->active), afs_server_trace_free); afs_put_endpoint_state(rcu_access_pointer(server->endpoint_state), afs_estate_trace_put_server); + afs_put_cell(server->cell, afs_cell_trace_put_server); kfree(server); } diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index b0db89058c911..958a2460330c0 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -174,6 +174,7 @@ enum yfs_cm_operation { EM(afs_cell_trace_get_queue_dns, "GET q-dns ") \ EM(afs_cell_trace_get_queue_manage, "GET q-mng ") \ EM(afs_cell_trace_get_queue_new, "GET q-new ") \ + EM(afs_cell_trace_get_server, "GET server") \ EM(afs_cell_trace_get_vol, "GET vol ") \ EM(afs_cell_trace_insert, "INSERT ") \ EM(afs_cell_trace_manage, "MANAGE ") \ @@ -182,6 +183,7 @@ enum yfs_cm_operation { EM(afs_cell_trace_put_destroy, "PUT destry") \ EM(afs_cell_trace_put_queue_work, "PUT q-work") \ EM(afs_cell_trace_put_queue_fail, "PUT q-fail") \ + EM(afs_cell_trace_put_server, "PUT server") \ EM(afs_cell_trace_put_vol, "PUT vol ") \ EM(afs_cell_trace_see_source, "SEE source") \ EM(afs_cell_trace_see_ws, "SEE ws ") \ -- GitLab From 5c70eb5c593d64d93b178905da215a9fd288a4b5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 20 Feb 2025 13:18:54 +0000 Subject: [PATCH 840/989] net: better track kernel sockets lifetime While kernel sockets are dismantled during pernet_operations->exit(), their freeing can be delayed by any tx packets still held in qdisc or device queues, due to skb_set_owner_w() prior calls. This then trigger the following warning from ref_tracker_dir_exit() [1] To fix this, make sure that kernel sockets own a reference on net->passive. Add sk_net_refcnt_upgrade() helper, used whenever a kernel socket is converted to a refcounted one. 
[1] [ 136.263918][ T35] ref_tracker: net notrefcnt@ffff8880638f01e0 has 1/2 users at [ 136.263918][ T35] sk_alloc+0x2b3/0x370 [ 136.263918][ T35] inet6_create+0x6ce/0x10f0 [ 136.263918][ T35] __sock_create+0x4c0/0xa30 [ 136.263918][ T35] inet_ctl_sock_create+0xc2/0x250 [ 136.263918][ T35] igmp6_net_init+0x39/0x390 [ 136.263918][ T35] ops_init+0x31e/0x590 [ 136.263918][ T35] setup_net+0x287/0x9e0 [ 136.263918][ T35] copy_net_ns+0x33f/0x570 [ 136.263918][ T35] create_new_namespaces+0x425/0x7b0 [ 136.263918][ T35] unshare_nsproxy_namespaces+0x124/0x180 [ 136.263918][ T35] ksys_unshare+0x57d/0xa70 [ 136.263918][ T35] __x64_sys_unshare+0x38/0x40 [ 136.263918][ T35] do_syscall_64+0xf3/0x230 [ 136.263918][ T35] entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 136.263918][ T35] [ 136.343488][ T35] ref_tracker: net notrefcnt@ffff8880638f01e0 has 1/2 users at [ 136.343488][ T35] sk_alloc+0x2b3/0x370 [ 136.343488][ T35] inet6_create+0x6ce/0x10f0 [ 136.343488][ T35] __sock_create+0x4c0/0xa30 [ 136.343488][ T35] inet_ctl_sock_create+0xc2/0x250 [ 136.343488][ T35] ndisc_net_init+0xa7/0x2b0 [ 136.343488][ T35] ops_init+0x31e/0x590 [ 136.343488][ T35] setup_net+0x287/0x9e0 [ 136.343488][ T35] copy_net_ns+0x33f/0x570 [ 136.343488][ T35] create_new_namespaces+0x425/0x7b0 [ 136.343488][ T35] unshare_nsproxy_namespaces+0x124/0x180 [ 136.343488][ T35] ksys_unshare+0x57d/0xa70 [ 136.343488][ T35] __x64_sys_unshare+0x38/0x40 [ 136.343488][ T35] do_syscall_64+0xf3/0x230 [ 136.343488][ T35] entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 0cafd77dcd03 ("net: add a refcount tracker for kernel sockets") Reported-by: syzbot+30a19e01a97420719891@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/67b72aeb.050a0220.14d86d.0283.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250220131854.4048077-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 1 + net/core/sock.c | 27 ++++++++++++++++++++++----- net/mptcp/subflow.c | 5 +---- net/netlink/af_netlink.c | 10 ---------- net/rds/tcp.c | 8 ++------ net/smc/af_smc.c | 5 +---- net/sunrpc/svcsock.c | 5 +---- net/sunrpc/xprtsock.c | 8 ++------ 8 files changed, 30 insertions(+), 39 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 8036b3b79cd8b..7ef728324e4e7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1751,6 +1751,7 @@ static inline bool sock_allow_reclassification(const struct sock *csk) struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern); void sk_free(struct sock *sk); +void sk_net_refcnt_upgrade(struct sock *sk); void sk_destruct(struct sock *sk); struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); void sk_free_unlock_clone(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index eae2ae70a2e03..6c0e87f97fa4a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2246,6 +2246,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, get_net_track(net, &sk->ns_tracker, priority); sock_inuse_add(net, 1); } else { + net_passive_inc(net); __netns_tracker_alloc(net, &sk->ns_tracker, false, priority); } @@ -2270,6 +2271,7 @@ EXPORT_SYMBOL(sk_alloc); static void __sk_destruct(struct rcu_head *head) { struct sock *sk = container_of(head, struct sock, sk_rcu); + struct net *net = sock_net(sk); struct sk_filter *filter; if (sk->sk_destruct) @@ -2301,14 +2303,28 @@ static void __sk_destruct(struct rcu_head *head) put_cred(sk->sk_peer_cred); 
put_pid(sk->sk_peer_pid); - if (likely(sk->sk_net_refcnt)) - put_net_track(sock_net(sk), &sk->ns_tracker); - else - __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); - + if (likely(sk->sk_net_refcnt)) { + put_net_track(net, &sk->ns_tracker); + } else { + __netns_tracker_free(net, &sk->ns_tracker, false); + net_passive_dec(net); + } sk_prot_free(sk->sk_prot_creator, sk); } +void sk_net_refcnt_upgrade(struct sock *sk) +{ + struct net *net = sock_net(sk); + + WARN_ON_ONCE(sk->sk_net_refcnt); + __netns_tracker_free(net, &sk->ns_tracker, false); + net_passive_dec(net); + sk->sk_net_refcnt = 1; + get_net_track(net, &sk->ns_tracker, GFP_KERNEL); + sock_inuse_add(net, 1); +} +EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade); + void sk_destruct(struct sock *sk) { bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); @@ -2405,6 +2421,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) * is not properly dismantling its kernel sockets at netns * destroy time. */ + net_passive_inc(sock_net(newsk)); __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, false, priority); } diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index fd021cf8286ef..dfcbef9c46246 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1772,10 +1772,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, * needs it. * Update ns_tracker to current stack trace and refcounted tracker. */ - __netns_tracker_free(net, &sf->sk->ns_tracker, false); - sf->sk->sk_net_refcnt = 1; - get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); + sk_net_refcnt_upgrade(sf->sk); err = tcp_set_ulp(sf->sk, "mptcp"); if (err) goto err_free; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 85311226183a2..a53ea60d0a78d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -795,16 +795,6 @@ static int netlink_release(struct socket *sock) sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1); - /* Because struct net might disappear soon, do not keep a pointer. */ - if (!sk->sk_net_refcnt && sock_net(sk) != &init_net) { - __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); - /* Because of deferred_put_nlk_sk and use of work queue, - * it is possible netns will be freed before this socket. - */ - sock_net_set(sk, &init_net); - __netns_tracker_alloc(&init_net, &sk->ns_tracker, - false, GFP_KERNEL); - } call_rcu(&nlk->rcu, deferred_put_nlk_sk); return 0; } diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 0581c53e65170..3cc2f303bf786 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -504,12 +504,8 @@ bool rds_tcp_tune(struct socket *sock) release_sock(sk); return false; } - /* Update ns_tracker to current stack trace and refcounted tracker */ - __netns_tracker_free(net, &sk->ns_tracker, false); - - sk->sk_net_refcnt = 1; - netns_tracker_alloc(net, &sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); + sk_net_refcnt_upgrade(sk); + put_net(net); } rtn = net_generic(net, rds_tcp_netid); if (rtn->sndbuf_size > 0) { diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index ca6984541edbd..3e6cb35baf25a 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3337,10 +3337,7 @@ int smc_create_clcsk(struct net *net, struct sock *sk, int family) * which need net ref. 
*/ sk = smc->clcsock->sk; - __netns_tracker_free(net, &sk->ns_tracker, false); - sk->sk_net_refcnt = 1; - get_net_track(net, &sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); + sk_net_refcnt_upgrade(sk); return 0; } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index cb3bd12f5818b..72e5a01df3d35 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1541,10 +1541,7 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, newlen = error; if (protocol == IPPROTO_TCP) { - __netns_tracker_free(net, &sock->sk->ns_tracker, false); - sock->sk->sk_net_refcnt = 1; - get_net_track(net, &sock->sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); + sk_net_refcnt_upgrade(sock->sk); if ((error = kernel_listen(sock, 64)) < 0) goto bummer; } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c60936d8cef71..940fe65b2a351 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1941,12 +1941,8 @@ static struct socket *xs_create_sock(struct rpc_xprt *xprt, goto out; } - if (protocol == IPPROTO_TCP) { - __netns_tracker_free(xprt->xprt_net, &sock->sk->ns_tracker, false); - sock->sk->sk_net_refcnt = 1; - get_net_track(xprt->xprt_net, &sock->sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(xprt->xprt_net, 1); - } + if (protocol == IPPROTO_TCP) + sk_net_refcnt_upgrade(sock->sk); filp = sock_alloc_file(sock, O_NONBLOCK, NULL); if (IS_ERR(filp)) -- GitLab From 0e4427f8f587c4b603475468bb3aee9418574893 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 20 Feb 2025 09:25:59 +0200 Subject: [PATCH 841/989] net: loopback: Avoid sending IP packets without an Ethernet header After commit 22600596b675 ("ipv4: give an IPv4 dev to blackhole_netdev") IPv4 neighbors can be constructed on the blackhole net device, but they are constructed with an output function (neigh_direct_output()) that simply calls dev_queue_xmit(). The latter will transmit packets via 'skb->dev' which might not be the blackhole net device if dst_dev_put() switched 'dst->dev' to the blackhole net device while another CPU was using the dst entry in ip_output(), but after it already initialized 'skb->dev' from 'dst->dev'. Specifically, the following can happen: CPU1 CPU2 udp_sendmsg(sk1) udp_sendmsg(sk2) udp_send_skb() [...] ip_output() skb->dev = skb_dst(skb)->dev dst_dev_put() dst->dev = blackhole_netdev ip_finish_output2() resolves neigh on dst->dev neigh_output() neigh_direct_output() dev_queue_xmit() This will result in IPv4 packets being sent without an Ethernet header via a valid net device: tcpdump: verbose output suppressed, use -v[v]... for full protocol decode listening on enp9s0, link-type EN10MB (Ethernet), snapshot length 262144 bytes 22:07:02.329668 20:00:40:11:18:fb > 45:00:00:44:f4:94, ethertype Unknown (0x58c6), length 68: 0x0000: 8dda 74ca f1ae ca6c ca6c 0098 969c 0400 ..t....l.l...... 0x0010: 0000 4730 3f18 6800 0000 0000 0000 9971 ..G0?.h........q 0x0020: c4c9 9055 a157 0a70 9ead bf83 38ca ab38 ...U.W.p....8..8 0x0030: 8add ab96 e052 .....R Fix by making sure that neighbors are constructed on top of the blackhole net device with an output function that simply consumes the packets, in a similar fashion to dst_discard_out() and blackhole_netdev_xmit(). 
Fixes: 8d7017fd621d ("blackhole_netdev: use blackhole_netdev to invalidate dst entries") Fixes: 22600596b675 ("ipv4: give an IPv4 dev to blackhole_netdev") Reported-by: Florian Meister Closes: https://lore.kernel.org/netdev/20250210084931.23a5c2e4@hermes.local/ Signed-off-by: Ido Schimmel Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250220072559.782296-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/loopback.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index c8840c3b9a1bc..f1d68153987e1 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -244,8 +244,22 @@ static netdev_tx_t blackhole_netdev_xmit(struct sk_buff *skb, return NETDEV_TX_OK; } +static int blackhole_neigh_output(struct neighbour *n, struct sk_buff *skb) +{ + kfree_skb(skb); + return 0; +} + +static int blackhole_neigh_construct(struct net_device *dev, + struct neighbour *n) +{ + n->output = blackhole_neigh_output; + return 0; +} + static const struct net_device_ops blackhole_netdev_ops = { .ndo_start_xmit = blackhole_netdev_xmit, + .ndo_neigh_construct = blackhole_neigh_construct, }; /* This is a dst-dummy device used specifically for invalidated -- GitLab From c180188ec02281126045414e90d08422a80f75b4 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Thu, 20 Feb 2025 12:07:52 +0100 Subject: [PATCH 842/989] net: set the minimum for net_hotdata.netdev_budget_usecs Commit 7acf8a1e8a28 ("Replace 2 jiffies with sysctl netdev_budget_usecs to enable softirq tuning") added a possibility to set net_hotdata.netdev_budget_usecs, but added no lower bound checking. Commit a4837980fd9f ("net: revert default NAPI poll timeout to 2 jiffies") made the *initial* value HZ-dependent, so the initial value is at least 2 jiffies even for lower HZ values (2 ms for 1000 Hz, 8ms for 250 Hz, 20 ms for 100 Hz). But a user still can set improper values by a sysctl. Set .extra1 (the lower bound) for net_hotdata.netdev_budget_usecs to the same value as in the latter commit. That is to 2 jiffies. 
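As a quick sanity check of the new lower bound (stand-alone snippet, not kernel code), the same formula reproduces the per-HZ defaults quoted above:

  #include <stdio.h>

  #define USEC_PER_SEC 1000000L

  int main(void)
  {
          long hz[] = { 1000, 250, 100 };

          for (int i = 0; i < 3; i++)     /* 2 jiffies expressed in usecs */
                  printf("HZ=%-4ld -> %5ld usecs\n", hz[i], 2 * USEC_PER_SEC / hz[i]);
          return 0;
  }

which prints 2000, 8000 and 20000 usecs, i.e. the 2 ms, 8 ms and 20 ms values mentioned in the changelog.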
Fixes: a4837980fd9f ("net: revert default NAPI poll timeout to 2 jiffies") Fixes: 7acf8a1e8a28 ("Replace 2 jiffies with sysctl netdev_budget_usecs to enable softirq tuning") Signed-off-by: Jiri Slaby (SUSE) Cc: Dmitry Yakunin Cc: Konstantin Khlebnikov Link: https://patch.msgid.link/20250220110752.137639-1-jirislaby@kernel.org Signed-off-by: Jakub Kicinski --- net/core/sysctl_net_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index ad2741f1346af..c7769ee0d9c55 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -34,6 +34,7 @@ static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE; +static int netdev_budget_usecs_min = 2 * USEC_PER_SEC / HZ; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -587,7 +588,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .extra1 = &netdev_budget_usecs_min, }, { .procname = "fb_tunnels_only_for_init_net", -- GitLab From 27843ce6ba3d3122b65066550fe33fb8839f8aef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 20 Feb 2025 15:53:36 +0000 Subject: [PATCH 843/989] ipvlan: ensure network headers are in skb linear part syzbot found that ipvlan_process_v6_outbound() was assuming the IPv6 network header isis present in skb->head [1] Add the needed pskb_network_may_pull() calls for both IPv4 and IPv6 handlers. [1] BUG: KMSAN: uninit-value in __ipv6_addr_type+0xa2/0x490 net/ipv6/addrconf_core.c:47 __ipv6_addr_type+0xa2/0x490 net/ipv6/addrconf_core.c:47 ipv6_addr_type include/net/ipv6.h:555 [inline] ip6_route_output_flags_noref net/ipv6/route.c:2616 [inline] ip6_route_output_flags+0x51/0x720 net/ipv6/route.c:2651 ip6_route_output include/net/ip6_route.h:93 [inline] ipvlan_route_v6_outbound+0x24e/0x520 drivers/net/ipvlan/ipvlan_core.c:476 ipvlan_process_v6_outbound drivers/net/ipvlan/ipvlan_core.c:491 [inline] ipvlan_process_outbound drivers/net/ipvlan/ipvlan_core.c:541 [inline] ipvlan_xmit_mode_l3 drivers/net/ipvlan/ipvlan_core.c:605 [inline] ipvlan_queue_xmit+0xd72/0x1780 drivers/net/ipvlan/ipvlan_core.c:671 ipvlan_start_xmit+0x5b/0x210 drivers/net/ipvlan/ipvlan_main.c:223 __netdev_start_xmit include/linux/netdevice.h:5150 [inline] netdev_start_xmit include/linux/netdevice.h:5159 [inline] xmit_one net/core/dev.c:3735 [inline] dev_hard_start_xmit+0x247/0xa20 net/core/dev.c:3751 sch_direct_xmit+0x399/0xd40 net/sched/sch_generic.c:343 qdisc_restart net/sched/sch_generic.c:408 [inline] __qdisc_run+0x14da/0x35d0 net/sched/sch_generic.c:416 qdisc_run+0x141/0x4d0 include/net/pkt_sched.h:127 net_tx_action+0x78b/0x940 net/core/dev.c:5484 handle_softirqs+0x1a0/0x7c0 kernel/softirq.c:561 __do_softirq+0x14/0x1a kernel/softirq.c:595 do_softirq+0x9a/0x100 kernel/softirq.c:462 __local_bh_enable_ip+0x9f/0xb0 kernel/softirq.c:389 local_bh_enable include/linux/bottom_half.h:33 [inline] rcu_read_unlock_bh include/linux/rcupdate.h:919 [inline] __dev_queue_xmit+0x2758/0x57d0 net/core/dev.c:4611 dev_queue_xmit include/linux/netdevice.h:3311 [inline] packet_xmit+0x9c/0x6c0 net/packet/af_packet.c:276 packet_snd net/packet/af_packet.c:3132 [inline] packet_sendmsg+0x93e0/0xa7e0 net/packet/af_packet.c:3164 sock_sendmsg_nosec net/socket.c:718 [inline] Fixes: 2ad7bf363841 ("ipvlan: Initial check-in of the IPVLAN driver.") Reported-by: 
syzbot+93ab4a777bafb9d9f960@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/67b74f01.050a0220.14d86d.02d8.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Mahesh Bandewar Link: https://patch.msgid.link/20250220155336.61884-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ipvlan/ipvlan_core.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index fd591ddb3884d..ca62188a317ad 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -416,20 +416,25 @@ struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port, void *lyr3h, static noinline_for_stack int ipvlan_process_v4_outbound(struct sk_buff *skb) { - const struct iphdr *ip4h = ip_hdr(skb); struct net_device *dev = skb->dev; struct net *net = dev_net(dev); - struct rtable *rt; int err, ret = NET_XMIT_DROP; + const struct iphdr *ip4h; + struct rtable *rt; struct flowi4 fl4 = { .flowi4_oif = dev->ifindex, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)), .flowi4_flags = FLOWI_FLAG_ANYSRC, .flowi4_mark = skb->mark, - .daddr = ip4h->daddr, - .saddr = ip4h->saddr, }; + if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) + goto err; + + ip4h = ip_hdr(skb); + fl4.daddr = ip4h->daddr; + fl4.saddr = ip4h->saddr; + fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)); + rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) goto err; @@ -488,6 +493,12 @@ static int ipvlan_process_v6_outbound(struct sk_buff *skb) struct net_device *dev = skb->dev; int err, ret = NET_XMIT_DROP; + if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) { + DEV_STATS_INC(dev, tx_errors); + kfree_skb(skb); + return ret; + } + err = ipvlan_route_v6_outbound(dev, skb); if (unlikely(err)) { DEV_STATS_INC(dev, tx_errors); -- GitLab From fa52f15c745ce55261b92873676f64f7348cfe82 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 20 Feb 2025 11:29:50 -0500 Subject: [PATCH 844/989] net: cadence: macb: Synchronize stats calculations Stats calculations involve a RMW to add the stat update to the existing value. This is currently not protected by any synchronization mechanism, so data races are possible. Add a spinlock to protect the update. The reader side could be protected using u64_stats, but we would still need a spinlock for the update side anyway. And we always do an update immediately before reading the stats anyway. 
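The race being closed is the usual lost update on a read-modify-write; roughly (illustrative sketch, struct example_stats and the field are made up):

  static void example_stat_add(struct example_stats *s, u64 hw_delta)
  {
          spin_lock_irq(&s->lock);
          s->rx_overruns += hw_delta;     /* without the lock, two contexts can read
                                           * the same old value and one increment is
                                           * silently lost */
          spin_unlock_irq(&s->lock);
  }

In the driver the interrupt handler takes the plain spin_lock() form and the process-context stats paths use spin_lock_irq(), as in the hunks below.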
Fixes: 89e5785fc8a6 ("[PATCH] Atmel MACB ethernet driver") Signed-off-by: Sean Anderson Link: https://patch.msgid.link/20250220162950.95941-1-sean.anderson@linux.dev Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cadence/macb.h | 2 ++ drivers/net/ethernet/cadence/macb_main.c | 12 ++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h index 5740c98d8c9f0..2847278d9cd48 100644 --- a/drivers/net/ethernet/cadence/macb.h +++ b/drivers/net/ethernet/cadence/macb.h @@ -1279,6 +1279,8 @@ struct macb { struct clk *rx_clk; struct clk *tsu_clk; struct net_device *dev; + /* Protects hw_stats and ethtool_stats */ + spinlock_t stats_lock; union { struct macb_stats macb; struct gem_stats gem; diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 48496209fb164..c1f57d96e63fc 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -1978,10 +1978,12 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id) if (status & MACB_BIT(ISR_ROVR)) { /* We missed at least one packet */ + spin_lock(&bp->stats_lock); if (macb_is_gem(bp)) bp->hw_stats.gem.rx_overruns++; else bp->hw_stats.macb.rx_overruns++; + spin_unlock(&bp->stats_lock); if (bp->caps & MACB_CAPS_ISR_CLEAR_ON_WRITE) queue_writel(queue, ISR, MACB_BIT(ISR_ROVR)); @@ -3102,6 +3104,7 @@ static struct net_device_stats *gem_get_stats(struct macb *bp) if (!netif_running(bp->dev)) return nstat; + spin_lock_irq(&bp->stats_lock); gem_update_stats(bp); nstat->rx_errors = (hwstat->rx_frame_check_sequence_errors + @@ -3131,6 +3134,7 @@ static struct net_device_stats *gem_get_stats(struct macb *bp) nstat->tx_aborted_errors = hwstat->tx_excessive_collisions; nstat->tx_carrier_errors = hwstat->tx_carrier_sense_errors; nstat->tx_fifo_errors = hwstat->tx_underrun; + spin_unlock_irq(&bp->stats_lock); return nstat; } @@ -3138,12 +3142,13 @@ static struct net_device_stats *gem_get_stats(struct macb *bp) static void gem_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { - struct macb *bp; + struct macb *bp = netdev_priv(dev); - bp = netdev_priv(dev); + spin_lock_irq(&bp->stats_lock); gem_update_stats(bp); memcpy(data, &bp->ethtool_stats, sizeof(u64) * (GEM_STATS_LEN + QUEUE_STATS_LEN * MACB_MAX_QUEUES)); + spin_unlock_irq(&bp->stats_lock); } static int gem_get_sset_count(struct net_device *dev, int sset) @@ -3193,6 +3198,7 @@ static struct net_device_stats *macb_get_stats(struct net_device *dev) return gem_get_stats(bp); /* read stats from hardware */ + spin_lock_irq(&bp->stats_lock); macb_update_stats(bp); /* Convert HW stats into netdevice stats */ @@ -3226,6 +3232,7 @@ static struct net_device_stats *macb_get_stats(struct net_device *dev) nstat->tx_carrier_errors = hwstat->tx_carrier_errors; nstat->tx_fifo_errors = hwstat->tx_underruns; /* Don't know about heartbeat or window errors... */ + spin_unlock_irq(&bp->stats_lock); return nstat; } @@ -5097,6 +5104,7 @@ static int macb_probe(struct platform_device *pdev) } } spin_lock_init(&bp->lock); + spin_lock_init(&bp->stats_lock); /* setup capabilities */ macb_configure_caps(bp, macb_config); -- GitLab From 28b04731a38c80092f47437af6c2770765e0b99f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 20 Feb 2025 16:50:12 -0800 Subject: [PATCH 845/989] MAINTAINERS: fix DWMAC S32 entry Using L: with more than a bare email address causes getmaintainer.pl to be unable to parse the entry. 
Fix this by doing as other entries that use this email address and convert it to an R: entry. Link: https://patch.msgid.link/20250221005012.1051897-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 3864d473f52f2..ac15093537c6b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2877,7 +2877,7 @@ F: drivers/pinctrl/nxp/ ARM/NXP S32G/S32R DWMAC ETHERNET DRIVER M: Jan Petrous -L: NXP S32 Linux Team +R: s32@nxp.com S: Maintained F: Documentation/devicetree/bindings/net/nxp,s32-dwmac.yaml F: drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c -- GitLab From 781813db7909d945c33d3b035822225f3598774d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 20 Feb 2025 16:12:12 +0100 Subject: [PATCH 846/989] i2c: core: Allocate temporary client dynamically MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drivers/i2c/i2c-core-base.c: In function ‘i2c_detect.isra’: drivers/i2c/i2c-core-base.c:2544:1: warning: the frame size of 1312 bytes is larger than 1024 bytes [-Wframe-larger-than=] 2544 | } | ^ Fix this by allocating the temporary client structure dynamically, as it is a rather large structure (1216 bytes, depending on kernel config). This is basically a revert of the to-be-fixed commit with some checkpatch improvements. Fixes: 735668f8e5c9 ("i2c: core: Allocate temp client on the stack in i2c_detect") Signed-off-by: Geert Uytterhoeven Reviewed-by: Su Hui Reviewed-by: Guenter Roeck [wsa: updated commit message, merged tags from similar patch] Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core-base.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index 35a221e2c11c1..7ad1ad5c8c3f5 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -2506,7 +2506,7 @@ static int i2c_detect_address(struct i2c_client *temp_client, static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver) { const unsigned short *address_list; - struct i2c_client temp_client; + struct i2c_client *temp_client; int i, err = 0; address_list = driver->address_list; @@ -2527,19 +2527,24 @@ static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver) return 0; /* Set up a temporary client to help detect callback */ - memset(&temp_client, 0, sizeof(temp_client)); - temp_client.adapter = adapter; + temp_client = kzalloc(sizeof(*temp_client), GFP_KERNEL); + if (!temp_client) + return -ENOMEM; + + temp_client->adapter = adapter; for (i = 0; address_list[i] != I2C_CLIENT_END; i += 1) { dev_dbg(&adapter->dev, "found normal entry for adapter %d, addr 0x%02x\n", i2c_adapter_id(adapter), address_list[i]); - temp_client.addr = address_list[i]; - err = i2c_detect_address(&temp_client, driver); + temp_client->addr = address_list[i]; + err = i2c_detect_address(temp_client, driver); if (unlikely(err)) break; } + kfree(temp_client); + return err; } -- GitLab From e1a0bdbdfdf08428f0ede5ae49c7f4139ac73ef5 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Thu, 20 Feb 2025 08:47:10 +0200 Subject: [PATCH 847/989] RDMA/mlx5: Fix bind QP error cleanup flow When there is a failure during bind QP, the cleanup flow destroys the counter regardless if it is the one that created it or not, which is problematic since if it isn't the one that created it, that counter could still be in use. Fix that by destroying the counter only if it was created during this call. 
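The shape of the fix is the common "only roll back what this call created" pattern; roughly (simplified sketch with made-up helpers):

  static int example_bind(struct counter *c, struct qp *qp)
  {
          bool new = false;
          int err;

          if (!c->id) {
                  err = alloc_counter(c);         /* counter created by this call */
                  if (err)
                          return err;
                  new = true;
          }

          err = attach_counter(qp, c);
          if (err && new) {                       /* never tear down a pre-existing,
                                                   * possibly shared counter */
                  free_counter(c);
                  c->id = 0;
          }
          return err;
  }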
Fixes: 45842fc627c7 ("IB/mlx5: Support statistic q counter configuration") Signed-off-by: Patrisious Haddad Reviewed-by: Mark Zhang Link: https://patch.msgid.link/25dfefddb0ebefa668c32e06a94d84e3216257cf.1740033937.git.leon@kernel.org Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/counters.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c index 4f6c1968a2ee3..81cfa74147a18 100644 --- a/drivers/infiniband/hw/mlx5/counters.c +++ b/drivers/infiniband/hw/mlx5/counters.c @@ -546,6 +546,7 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, struct ib_qp *qp) { struct mlx5_ib_dev *dev = to_mdev(qp->device); + bool new = false; int err; if (!counter->id) { @@ -560,6 +561,7 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, return err; counter->id = MLX5_GET(alloc_q_counter_out, out, counter_set_id); + new = true; } err = mlx5_ib_qp_set_counter(qp, counter); @@ -569,8 +571,10 @@ static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter, return 0; fail_set_counter: - mlx5_ib_counter_dealloc(counter); - counter->id = 0; + if (new) { + mlx5_ib_counter_dealloc(counter); + counter->id = 0; + } return err; } -- GitLab From b66535356a4834a234f99e16a97eb51f2c6c5a7d Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Sat, 22 Feb 2025 07:20:21 -0800 Subject: [PATCH 848/989] RDMA/bnxt_re: Fix the page details for the srq created by kernel consumers While using nvme target with use_srq on, below kernel panic is noticed. [ 549.698111] bnxt_en 0000:41:00.0 enp65s0np0: FEC autoneg off encoding: Clause 91 RS(544,514) [ 566.393619] Oops: divide error: 0000 [#1] PREEMPT SMP NOPTI .. [ 566.393799] [ 566.393807] ? __die_body+0x1a/0x60 [ 566.393823] ? die+0x38/0x60 [ 566.393835] ? do_trap+0xe4/0x110 [ 566.393847] ? bnxt_qplib_alloc_init_hwq+0x1d4/0x580 [bnxt_re] [ 566.393867] ? bnxt_qplib_alloc_init_hwq+0x1d4/0x580 [bnxt_re] [ 566.393881] ? do_error_trap+0x7c/0x120 [ 566.393890] ? bnxt_qplib_alloc_init_hwq+0x1d4/0x580 [bnxt_re] [ 566.393911] ? exc_divide_error+0x34/0x50 [ 566.393923] ? bnxt_qplib_alloc_init_hwq+0x1d4/0x580 [bnxt_re] [ 566.393939] ? asm_exc_divide_error+0x16/0x20 [ 566.393966] ? bnxt_qplib_alloc_init_hwq+0x1d4/0x580 [bnxt_re] [ 566.393997] bnxt_qplib_create_srq+0xc9/0x340 [bnxt_re] [ 566.394040] bnxt_re_create_srq+0x335/0x3b0 [bnxt_re] [ 566.394057] ? srso_return_thunk+0x5/0x5f [ 566.394068] ? __init_swait_queue_head+0x4a/0x60 [ 566.394090] ib_create_srq_user+0xa7/0x150 [ib_core] [ 566.394147] nvmet_rdma_queue_connect+0x7d0/0xbe0 [nvmet_rdma] [ 566.394174] ? lock_release+0x22c/0x3f0 [ 566.394187] ? srso_return_thunk+0x5/0x5f Page size and shift info is set only for the user space SRQs. Set page size and page shift for kernel space SRQs also. 
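The divide error itself is straightforward: the hardware-queue sizing arithmetic uses the page size from sg_info as a divisor, and the kernel-SRQ path previously left that structure zeroed. Roughly (illustrative, simplified):

  static u32 example_npages(u32 depth, u32 stride, u32 pgsize)
  {
          return DIV_ROUND_UP(depth * stride, pgsize);    /* pgsize == 0 is the divide
                                                           * error seen in the trace */
  }

Initialising pgsize/pgshft to PAGE_SIZE/PAGE_SHIFT for kernel consumers, as the hunk below does, keeps the same sizing code valid for both callers.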
Fixes: 0c4dcd602817 ("RDMA/bnxt_re: Refactor hardware queue memory allocation") Signed-off-by: Kashyap Desai Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1740237621-29291-1-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 2de101d6e8255..6f5db32082dd7 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -1870,6 +1870,8 @@ int bnxt_re_create_srq(struct ib_srq *ib_srq, srq->qplib_srq.threshold = srq_init_attr->attr.srq_limit; srq->srq_limit = srq_init_attr->attr.srq_limit; srq->qplib_srq.eventq_hw_ring_id = rdev->nqr->nq[0].ring_id; + srq->qplib_srq.sg_info.pgsize = PAGE_SIZE; + srq->qplib_srq.sg_info.pgshft = PAGE_SHIFT; nq = &rdev->nqr->nq[0]; if (udata) { -- GitLab From 174e5e9da4f5946de3d09c32ee56cbbc9d70505b Mon Sep 17 00:00:00 2001 From: Patrick Rudolph Date: Fri, 21 Feb 2025 09:12:42 +0100 Subject: [PATCH 849/989] efi/cper: Fix cper_ia_proc_ctx alignment According to the UEFI Common Platform Error Record appendix, the IA32/X64 Processor Context Information Structure is a variable length structure, but "is padded with zeros if the size is not a multiple of 16 bytes". Currently this isn't honoured, causing all but the first structure to be garbage when printed. Thus align the size to be a multiple of 16. Signed-off-by: Patrick Rudolph Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/cper-x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/efi/cper-x86.c b/drivers/firmware/efi/cper-x86.c index 438ed9eff6d01..3949d7b5e808f 100644 --- a/drivers/firmware/efi/cper-x86.c +++ b/drivers/firmware/efi/cper-x86.c @@ -325,7 +325,7 @@ void cper_print_proc_ia(const char *pfx, const struct cper_sec_proc_ia *proc) ctx_info = (struct cper_ia_proc_ctx *)err_info; for (i = 0; i < VALID_PROC_CXT_INFO_NUM(proc->validation_bits); i++) { - int size = sizeof(*ctx_info) + ctx_info->reg_arr_size; + int size = ALIGN(sizeof(*ctx_info) + ctx_info->reg_arr_size, 16); int groupsize = 4; printk("%sContext Information Structure %d:\n", pfx, i); -- GitLab From d6a2d02aa060531607f4a8411ec384470faa2761 Mon Sep 17 00:00:00 2001 From: Patrick Rudolph Date: Fri, 21 Feb 2025 12:15:16 +0100 Subject: [PATCH 850/989] efi/cper: Fix cper_arm_ctx_info alignment According to the UEFI Common Platform Error Record appendix, the processor context information structure is a variable length structure, but "is padded with zeros if the size is not a multiple of 16 bytes". Currently this isn't honoured, causing all but the first structure to be garbage when printed. Thus align the size to be a multiple of 16. 
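A short worked example of the padding rule (self-contained snippet, not driver code):

  #include <stdio.h>

  #define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

  int main(void)
  {
          /* e.g. a fixed header plus a 28-byte register array = 44 bytes */
          printf("%d\n", ALIGN(44, 16));          /* prints 48 */
          return 0;
  }

so a 44-byte context structure is followed by 4 bytes of zero padding and the next structure is read from the 48-byte boundary rather than from the middle of the padding.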
Signed-off-by: Patrick Rudolph Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/cper-arm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/efi/cper-arm.c b/drivers/firmware/efi/cper-arm.c index fa9c1c3bf168b..f0a63d09d3c49 100644 --- a/drivers/firmware/efi/cper-arm.c +++ b/drivers/firmware/efi/cper-arm.c @@ -311,7 +311,7 @@ void cper_print_proc_arm(const char *pfx, ctx_info = (struct cper_arm_ctx_info *)err_info; max_ctx_type = ARRAY_SIZE(arm_reg_ctx_strs) - 1; for (i = 0; i < proc->context_info_num; i++) { - int size = sizeof(*ctx_info) + ctx_info->size; + int size = ALIGN(sizeof(*ctx_info) + ctx_info->size, 16); printk("%sContext info structure %d:\n", pfx, i); if (len < size) { -- GitLab From cb6ae457bc6af58c84a7854df5e7e32ba1c6a715 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 23 Feb 2025 16:48:54 +0100 Subject: [PATCH 851/989] efivarfs: Defer PM notifier registration until .fill_super syzbot reports an issue that turns out to be caused by the fact that the efivarfs PM notifier may be invoked before the efivarfs_fs_info::sb field is populated, resulting in a NULL deference. So defer the registration until efivarfs_fill_super() is invoked. Reported-by: syzbot+00d13e505ef530a45100@syzkaller.appspotmail.com Tested-by: syzbot+00d13e505ef530a45100@syzkaller.appspotmail.com Signed-off-by: Ard Biesheuvel --- fs/efivarfs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 09fcf731e65d6..6eae8cf655c12 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -367,6 +367,8 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; + register_pm_notifier(&sfi->pm_nb); + return efivar_init(efivarfs_callback, sb, true); } @@ -552,7 +554,6 @@ static int efivarfs_init_fs_context(struct fs_context *fc) sfi->pm_nb.notifier_call = efivarfs_pm_notify; sfi->pm_nb.priority = 0; - register_pm_notifier(&sfi->pm_nb); return 0; } -- GitLab From d082ecbc71e9e0bf49883ee4afd435a77a5101b6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 23 Feb 2025 12:32:57 -0800 Subject: [PATCH 852/989] Linux 6.14-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 96407c1d6be16..30dab4c8b0120 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 14 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Baby Opossum Posse # *DOCUMENTATION* -- GitLab From cf3e6960263a2ecdf5528056b321e41557e9b03d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 22 Feb 2025 09:05:36 -0500 Subject: [PATCH 853/989] bcachefs: fix bch2_extent_ptr_eq() Reviewed-by: Thorsten Blum Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 620b284aa34f0..204d765dd74c8 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -704,7 +704,7 @@ static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, ptr1.unwritten == ptr2.unwritten && ptr1.offset == ptr2.offset && ptr1.dev == ptr2.dev && - ptr1.dev == ptr2.dev); + ptr1.gen == ptr2.gen); } void bch2_ptr_swab(struct bkey_s); -- GitLab From f15176b8b6e72ac30e14fd273282d2b72562d26b Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 20 Feb 2025 19:48:15 +0100 Subject: [PATCH 854/989] net: dsa: rtl8366rb: Fix compilation problem When the kernel is compiled without LED framework support the rtl8366rb fails to 
build like this: rtl8366rb.o: in function `rtl8366rb_setup_led': rtl8366rb.c:953:(.text.unlikely.rtl8366rb_setup_led+0xe8): undefined reference to `led_init_default_state_get' rtl8366rb.c:980:(.text.unlikely.rtl8366rb_setup_led+0x240): undefined reference to `devm_led_classdev_register_ext' As this is constantly coming up in different randconfig builds, bite the bullet and create a separate file for the offending code, split out a header with all stuff needed both in the core driver and the leds code. Add a new bool Kconfig option for the LED compile target, such that it depends on LEDS_CLASS=y || LEDS_CLASS=RTL8366RB which make LED support always available when LEDS_CLASS is compiled into the kernel and enforce that if the LEDS_CLASS is a module, then the RTL8366RB driver needs to be a module as well so that modprobe can resolve the dependencies. Fixes: 32d617005475 ("net: dsa: realtek: add LED drivers for rtl8366rb") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202502070525.xMUImayb-lkp@intel.com/ Signed-off-by: Linus Walleij Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/realtek/Kconfig | 6 + drivers/net/dsa/realtek/Makefile | 3 + drivers/net/dsa/realtek/rtl8366rb-leds.c | 177 ++++++++++++++++ drivers/net/dsa/realtek/rtl8366rb.c | 258 +---------------------- drivers/net/dsa/realtek/rtl8366rb.h | 107 ++++++++++ 5 files changed, 299 insertions(+), 252 deletions(-) create mode 100644 drivers/net/dsa/realtek/rtl8366rb-leds.c create mode 100644 drivers/net/dsa/realtek/rtl8366rb.h diff --git a/drivers/net/dsa/realtek/Kconfig b/drivers/net/dsa/realtek/Kconfig index 6989972eebc30..10687722d14c0 100644 --- a/drivers/net/dsa/realtek/Kconfig +++ b/drivers/net/dsa/realtek/Kconfig @@ -43,4 +43,10 @@ config NET_DSA_REALTEK_RTL8366RB help Select to enable support for Realtek RTL8366RB. 
+config NET_DSA_REALTEK_RTL8366RB_LEDS + bool "Support RTL8366RB LED control" + depends on (LEDS_CLASS=y || LEDS_CLASS=NET_DSA_REALTEK_RTL8366RB) + depends on NET_DSA_REALTEK_RTL8366RB + default NET_DSA_REALTEK_RTL8366RB + endif diff --git a/drivers/net/dsa/realtek/Makefile b/drivers/net/dsa/realtek/Makefile index 35491dc20d6d6..17367bcba496c 100644 --- a/drivers/net/dsa/realtek/Makefile +++ b/drivers/net/dsa/realtek/Makefile @@ -12,4 +12,7 @@ endif obj-$(CONFIG_NET_DSA_REALTEK_RTL8366RB) += rtl8366.o rtl8366-objs := rtl8366-core.o rtl8366rb.o +ifdef CONFIG_NET_DSA_REALTEK_RTL8366RB_LEDS +rtl8366-objs += rtl8366rb-leds.o +endif obj-$(CONFIG_NET_DSA_REALTEK_RTL8365MB) += rtl8365mb.o diff --git a/drivers/net/dsa/realtek/rtl8366rb-leds.c b/drivers/net/dsa/realtek/rtl8366rb-leds.c new file mode 100644 index 0000000000000..99c890681ae60 --- /dev/null +++ b/drivers/net/dsa/realtek/rtl8366rb-leds.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include "rtl83xx.h" +#include "rtl8366rb.h" + +static inline u32 rtl8366rb_led_group_port_mask(u8 led_group, u8 port) +{ + switch (led_group) { + case 0: + return FIELD_PREP(RTL8366RB_LED_0_X_CTRL_MASK, BIT(port)); + case 1: + return FIELD_PREP(RTL8366RB_LED_0_X_CTRL_MASK, BIT(port)); + case 2: + return FIELD_PREP(RTL8366RB_LED_0_X_CTRL_MASK, BIT(port)); + case 3: + return FIELD_PREP(RTL8366RB_LED_0_X_CTRL_MASK, BIT(port)); + default: + return 0; + } +} + +static int rb8366rb_get_port_led(struct rtl8366rb_led *led) +{ + struct realtek_priv *priv = led->priv; + u8 led_group = led->led_group; + u8 port_num = led->port_num; + int ret; + u32 val; + + ret = regmap_read(priv->map, RTL8366RB_LED_X_X_CTRL_REG(led_group), + &val); + if (ret) { + dev_err(priv->dev, "error reading LED on port %d group %d\n", + led_group, port_num); + return ret; + } + + return !!(val & rtl8366rb_led_group_port_mask(led_group, port_num)); +} + +static int rb8366rb_set_port_led(struct rtl8366rb_led *led, bool enable) +{ + struct realtek_priv *priv = led->priv; + u8 led_group = led->led_group; + u8 port_num = led->port_num; + int ret; + + ret = regmap_update_bits(priv->map, + RTL8366RB_LED_X_X_CTRL_REG(led_group), + rtl8366rb_led_group_port_mask(led_group, + port_num), + enable ? 
0xffff : 0); + if (ret) { + dev_err(priv->dev, "error updating LED on port %d group %d\n", + led_group, port_num); + return ret; + } + + /* Change the LED group to manual controlled LEDs if required */ + ret = rb8366rb_set_ledgroup_mode(priv, led_group, + RTL8366RB_LEDGROUP_FORCE); + + if (ret) { + dev_err(priv->dev, "error updating LED GROUP group %d\n", + led_group); + return ret; + } + + return 0; +} + +static int +rtl8366rb_cled_brightness_set_blocking(struct led_classdev *ldev, + enum led_brightness brightness) +{ + struct rtl8366rb_led *led = container_of(ldev, struct rtl8366rb_led, + cdev); + + return rb8366rb_set_port_led(led, brightness == LED_ON); +} + +static int rtl8366rb_setup_led(struct realtek_priv *priv, struct dsa_port *dp, + struct fwnode_handle *led_fwnode) +{ + struct rtl8366rb *rb = priv->chip_data; + struct led_init_data init_data = { }; + enum led_default_state state; + struct rtl8366rb_led *led; + u32 led_group; + int ret; + + ret = fwnode_property_read_u32(led_fwnode, "reg", &led_group); + if (ret) + return ret; + + if (led_group >= RTL8366RB_NUM_LEDGROUPS) { + dev_warn(priv->dev, "Invalid LED reg %d defined for port %d", + led_group, dp->index); + return -EINVAL; + } + + led = &rb->leds[dp->index][led_group]; + led->port_num = dp->index; + led->led_group = led_group; + led->priv = priv; + + state = led_init_default_state_get(led_fwnode); + switch (state) { + case LEDS_DEFSTATE_ON: + led->cdev.brightness = 1; + rb8366rb_set_port_led(led, 1); + break; + case LEDS_DEFSTATE_KEEP: + led->cdev.brightness = + rb8366rb_get_port_led(led); + break; + case LEDS_DEFSTATE_OFF: + default: + led->cdev.brightness = 0; + rb8366rb_set_port_led(led, 0); + } + + led->cdev.max_brightness = 1; + led->cdev.brightness_set_blocking = + rtl8366rb_cled_brightness_set_blocking; + init_data.fwnode = led_fwnode; + init_data.devname_mandatory = true; + + init_data.devicename = kasprintf(GFP_KERNEL, "Realtek-%d:0%d:%d", + dp->ds->index, dp->index, led_group); + if (!init_data.devicename) + return -ENOMEM; + + ret = devm_led_classdev_register_ext(priv->dev, &led->cdev, &init_data); + if (ret) { + dev_warn(priv->dev, "Failed to init LED %d for port %d", + led_group, dp->index); + return ret; + } + + return 0; +} + +int rtl8366rb_setup_leds(struct realtek_priv *priv) +{ + struct dsa_switch *ds = &priv->ds; + struct device_node *leds_np; + struct dsa_port *dp; + int ret = 0; + + dsa_switch_for_each_port(dp, ds) { + if (!dp->dn) + continue; + + leds_np = of_get_child_by_name(dp->dn, "leds"); + if (!leds_np) { + dev_dbg(priv->dev, "No leds defined for port %d", + dp->index); + continue; + } + + for_each_child_of_node_scoped(leds_np, led_np) { + ret = rtl8366rb_setup_led(priv, dp, + of_fwnode_handle(led_np)); + if (ret) + break; + } + + of_node_put(leds_np); + if (ret) + return ret; + } + return 0; +} diff --git a/drivers/net/dsa/realtek/rtl8366rb.c b/drivers/net/dsa/realtek/rtl8366rb.c index 4c4a95d4380ce..f54771cab56d4 100644 --- a/drivers/net/dsa/realtek/rtl8366rb.c +++ b/drivers/net/dsa/realtek/rtl8366rb.c @@ -27,11 +27,7 @@ #include "realtek-smi.h" #include "realtek-mdio.h" #include "rtl83xx.h" - -#define RTL8366RB_PORT_NUM_CPU 5 -#define RTL8366RB_NUM_PORTS 6 -#define RTL8366RB_PHY_NO_MAX 4 -#define RTL8366RB_PHY_ADDR_MAX 31 +#include "rtl8366rb.h" /* Switch Global Configuration register */ #define RTL8366RB_SGCR 0x0000 @@ -176,39 +172,6 @@ */ #define RTL8366RB_VLAN_INGRESS_CTRL2_REG 0x037f -/* LED control registers */ -/* The LED blink rate is global; it is used by all triggers in all groups. 
*/ -#define RTL8366RB_LED_BLINKRATE_REG 0x0430 -#define RTL8366RB_LED_BLINKRATE_MASK 0x0007 -#define RTL8366RB_LED_BLINKRATE_28MS 0x0000 -#define RTL8366RB_LED_BLINKRATE_56MS 0x0001 -#define RTL8366RB_LED_BLINKRATE_84MS 0x0002 -#define RTL8366RB_LED_BLINKRATE_111MS 0x0003 -#define RTL8366RB_LED_BLINKRATE_222MS 0x0004 -#define RTL8366RB_LED_BLINKRATE_446MS 0x0005 - -/* LED trigger event for each group */ -#define RTL8366RB_LED_CTRL_REG 0x0431 -#define RTL8366RB_LED_CTRL_OFFSET(led_group) \ - (4 * (led_group)) -#define RTL8366RB_LED_CTRL_MASK(led_group) \ - (0xf << RTL8366RB_LED_CTRL_OFFSET(led_group)) - -/* The RTL8366RB_LED_X_X registers are used to manually set the LED state only - * when the corresponding LED group in RTL8366RB_LED_CTRL_REG is - * RTL8366RB_LEDGROUP_FORCE. Otherwise, it is ignored. - */ -#define RTL8366RB_LED_0_1_CTRL_REG 0x0432 -#define RTL8366RB_LED_2_3_CTRL_REG 0x0433 -#define RTL8366RB_LED_X_X_CTRL_REG(led_group) \ - ((led_group) <= 1 ? \ - RTL8366RB_LED_0_1_CTRL_REG : \ - RTL8366RB_LED_2_3_CTRL_REG) -#define RTL8366RB_LED_0_X_CTRL_MASK GENMASK(5, 0) -#define RTL8366RB_LED_X_1_CTRL_MASK GENMASK(11, 6) -#define RTL8366RB_LED_2_X_CTRL_MASK GENMASK(5, 0) -#define RTL8366RB_LED_X_3_CTRL_MASK GENMASK(11, 6) - #define RTL8366RB_MIB_COUNT 33 #define RTL8366RB_GLOBAL_MIB_COUNT 1 #define RTL8366RB_MIB_COUNTER_PORT_OFFSET 0x0050 @@ -244,7 +207,6 @@ #define RTL8366RB_PORT_STATUS_AN_MASK 0x0080 #define RTL8366RB_NUM_VLANS 16 -#define RTL8366RB_NUM_LEDGROUPS 4 #define RTL8366RB_NUM_VIDS 4096 #define RTL8366RB_PRIORITYMAX 7 #define RTL8366RB_NUM_FIDS 8 @@ -351,46 +313,6 @@ #define RTL8366RB_GREEN_FEATURE_TX BIT(0) #define RTL8366RB_GREEN_FEATURE_RX BIT(2) -enum rtl8366_ledgroup_mode { - RTL8366RB_LEDGROUP_OFF = 0x0, - RTL8366RB_LEDGROUP_DUP_COL = 0x1, - RTL8366RB_LEDGROUP_LINK_ACT = 0x2, - RTL8366RB_LEDGROUP_SPD1000 = 0x3, - RTL8366RB_LEDGROUP_SPD100 = 0x4, - RTL8366RB_LEDGROUP_SPD10 = 0x5, - RTL8366RB_LEDGROUP_SPD1000_ACT = 0x6, - RTL8366RB_LEDGROUP_SPD100_ACT = 0x7, - RTL8366RB_LEDGROUP_SPD10_ACT = 0x8, - RTL8366RB_LEDGROUP_SPD100_10_ACT = 0x9, - RTL8366RB_LEDGROUP_FIBER = 0xa, - RTL8366RB_LEDGROUP_AN_FAULT = 0xb, - RTL8366RB_LEDGROUP_LINK_RX = 0xc, - RTL8366RB_LEDGROUP_LINK_TX = 0xd, - RTL8366RB_LEDGROUP_MASTER = 0xe, - RTL8366RB_LEDGROUP_FORCE = 0xf, - - __RTL8366RB_LEDGROUP_MODE_MAX -}; - -struct rtl8366rb_led { - u8 port_num; - u8 led_group; - struct realtek_priv *priv; - struct led_classdev cdev; -}; - -/** - * struct rtl8366rb - RTL8366RB-specific data - * @max_mtu: per-port max MTU setting - * @pvid_enabled: if PVID is set for respective port - * @leds: per-port and per-ledgroup led info - */ -struct rtl8366rb { - unsigned int max_mtu[RTL8366RB_NUM_PORTS]; - bool pvid_enabled[RTL8366RB_NUM_PORTS]; - struct rtl8366rb_led leds[RTL8366RB_NUM_PORTS][RTL8366RB_NUM_LEDGROUPS]; -}; - static struct rtl8366_mib_counter rtl8366rb_mib_counters[] = { { 0, 0, 4, "IfInOctets" }, { 0, 4, 4, "EtherStatsOctets" }, @@ -831,9 +753,10 @@ static int rtl8366rb_jam_table(const struct rtl8366rb_jam_tbl_entry *jam_table, return 0; } -static int rb8366rb_set_ledgroup_mode(struct realtek_priv *priv, - u8 led_group, - enum rtl8366_ledgroup_mode mode) +/* This code is used also with LEDs disabled */ +int rb8366rb_set_ledgroup_mode(struct realtek_priv *priv, + u8 led_group, + enum rtl8366_ledgroup_mode mode) { int ret; u32 val; @@ -850,144 +773,7 @@ static int rb8366rb_set_ledgroup_mode(struct realtek_priv *priv, return 0; } -static inline u32 rtl8366rb_led_group_port_mask(u8 led_group, u8 port) -{ - 
switch (led_group) { - case 0: - return FIELD_PREP(RTL8366RB_LED_0_X_CTRL_MASK, BIT(port)); - case 1: - return FIELD_PREP(RTL8366RB_LED_0_X_CTRL_MASK, BIT(port)); - case 2: - return FIELD_PREP(RTL8366RB_LED_0_X_CTRL_MASK, BIT(port)); - case 3: - return FIELD_PREP(RTL8366RB_LED_0_X_CTRL_MASK, BIT(port)); - default: - return 0; - } -} - -static int rb8366rb_get_port_led(struct rtl8366rb_led *led) -{ - struct realtek_priv *priv = led->priv; - u8 led_group = led->led_group; - u8 port_num = led->port_num; - int ret; - u32 val; - - ret = regmap_read(priv->map, RTL8366RB_LED_X_X_CTRL_REG(led_group), - &val); - if (ret) { - dev_err(priv->dev, "error reading LED on port %d group %d\n", - led_group, port_num); - return ret; - } - - return !!(val & rtl8366rb_led_group_port_mask(led_group, port_num)); -} - -static int rb8366rb_set_port_led(struct rtl8366rb_led *led, bool enable) -{ - struct realtek_priv *priv = led->priv; - u8 led_group = led->led_group; - u8 port_num = led->port_num; - int ret; - - ret = regmap_update_bits(priv->map, - RTL8366RB_LED_X_X_CTRL_REG(led_group), - rtl8366rb_led_group_port_mask(led_group, - port_num), - enable ? 0xffff : 0); - if (ret) { - dev_err(priv->dev, "error updating LED on port %d group %d\n", - led_group, port_num); - return ret; - } - - /* Change the LED group to manual controlled LEDs if required */ - ret = rb8366rb_set_ledgroup_mode(priv, led_group, - RTL8366RB_LEDGROUP_FORCE); - - if (ret) { - dev_err(priv->dev, "error updating LED GROUP group %d\n", - led_group); - return ret; - } - - return 0; -} - -static int -rtl8366rb_cled_brightness_set_blocking(struct led_classdev *ldev, - enum led_brightness brightness) -{ - struct rtl8366rb_led *led = container_of(ldev, struct rtl8366rb_led, - cdev); - - return rb8366rb_set_port_led(led, brightness == LED_ON); -} - -static int rtl8366rb_setup_led(struct realtek_priv *priv, struct dsa_port *dp, - struct fwnode_handle *led_fwnode) -{ - struct rtl8366rb *rb = priv->chip_data; - struct led_init_data init_data = { }; - enum led_default_state state; - struct rtl8366rb_led *led; - u32 led_group; - int ret; - - ret = fwnode_property_read_u32(led_fwnode, "reg", &led_group); - if (ret) - return ret; - - if (led_group >= RTL8366RB_NUM_LEDGROUPS) { - dev_warn(priv->dev, "Invalid LED reg %d defined for port %d", - led_group, dp->index); - return -EINVAL; - } - - led = &rb->leds[dp->index][led_group]; - led->port_num = dp->index; - led->led_group = led_group; - led->priv = priv; - - state = led_init_default_state_get(led_fwnode); - switch (state) { - case LEDS_DEFSTATE_ON: - led->cdev.brightness = 1; - rb8366rb_set_port_led(led, 1); - break; - case LEDS_DEFSTATE_KEEP: - led->cdev.brightness = - rb8366rb_get_port_led(led); - break; - case LEDS_DEFSTATE_OFF: - default: - led->cdev.brightness = 0; - rb8366rb_set_port_led(led, 0); - } - - led->cdev.max_brightness = 1; - led->cdev.brightness_set_blocking = - rtl8366rb_cled_brightness_set_blocking; - init_data.fwnode = led_fwnode; - init_data.devname_mandatory = true; - - init_data.devicename = kasprintf(GFP_KERNEL, "Realtek-%d:0%d:%d", - dp->ds->index, dp->index, led_group); - if (!init_data.devicename) - return -ENOMEM; - - ret = devm_led_classdev_register_ext(priv->dev, &led->cdev, &init_data); - if (ret) { - dev_warn(priv->dev, "Failed to init LED %d for port %d", - led_group, dp->index); - return ret; - } - - return 0; -} - +/* This code is used also with LEDs disabled */ static int rtl8366rb_setup_all_leds_off(struct realtek_priv *priv) { int ret = 0; @@ -1008,38 +794,6 @@ static 
int rtl8366rb_setup_all_leds_off(struct realtek_priv *priv) return ret; } -static int rtl8366rb_setup_leds(struct realtek_priv *priv) -{ - struct dsa_switch *ds = &priv->ds; - struct device_node *leds_np; - struct dsa_port *dp; - int ret = 0; - - dsa_switch_for_each_port(dp, ds) { - if (!dp->dn) - continue; - - leds_np = of_get_child_by_name(dp->dn, "leds"); - if (!leds_np) { - dev_dbg(priv->dev, "No leds defined for port %d", - dp->index); - continue; - } - - for_each_child_of_node_scoped(leds_np, led_np) { - ret = rtl8366rb_setup_led(priv, dp, - of_fwnode_handle(led_np)); - if (ret) - break; - } - - of_node_put(leds_np); - if (ret) - return ret; - } - return 0; -} - static int rtl8366rb_setup(struct dsa_switch *ds) { struct realtek_priv *priv = ds->priv; diff --git a/drivers/net/dsa/realtek/rtl8366rb.h b/drivers/net/dsa/realtek/rtl8366rb.h new file mode 100644 index 0000000000000..685ff3275faa1 --- /dev/null +++ b/drivers/net/dsa/realtek/rtl8366rb.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#ifndef _RTL8366RB_H +#define _RTL8366RB_H + +#include "realtek.h" + +#define RTL8366RB_PORT_NUM_CPU 5 +#define RTL8366RB_NUM_PORTS 6 +#define RTL8366RB_PHY_NO_MAX 4 +#define RTL8366RB_NUM_LEDGROUPS 4 +#define RTL8366RB_PHY_ADDR_MAX 31 + +/* LED control registers */ +/* The LED blink rate is global; it is used by all triggers in all groups. */ +#define RTL8366RB_LED_BLINKRATE_REG 0x0430 +#define RTL8366RB_LED_BLINKRATE_MASK 0x0007 +#define RTL8366RB_LED_BLINKRATE_28MS 0x0000 +#define RTL8366RB_LED_BLINKRATE_56MS 0x0001 +#define RTL8366RB_LED_BLINKRATE_84MS 0x0002 +#define RTL8366RB_LED_BLINKRATE_111MS 0x0003 +#define RTL8366RB_LED_BLINKRATE_222MS 0x0004 +#define RTL8366RB_LED_BLINKRATE_446MS 0x0005 + +/* LED trigger event for each group */ +#define RTL8366RB_LED_CTRL_REG 0x0431 +#define RTL8366RB_LED_CTRL_OFFSET(led_group) \ + (4 * (led_group)) +#define RTL8366RB_LED_CTRL_MASK(led_group) \ + (0xf << RTL8366RB_LED_CTRL_OFFSET(led_group)) + +/* The RTL8366RB_LED_X_X registers are used to manually set the LED state only + * when the corresponding LED group in RTL8366RB_LED_CTRL_REG is + * RTL8366RB_LEDGROUP_FORCE. Otherwise, it is ignored. + */ +#define RTL8366RB_LED_0_1_CTRL_REG 0x0432 +#define RTL8366RB_LED_2_3_CTRL_REG 0x0433 +#define RTL8366RB_LED_X_X_CTRL_REG(led_group) \ + ((led_group) <= 1 ? 
\ + RTL8366RB_LED_0_1_CTRL_REG : \ + RTL8366RB_LED_2_3_CTRL_REG) +#define RTL8366RB_LED_0_X_CTRL_MASK GENMASK(5, 0) +#define RTL8366RB_LED_X_1_CTRL_MASK GENMASK(11, 6) +#define RTL8366RB_LED_2_X_CTRL_MASK GENMASK(5, 0) +#define RTL8366RB_LED_X_3_CTRL_MASK GENMASK(11, 6) + +enum rtl8366_ledgroup_mode { + RTL8366RB_LEDGROUP_OFF = 0x0, + RTL8366RB_LEDGROUP_DUP_COL = 0x1, + RTL8366RB_LEDGROUP_LINK_ACT = 0x2, + RTL8366RB_LEDGROUP_SPD1000 = 0x3, + RTL8366RB_LEDGROUP_SPD100 = 0x4, + RTL8366RB_LEDGROUP_SPD10 = 0x5, + RTL8366RB_LEDGROUP_SPD1000_ACT = 0x6, + RTL8366RB_LEDGROUP_SPD100_ACT = 0x7, + RTL8366RB_LEDGROUP_SPD10_ACT = 0x8, + RTL8366RB_LEDGROUP_SPD100_10_ACT = 0x9, + RTL8366RB_LEDGROUP_FIBER = 0xa, + RTL8366RB_LEDGROUP_AN_FAULT = 0xb, + RTL8366RB_LEDGROUP_LINK_RX = 0xc, + RTL8366RB_LEDGROUP_LINK_TX = 0xd, + RTL8366RB_LEDGROUP_MASTER = 0xe, + RTL8366RB_LEDGROUP_FORCE = 0xf, + + __RTL8366RB_LEDGROUP_MODE_MAX +}; + +#if IS_ENABLED(CONFIG_NET_DSA_REALTEK_RTL8366RB_LEDS) + +struct rtl8366rb_led { + u8 port_num; + u8 led_group; + struct realtek_priv *priv; + struct led_classdev cdev; +}; + +int rtl8366rb_setup_leds(struct realtek_priv *priv); + +#else + +static inline int rtl8366rb_setup_leds(struct realtek_priv *priv) +{ + return 0; +} + +#endif /* IS_ENABLED(CONFIG_LEDS_CLASS) */ + +/** + * struct rtl8366rb - RTL8366RB-specific data + * @max_mtu: per-port max MTU setting + * @pvid_enabled: if PVID is set for respective port + * @leds: per-port and per-ledgroup led info + */ +struct rtl8366rb { + unsigned int max_mtu[RTL8366RB_NUM_PORTS]; + bool pvid_enabled[RTL8366RB_NUM_PORTS]; +#if IS_ENABLED(CONFIG_NET_DSA_REALTEK_RTL8366RB_LEDS) + struct rtl8366rb_led leds[RTL8366RB_NUM_PORTS][RTL8366RB_NUM_LEDGROUPS]; +#endif +}; + +/* This code is used also with LEDs disabled */ +int rb8366rb_set_ledgroup_mode(struct realtek_priv *priv, + u8 led_group, + enum rtl8366_ledgroup_mode mode); + +#endif /* _RTL8366RB_H */ -- GitLab From 02cfe2b6529c6c5fcf39d52a826927f4f93392af Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 24 Feb 2025 11:27:02 +0100 Subject: [PATCH 855/989] pidfs: remove d_op->d_delete Pidfs only deals with unhashed dentries and there's currently no way for them to become hashed. So remove d_op->d_delete. Signed-off-by: Christian Brauner --- fs/pidfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/pidfs.c b/fs/pidfs.c index 63f9699ebac36..c0478b3c55d9f 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -521,7 +521,6 @@ static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) } const struct dentry_operations pidfs_dentry_operations = { - .d_delete = always_delete_dentry, .d_dname = pidfs_dname, .d_prune = stashed_dentry_prune, }; -- GitLab From 425e3e3bd62c568a4365af0923d6ebad71a7dcfc Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 24 Feb 2025 11:30:35 +0100 Subject: [PATCH 856/989] nsfs: remove d_op->d_delete Nsfs only deals with unhashed dentries and there's currently no way for them to become hashed. So remove d_op->d_delete. 
Signed-off-by: Christian Brauner --- fs/nsfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nsfs.c b/fs/nsfs.c index 663f8656158d5..f7fddf8ecf73f 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -37,7 +37,6 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) } const struct dentry_operations ns_dentry_operations = { - .d_delete = always_delete_dentry, .d_dname = ns_dname, .d_prune = stashed_dentry_prune, }; -- GitLab From 36e1b81f599a093ec7477e4593e110104adcfb96 Mon Sep 17 00:00:00 2001 From: Ken Raeburn Date: Wed, 19 Feb 2025 17:56:00 -0500 Subject: [PATCH 857/989] dm vdo: add missing spin_lock_init Signed-off-by: Ken Raeburn Signed-off-by: Matthew Sakai Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org --- drivers/md/dm-vdo/dedupe.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c index b6f8e2dc7729f..3f3d29af1be47 100644 --- a/drivers/md/dm-vdo/dedupe.c +++ b/drivers/md/dm-vdo/dedupe.c @@ -2178,6 +2178,7 @@ static int initialize_index(struct vdo *vdo, struct hash_zones *zones) vdo_set_dedupe_index_timeout_interval(vdo_dedupe_index_timeout_interval); vdo_set_dedupe_index_min_timer_interval(vdo_dedupe_index_min_timer_interval); + spin_lock_init(&zones->lock); /* * Since we will save up the timeouts that would have been reported but were ratelimited, -- GitLab From 0fe8813baf4b2e865d3b2c735ce1a15b86002c74 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 17 Jan 2025 06:41:07 -0800 Subject: [PATCH 858/989] perf/core: Add RCU read lock protection to perf_iterate_ctx() The perf_iterate_ctx() function performs RCU list traversal but currently lacks RCU read lock protection. This causes lockdep warnings when running perf probe with unshare(1) under CONFIG_PROVE_RCU_LIST=y: WARNING: suspicious RCU usage kernel/events/core.c:8168 RCU-list traversed in non-reader section!! Call Trace: lockdep_rcu_suspicious ? perf_event_addr_filters_apply perf_iterate_ctx perf_event_exec begin_new_exec ? load_elf_phdrs load_elf_binary ? lock_acquire ? find_held_lock ? bprm_execve bprm_execve do_execveat_common.isra.0 __x64_sys_execve do_syscall_64 entry_SYSCALL_64_after_hwframe This protection was previously present but was removed in commit bd2756811766 ("perf: Rewrite core context handling"). Add back the necessary rcu_read_lock()/rcu_read_unlock() pair around perf_iterate_ctx() call in perf_event_exec(). 
[ mingo: Use scoped_guard() as suggested by Peter ] Fixes: bd2756811766 ("perf: Rewrite core context handling") Signed-off-by: Breno Leitao Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250117-fix_perf_rcu-v1-1-13cb9210fc6a@debian.org --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index bcb09e011e9e1..7dabbcaf825a0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8321,7 +8321,8 @@ void perf_event_exec(void) perf_event_enable_on_exec(ctx); perf_event_remove_on_exec(ctx); - perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); + scoped_guard(rcu) + perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); perf_unpin_context(ctx); put_ctx(ctx); -- GitLab From 2016066c66192a99d9e0ebf433789c490a6785a2 Mon Sep 17 00:00:00 2001 From: Luo Gengkun Date: Wed, 22 Jan 2025 07:33:56 +0000 Subject: [PATCH 859/989] perf/core: Order the PMU list to fix warning about unordered pmu_ctx_list Syskaller triggers a warning due to prev_epc->pmu != next_epc->pmu in perf_event_swap_task_ctx_data(). vmcore shows that two lists have the same perf_event_pmu_context, but not in the same order. The problem is that the order of pmu_ctx_list for the parent is impacted by the time when an event/PMU is added. While the order for a child is impacted by the event order in the pinned_groups and flexible_groups. So the order of pmu_ctx_list in the parent and child may be different. To fix this problem, insert the perf_event_pmu_context to its proper place after iteration of the pmu_ctx_list. The follow testcase can trigger above warning: # perf record -e cycles --call-graph lbr -- taskset -c 3 ./a.out & # perf stat -e cpu-clock,cs -p xxx // xxx is the pid of a.out test.c void main() { int count = 0; pid_t pid; printf("%d running\n", getpid()); sleep(30); printf("running\n"); pid = fork(); if (pid == -1) { printf("fork error\n"); return; } if (pid == 0) { while (1) { count++; } } else { while (1) { count++; } } } The testcase first opens an LBR event, so it will allocate task_ctx_data, and then open tracepoint and software events, so the parent context will have 3 different perf_event_pmu_contexts. On inheritance, child ctx will insert the perf_event_pmu_context in another order and the warning will trigger. [ mingo: Tidied up the changelog. 
] Fixes: bd2756811766 ("perf: Rewrite core context handling") Signed-off-by: Luo Gengkun Signed-off-by: Ingo Molnar Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20250122073356.1824736-1-luogengkun@huaweicloud.com --- kernel/events/core.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 7dabbcaf825a0..086d46d096963 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4950,7 +4950,7 @@ static struct perf_event_pmu_context * find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, struct perf_event *event) { - struct perf_event_pmu_context *new = NULL, *epc; + struct perf_event_pmu_context *new = NULL, *pos = NULL, *epc; void *task_ctx_data = NULL; if (!ctx->task) { @@ -5007,12 +5007,19 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, atomic_inc(&epc->refcount); goto found_epc; } + /* Make sure the pmu_ctx_list is sorted by PMU type: */ + if (!pos && epc->pmu->type > pmu->type) + pos = epc; } epc = new; new = NULL; - list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); + if (!pos) + list_add_tail(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); + else + list_add(&epc->pmu_ctx_entry, pos->pmu_ctx_entry.prev); + epc->ctx = ctx; found_epc: -- GitLab From bddf10d26e6e5114e7415a0e442ec6f51a559468 Mon Sep 17 00:00:00 2001 From: Tong Tiangen Date: Mon, 24 Feb 2025 11:11:49 +0800 Subject: [PATCH 860/989] uprobes: Reject the shared zeropage in uprobe_write_opcode() We triggered the following crash in syzkaller tests: BUG: Bad page state in process syz.7.38 pfn:1eff3 page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1eff3 flags: 0x3fffff00004004(referenced|reserved|node=0|zone=1|lastcpupid=0x1fffff) raw: 003fffff00004004 ffffe6c6c07bfcc8 ffffe6c6c07bfcc8 0000000000000000 raw: 0000000000000000 0000000000000000 00000000fffffffe 0000000000000000 page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Call Trace: dump_stack_lvl+0x32/0x50 bad_page+0x69/0xf0 free_unref_page_prepare+0x401/0x500 free_unref_page+0x6d/0x1b0 uprobe_write_opcode+0x460/0x8e0 install_breakpoint.part.0+0x51/0x80 register_for_each_vma+0x1d9/0x2b0 __uprobe_register+0x245/0x300 bpf_uprobe_multi_link_attach+0x29b/0x4f0 link_create+0x1e2/0x280 __sys_bpf+0x75f/0xac0 __x64_sys_bpf+0x1a/0x30 do_syscall_64+0x56/0x100 entry_SYSCALL_64_after_hwframe+0x78/0xe2 BUG: Bad rss-counter state mm:00000000452453e0 type:MM_FILEPAGES val:-1 The following syzkaller test case can be used to reproduce: r2 = creat(&(0x7f0000000000)='./file0\x00', 0x8) write$nbd(r2, &(0x7f0000000580)=ANY=[], 0x10) r4 = openat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x42, 0x0) mmap$IORING_OFF_SQ_RING(&(0x7f0000ffd000/0x3000)=nil, 0x3000, 0x0, 0x12, r4, 0x0) r5 = userfaultfd(0x80801) ioctl$UFFDIO_API(r5, 0xc018aa3f, &(0x7f0000000040)={0xaa, 0x20}) r6 = userfaultfd(0x80801) ioctl$UFFDIO_API(r6, 0xc018aa3f, &(0x7f0000000140)) ioctl$UFFDIO_REGISTER(r6, 0xc020aa00, &(0x7f0000000100)={{&(0x7f0000ffc000/0x4000)=nil, 0x4000}, 0x2}) ioctl$UFFDIO_ZEROPAGE(r5, 0xc020aa04, &(0x7f0000000000)={{&(0x7f0000ffd000/0x1000)=nil, 0x1000}}) r7 = bpf$PROG_LOAD(0x5, &(0x7f0000000140)={0x2, 0x3, &(0x7f0000000200)=ANY=[@ANYBLOB="1800000000120000000000000000000095"], &(0x7f0000000000)='GPL\x00', 0x7, 0x0, 0x0, 0x0, 0x0, '\x00', 0x0, @fallback=0x30, 0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x10, 0x0, @void, @value}, 0x94) 
bpf$BPF_LINK_CREATE_XDP(0x1c, &(0x7f0000000040)={r7, 0x0, 0x30, 0x1e, @val=@uprobe_multi={&(0x7f0000000080)='./file0\x00', &(0x7f0000000100)=[0x2], 0x0, 0x0, 0x1}}, 0x40) The cause is that zero pfn is set to the PTE without increasing the RSS count in mfill_atomic_pte_zeropage() and the refcount of zero folio does not increase accordingly. Then, the operation on the same pfn is performed in uprobe_write_opcode()->__replace_page() to unconditional decrease the RSS count and old_folio's refcount. Therefore, two bugs are introduced: 1. The RSS count is incorrect, when process exit, the check_mm() report error "Bad rss-count". 2. The reserved folio (zero folio) is freed when folio->refcount is zero, then free_pages_prepare->free_page_is_bad() report error "Bad page state". There is more, the following warning could also theoretically be triggered: __replace_page() -> ... -> folio_remove_rmap_pte() -> VM_WARN_ON_FOLIO(is_zero_folio(folio), folio) Considering that uprobe hit on the zero folio is a very rare case, just reject zero old folio immediately after get_user_page_vma_remote(). [ mingo: Cleaned up the changelog ] Fixes: 7396fa818d62 ("uprobes/core: Make background page replacement logic account for rss_stat counters") Fixes: 2b1444983508 ("uprobes, mm, x86: Add the ability to install and remove uprobes breakpoints") Signed-off-by: Tong Tiangen Signed-off-by: Ingo Molnar Reviewed-by: David Hildenbrand Reviewed-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Masami Hiramatsu Link: https://lore.kernel.org/r/20250224031149.1598949-1-tongtiangen@huawei.com --- kernel/events/uprobes.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index bf2a87a0a3787..af53fbd2d12c4 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -495,6 +495,11 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, if (ret <= 0) goto put_old; + if (is_zero_page(old_page)) { + ret = -EINVAL; + goto put_old; + } + if (WARN(!is_register && PageCompound(old_page), "uprobe unregister should never work on compound page\n")) { ret = -EINVAL; -- GitLab From 815291c11acda54515f1af5ce6fe307490de9127 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Feb 2025 08:28:59 -0800 Subject: [PATCH 861/989] configfs: update MAINTAINERS Joel will go back to maintain configfs alone on a time permitting basis. Signed-off-by: Christoph Hellwig Acked-by: Joel Becker Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4ff26fa94895d..089c1178f25a5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5856,7 +5856,6 @@ F: Documentation/security/snp-tdx-threat-model.rst CONFIGFS M: Joel Becker -M: Christoph Hellwig S: Supported T: git git://git.infradead.org/users/hch/configfs.git F: fs/configfs/ -- GitLab From f7d5db965f3e132887779c6b449452db2b807caa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Feb 2025 08:27:21 -0800 Subject: [PATCH 862/989] dma-mapping: update MAINTAINERS Marek has graciously offered to maintain the dma-mapping tree. 
Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 089c1178f25a5..a78eaaa24a699 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6878,7 +6878,6 @@ F: kernel/dma/map_benchmark.c F: tools/testing/selftests/dma/ DMA MAPPING HELPERS -M: Christoph Hellwig M: Marek Szyprowski R: Robin Murphy L: iommu@lists.linux.dev -- GitLab From e043dc16c28c8446e66c55adfe7c6e862a6a7bb7 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Fri, 21 Feb 2025 14:38:41 +0000 Subject: [PATCH 863/989] drm/xe/userptr: restore invalidation list on error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On error restore anything still on the pin_list back to the invalidation list on error. For the actual pin, so long as the vma is tracked on either list it should get picked up on the next pin, however it looks possible for the vma to get nuked but still be present on this per vm pin_list leading to corruption. An alternative might be then to instead just remove the link when destroying the vma. v2: - Also add some asserts. - Keep the overzealous locking so that we are consistent with the docs; updating the docs and related bits will be done as a follow up. Fixes: ed2bdf3b264d ("drm/xe/vm: Subclass userptr vmas") Suggested-by: Matthew Brost Signed-off-by: Matthew Auld Cc: Thomas Hellström Cc: # v6.8+ Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20250221143840.167150-4-matthew.auld@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 4e37e928928b730de9aa9a2f5dc853feeebc1742) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_vm.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 690330352d4cd..47f7d8f2094b6 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -666,15 +666,16 @@ int xe_vm_userptr_pin(struct xe_vm *vm) /* Collect invalidated userptrs */ spin_lock(&vm->userptr.invalidated_lock); + xe_assert(vm->xe, list_empty(&vm->userptr.repin_list)); list_for_each_entry_safe(uvma, next, &vm->userptr.invalidated, userptr.invalidate_link) { list_del_init(&uvma->userptr.invalidate_link); - list_move_tail(&uvma->userptr.repin_link, - &vm->userptr.repin_list); + list_add_tail(&uvma->userptr.repin_link, + &vm->userptr.repin_list); } spin_unlock(&vm->userptr.invalidated_lock); - /* Pin and move to temporary list */ + /* Pin and move to bind list */ list_for_each_entry_safe(uvma, next, &vm->userptr.repin_list, userptr.repin_link) { err = xe_vma_userptr_pin_pages(uvma); @@ -690,10 +691,10 @@ int xe_vm_userptr_pin(struct xe_vm *vm) err = xe_vm_invalidate_vma(&uvma->vma); xe_vm_unlock(vm); if (err) - return err; + break; } else { - if (err < 0) - return err; + if (err) + break; list_del_init(&uvma->userptr.repin_link); list_move_tail(&uvma->vma.combined_links.rebind, @@ -701,7 +702,19 @@ int xe_vm_userptr_pin(struct xe_vm *vm) } } - return 0; + if (err) { + down_write(&vm->userptr.notifier_lock); + spin_lock(&vm->userptr.invalidated_lock); + list_for_each_entry_safe(uvma, next, &vm->userptr.repin_list, + userptr.repin_link) { + list_del_init(&uvma->userptr.repin_link); + list_move_tail(&uvma->userptr.invalidate_link, + &vm->userptr.invalidated); + } + spin_unlock(&vm->userptr.invalidated_lock); + up_write(&vm->userptr.notifier_lock); + } + return err; } /** @@ -1066,6 +1079,7 @@ static void xe_vma_destroy(struct 
xe_vma *vma, struct dma_fence *fence) xe_assert(vm->xe, vma->gpuva.flags & XE_VMA_DESTROYED); spin_lock(&vm->userptr.invalidated_lock); + xe_assert(vm->xe, list_empty(&to_userptr_vma(vma)->userptr.repin_link)); list_del(&to_userptr_vma(vma)->userptr.invalidate_link); spin_unlock(&vm->userptr.invalidated_lock); } else if (!xe_vma_is_null(vma)) { -- GitLab From a9f4fa3a7efa65615ff7db13023ac84516e99e21 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Fri, 21 Feb 2025 14:38:42 +0000 Subject: [PATCH 864/989] drm/xe/userptr: fix EFAULT handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we treat EFAULT from hmm_range_fault() as a non-fatal error when called from xe_vm_userptr_pin() with the idea that we want to avoid killing the entire vm and chucking an error, under the assumption that the user just did an unmap or something, and has no intention of actually touching that memory from the GPU. At this point we have already zapped the PTEs so any access should generate a page fault, and if the pin fails there also it will then become fatal. However it looks like it's possible for the userptr vma to still be on the rebind list in preempt_rebind_work_func(), if we had to retry the pin again due to something happening in the caller before we did the rebind step, but in the meantime needing to re-validate the userptr and this time hitting the EFAULT. This explains an internal user report of hitting: [ 191.738349] WARNING: CPU: 1 PID: 157 at drivers/gpu/drm/xe/xe_res_cursor.h:158 xe_pt_stage_bind.constprop.0+0x60a/0x6b0 [xe] [ 191.738551] Workqueue: xe-ordered-wq preempt_rebind_work_func [xe] [ 191.738616] RIP: 0010:xe_pt_stage_bind.constprop.0+0x60a/0x6b0 [xe] [ 191.738690] Call Trace: [ 191.738692] [ 191.738694] ? show_regs+0x69/0x80 [ 191.738698] ? __warn+0x93/0x1a0 [ 191.738703] ? xe_pt_stage_bind.constprop.0+0x60a/0x6b0 [xe] [ 191.738759] ? report_bug+0x18f/0x1a0 [ 191.738764] ? handle_bug+0x63/0xa0 [ 191.738767] ? exc_invalid_op+0x19/0x70 [ 191.738770] ? asm_exc_invalid_op+0x1b/0x20 [ 191.738777] ? xe_pt_stage_bind.constprop.0+0x60a/0x6b0 [xe] [ 191.738834] ? ret_from_fork_asm+0x1a/0x30 [ 191.738849] bind_op_prepare+0x105/0x7b0 [xe] [ 191.738906] ? dma_resv_reserve_fences+0x301/0x380 [ 191.738912] xe_pt_update_ops_prepare+0x28c/0x4b0 [xe] [ 191.738966] ? kmemleak_alloc+0x4b/0x80 [ 191.738973] ops_execute+0x188/0x9d0 [xe] [ 191.739036] xe_vm_rebind+0x4ce/0x5a0 [xe] [ 191.739098] ? trace_hardirqs_on+0x4d/0x60 [ 191.739112] preempt_rebind_work_func+0x76f/0xd00 [xe] Followed by NPD, when running some workload, since the sg was never actually populated but the vma is still marked for rebind when it should be skipped for this special EFAULT case. This is confirmed to fix the user report. v2 (MattB): - Move earlier. v3 (MattB): - Update the commit message to make it clear that this indeed fixes the issue. 
Fixes: 521db22a1d70 ("drm/xe: Invalidate userptr VMA on page pin fault") Signed-off-by: Matthew Auld Cc: Matthew Brost Cc: Thomas Hellström Cc: # v6.10+ Reviewed-by: Matthew Brost Reviewed-by: Thomas Hellström Link: https://patchwork.freedesktop.org/patch/msgid/20250221143840.167150-5-matthew.auld@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 6b93cb98910c826c2e2004942f8b060311e43618) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_vm.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 47f7d8f2094b6..30259eba450b5 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -681,6 +681,18 @@ int xe_vm_userptr_pin(struct xe_vm *vm) err = xe_vma_userptr_pin_pages(uvma); if (err == -EFAULT) { list_del_init(&uvma->userptr.repin_link); + /* + * We might have already done the pin once already, but + * then had to retry before the re-bind happened, due + * some other condition in the caller, but in the + * meantime the userptr got dinged by the notifier such + * that we need to revalidate here, but this time we hit + * the EFAULT. In such a case make sure we remove + * ourselves from the rebind list to avoid going down in + * flames. + */ + if (!list_empty(&uvma->vma.combined_links.rebind)) + list_del_init(&uvma->vma.combined_links.rebind); /* Wait for pending binds */ xe_vm_lock(vm, false); -- GitLab From db10fde5c4f96231e1d2bbfd01feb5f2f59b96d1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 20 Feb 2025 18:51:40 -0800 Subject: [PATCH 865/989] net: ethtool: fix ioctl confusing drivers about desired HDS user config The legacy ioctl path does not have support for extended attributes. So we issue a GET to fetch the current settings from the driver, in an attempt to keep them unchanged. HDS is a bit "special" as the GET only returns on/off while the SET takes a "ternary" argument (on/off/default). If the driver was in the "default" setting - executing the ioctl path binds it to on or off, even tho the user did not intend to change HDS config. Factor the relevant logic out of the netlink code and reuse it. 
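A minimal standalone C model of that round-trip (the names and the "driver picks enabled by default" behaviour are illustrative assumptions, not the kernel code) shows how echoing back the GET state pins a tri-state setting, while echoing back the stored config leaves it untouched:

#include <stdio.h>

enum hds { HDS_UNKNOWN, HDS_DISABLED, HDS_ENABLED };	/* what SET accepts */

struct dev_model {
	enum hds cfg;		/* requested config, may be "unknown"/default */
	int hw_enabled;		/* what the hardware is currently doing */
};

/* Legacy GET: only on/off is visible, "default" cannot be expressed. */
static enum hds get_state(const struct dev_model *d)
{
	return d->hw_enabled ? HDS_ENABLED : HDS_DISABLED;
}

static void set_cfg(struct dev_model *d, enum hds c)
{
	d->cfg = c;
	/* with "unknown" the driver picks; assume it picks enabled here */
	d->hw_enabled = (c == HDS_DISABLED) ? 0 : 1;
}

int main(void)
{
	struct dev_model d = { .cfg = HDS_UNKNOWN, .hw_enabled = 1 };

	set_cfg(&d, get_state(&d));		/* old ioctl-path behaviour */
	printf("state round-trip: cfg=%d (default lost)\n", d.cfg);

	d.cfg = HDS_UNKNOWN;
	set_cfg(&d, d.cfg);			/* config round-trip (the fix) */
	printf("config round-trip: cfg=%d (default kept)\n", d.cfg);
	return 0;
}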
Fixes: 87c8f8496a05 ("bnxt_en: add support for tcp-data-split ethtool command") Acked-by: Stanislav Fomichev Tested-by: Daniel Xu Tested-by: Taehee Yoo Link: https://patch.msgid.link/20250221025141.1132944-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/common.c | 16 ++++++++++++++++ net/ethtool/common.h | 6 ++++++ net/ethtool/ioctl.c | 4 ++-- net/ethtool/rings.c | 9 ++++----- 4 files changed, 28 insertions(+), 7 deletions(-) diff --git a/net/ethtool/common.c b/net/ethtool/common.c index d88e9080643b8..b97374b508f67 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "netlink.h" #include "common.h" @@ -771,6 +772,21 @@ int ethtool_check_ops(const struct ethtool_ops *ops) return 0; } +void ethtool_ringparam_get_cfg(struct net_device *dev, + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kparam, + struct netlink_ext_ack *extack) +{ + memset(param, 0, sizeof(*param)); + memset(kparam, 0, sizeof(*kparam)); + + param->cmd = ETHTOOL_GRINGPARAM; + dev->ethtool_ops->get_ringparam(dev, param, kparam, extack); + + /* Driver gives us current state, we want to return current config */ + kparam->tcp_data_split = dev->cfg->hds_config; +} + static void ethtool_init_tsinfo(struct kernel_ethtool_ts_info *info) { memset(info, 0, sizeof(*info)); diff --git a/net/ethtool/common.h b/net/ethtool/common.h index 58e9e7db06f90..a1088c2441d0a 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -51,6 +51,12 @@ int ethtool_check_max_channel(struct net_device *dev, struct ethtool_channels channels, struct genl_info *info); int ethtool_check_rss_ctx_busy(struct net_device *dev, u32 rss_context); + +void ethtool_ringparam_get_cfg(struct net_device *dev, + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kparam, + struct netlink_ext_ack *extack); + int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info); int ethtool_get_ts_info_by_phc(struct net_device *dev, struct kernel_ethtool_ts_info *info, diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 7609ce2b2c5e2..1c3ba2247776b 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -2059,8 +2059,8 @@ static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) { - struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM }; struct kernel_ethtool_ringparam kernel_ringparam; + struct ethtool_ringparam ringparam, max; int ret; if (!dev->ethtool_ops->set_ringparam || !dev->ethtool_ops->get_ringparam) @@ -2069,7 +2069,7 @@ static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) return -EFAULT; - dev->ethtool_ops->get_ringparam(dev, &max, &kernel_ringparam, NULL); + ethtool_ringparam_get_cfg(dev, &max, &kernel_ringparam, NULL); /* ensure new ring parameters are within the maximums */ if (ringparam.rx_pending > max.rx_max_pending || diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c index 7839bfd1ac6a0..aeedd5ec6b8cd 100644 --- a/net/ethtool/rings.c +++ b/net/ethtool/rings.c @@ -215,17 +215,16 @@ ethnl_set_rings_validate(struct ethnl_req_info *req_info, static int ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info) { - struct kernel_ethtool_ringparam kernel_ringparam = {}; - struct ethtool_ringparam ringparam = {}; + struct kernel_ethtool_ringparam kernel_ringparam; struct 
net_device *dev = req_info->dev; + struct ethtool_ringparam ringparam; struct nlattr **tb = info->attrs; const struct nlattr *err_attr; bool mod = false; int ret; - dev->ethtool_ops->get_ringparam(dev, &ringparam, - &kernel_ringparam, info->extack); - kernel_ringparam.tcp_data_split = dev->cfg->hds_config; + ethtool_ringparam_get_cfg(dev, &ringparam, &kernel_ringparam, + info->extack); ethnl_update_u32(&ringparam.rx_pending, tb[ETHTOOL_A_RINGS_RX], &mod); ethnl_update_u32(&ringparam.rx_mini_pending, -- GitLab From 29b036be1b0bfcfc958380d5931325997fddf08a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 20 Feb 2025 18:51:41 -0800 Subject: [PATCH 866/989] selftests: drv-net: test XDP, HDS auto and the ioctl path Test XDP and HDS interaction. While at it add a test for using the IOCTL, as that turned out to be the real culprit. Testing bnxt: # NETIF=eth0 ./ksft-net-drv/drivers/net/hds.py KTAP version 1 1..12 ok 1 hds.get_hds ok 2 hds.get_hds_thresh ok 3 hds.set_hds_disable # SKIP disabling of HDS not supported by the device ok 4 hds.set_hds_enable ok 5 hds.set_hds_thresh_zero ok 6 hds.set_hds_thresh_max ok 7 hds.set_hds_thresh_gt ok 8 hds.set_xdp ok 9 hds.enabled_set_xdp ok 10 hds.ioctl ok 11 hds.ioctl_set_xdp ok 12 hds.ioctl_enabled_set_xdp # Totals: pass:11 fail:0 xfail:0 xpass:0 skip:1 error:0 and netdevsim: # ./ksft-net-drv/drivers/net/hds.py KTAP version 1 1..12 ok 1 hds.get_hds ok 2 hds.get_hds_thresh ok 3 hds.set_hds_disable ok 4 hds.set_hds_enable ok 5 hds.set_hds_thresh_zero ok 6 hds.set_hds_thresh_max ok 7 hds.set_hds_thresh_gt ok 8 hds.set_xdp ok 9 hds.enabled_set_xdp ok 10 hds.ioctl ok 11 hds.ioctl_set_xdp ok 12 hds.ioctl_enabled_set_xdp # Totals: pass:12 fail:0 xfail:0 xpass:0 skip:0 error:0 Netdevsim needs a sane default for tx/rx ring size. ethtool 6.11 is needed for the --disable-netlink option. 
Acked-by: Stanislav Fomichev Tested-by: Taehee Yoo Link: https://patch.msgid.link/20250221025141.1132944-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/netdevsim/ethtool.c | 2 + tools/testing/selftests/drivers/net/hds.py | 145 +++++++++++++++++- tools/testing/selftests/net/lib/Makefile | 3 + .../testing/selftests/net/lib/xdp_dummy.bpf.c | 13 ++ 4 files changed, 160 insertions(+), 3 deletions(-) create mode 100644 tools/testing/selftests/net/lib/xdp_dummy.bpf.c diff --git a/drivers/net/netdevsim/ethtool.c b/drivers/net/netdevsim/ethtool.c index 5c80fbee79138..7ab358616e035 100644 --- a/drivers/net/netdevsim/ethtool.c +++ b/drivers/net/netdevsim/ethtool.c @@ -184,9 +184,11 @@ static const struct ethtool_ops nsim_ethtool_ops = { static void nsim_ethtool_ring_init(struct netdevsim *ns) { + ns->ethtool.ring.rx_pending = 512; ns->ethtool.ring.rx_max_pending = 4096; ns->ethtool.ring.rx_jumbo_max_pending = 4096; ns->ethtool.ring.rx_mini_max_pending = 4096; + ns->ethtool.ring.tx_pending = 512; ns->ethtool.ring.tx_max_pending = 4096; } diff --git a/tools/testing/selftests/drivers/net/hds.py b/tools/testing/selftests/drivers/net/hds.py index 394971b25c0b1..873f5219e41d7 100755 --- a/tools/testing/selftests/drivers/net/hds.py +++ b/tools/testing/selftests/drivers/net/hds.py @@ -2,17 +2,54 @@ # SPDX-License-Identifier: GPL-2.0 import errno +import os from lib.py import ksft_run, ksft_exit, ksft_eq, ksft_raises, KsftSkipEx -from lib.py import EthtoolFamily, NlError +from lib.py import CmdExitFailure, EthtoolFamily, NlError from lib.py import NetDrvEnv +from lib.py import defer, ethtool, ip -def get_hds(cfg, netnl) -> None: + +def _get_hds_mode(cfg, netnl) -> str: try: rings = netnl.rings_get({'header': {'dev-index': cfg.ifindex}}) except NlError as e: raise KsftSkipEx('ring-get not supported by device') if 'tcp-data-split' not in rings: raise KsftSkipEx('tcp-data-split not supported by device') + return rings['tcp-data-split'] + + +def _xdp_onoff(cfg): + test_dir = os.path.dirname(os.path.realpath(__file__)) + prog = test_dir + "/../../net/lib/xdp_dummy.bpf.o" + ip("link set dev %s xdp obj %s sec xdp" % + (cfg.ifname, prog)) + ip("link set dev %s xdp off" % cfg.ifname) + + +def _ioctl_ringparam_modify(cfg, netnl) -> None: + """ + Helper for performing a hopefully unimportant IOCTL SET. + IOCTL does not support HDS, so it should not affect the HDS config. + """ + try: + rings = netnl.rings_get({'header': {'dev-index': cfg.ifindex}}) + except NlError as e: + raise KsftSkipEx('ring-get not supported by device') + + if 'tx' not in rings: + raise KsftSkipEx('setting Tx ring size not supported') + + try: + ethtool(f"--disable-netlink -G {cfg.ifname} tx {rings['tx'] // 2}") + except CmdExitFailure as e: + ethtool(f"--disable-netlink -G {cfg.ifname} tx {rings['tx'] * 2}") + defer(ethtool, f"-G {cfg.ifname} tx {rings['tx']}") + + +def get_hds(cfg, netnl) -> None: + _get_hds_mode(cfg, netnl) + def get_hds_thresh(cfg, netnl) -> None: try: @@ -104,6 +141,103 @@ def set_hds_thresh_gt(cfg, netnl) -> None: netnl.rings_set({'header': {'dev-index': cfg.ifindex}, 'hds-thresh': hds_gt}) ksft_eq(e.exception.nl_msg.error, -errno.EINVAL) + +def set_xdp(cfg, netnl) -> None: + """ + Enable single-buffer XDP on the device. + When HDS is in "auto" / UNKNOWN mode, XDP installation should work. 
+ """ + mode = _get_hds_mode(cfg, netnl) + if mode == 'enabled': + netnl.rings_set({'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'unknown'}) + + _xdp_onoff(cfg) + + +def enabled_set_xdp(cfg, netnl) -> None: + """ + Enable single-buffer XDP on the device. + When HDS is in "enabled" mode, XDP installation should not work. + """ + _get_hds_mode(cfg, netnl) + netnl.rings_set({'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'enabled'}) + + defer(netnl.rings_set, {'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'unknown'}) + + with ksft_raises(CmdExitFailure) as e: + _xdp_onoff(cfg) + + +def set_xdp(cfg, netnl) -> None: + """ + Enable single-buffer XDP on the device. + When HDS is in "auto" / UNKNOWN mode, XDP installation should work. + """ + mode = _get_hds_mode(cfg, netnl) + if mode == 'enabled': + netnl.rings_set({'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'unknown'}) + + _xdp_onoff(cfg) + + +def enabled_set_xdp(cfg, netnl) -> None: + """ + Enable single-buffer XDP on the device. + When HDS is in "enabled" mode, XDP installation should not work. + """ + _get_hds_mode(cfg, netnl) # Trigger skip if not supported + + netnl.rings_set({'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'enabled'}) + defer(netnl.rings_set, {'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'unknown'}) + + with ksft_raises(CmdExitFailure) as e: + _xdp_onoff(cfg) + + +def ioctl(cfg, netnl) -> None: + mode1 = _get_hds_mode(cfg, netnl) + _ioctl_ringparam_modify(cfg, netnl) + mode2 = _get_hds_mode(cfg, netnl) + + ksft_eq(mode1, mode2) + + +def ioctl_set_xdp(cfg, netnl) -> None: + """ + Like set_xdp(), but we perturb the settings via the legacy ioctl. + """ + mode = _get_hds_mode(cfg, netnl) + if mode == 'enabled': + netnl.rings_set({'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'unknown'}) + + _ioctl_ringparam_modify(cfg, netnl) + + _xdp_onoff(cfg) + + +def ioctl_enabled_set_xdp(cfg, netnl) -> None: + """ + Enable single-buffer XDP on the device. + When HDS is in "enabled" mode, XDP installation should not work. 
+ """ + _get_hds_mode(cfg, netnl) # Trigger skip if not supported + + netnl.rings_set({'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'enabled'}) + defer(netnl.rings_set, {'header': {'dev-index': cfg.ifindex}, + 'tcp-data-split': 'unknown'}) + + with ksft_raises(CmdExitFailure) as e: + _xdp_onoff(cfg) + + def main() -> None: with NetDrvEnv(__file__, queue_count=3) as cfg: ksft_run([get_hds, @@ -112,7 +246,12 @@ def main() -> None: set_hds_enable, set_hds_thresh_zero, set_hds_thresh_max, - set_hds_thresh_gt], + set_hds_thresh_gt, + set_xdp, + enabled_set_xdp, + ioctl, + ioctl_set_xdp, + ioctl_enabled_set_xdp], args=(cfg, EthtoolFamily())) ksft_exit() diff --git a/tools/testing/selftests/net/lib/Makefile b/tools/testing/selftests/net/lib/Makefile index bc6b6762baf3e..c22623b9a2a5f 100644 --- a/tools/testing/selftests/net/lib/Makefile +++ b/tools/testing/selftests/net/lib/Makefile @@ -9,7 +9,10 @@ TEST_FILES := ../../../../../Documentation/netlink/specs TEST_FILES += ../../../../net/ynl TEST_GEN_FILES += csum +TEST_GEN_FILES += $(patsubst %.c,%.o,$(wildcard *.bpf.c)) TEST_INCLUDES := $(wildcard py/*.py sh/*.sh) include ../../lib.mk + +include ../bpf.mk diff --git a/tools/testing/selftests/net/lib/xdp_dummy.bpf.c b/tools/testing/selftests/net/lib/xdp_dummy.bpf.c new file mode 100644 index 0000000000000..d988b2e0cee84 --- /dev/null +++ b/tools/testing/selftests/net/lib/xdp_dummy.bpf.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define KBUILD_MODNAME "xdp_dummy" +#include +#include + +SEC("xdp") +int xdp_dummy_prog(struct xdp_md *ctx) +{ + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; -- GitLab From b5799106b44e1df594f4696500dbbc3b326bba18 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 24 Feb 2025 15:45:38 +0000 Subject: [PATCH 867/989] iomap: Minor code simplification in iomap_dio_bio_iter() Combine 'else' and 'if' conditional statements onto a single line and drop unrequired braces, as is standard coding style. The code had been like this since commit c3b0e880bbfa ("iomap: support REQ_OP_ZONE_APPEND"). Signed-off-by: John Garry Link: https://lore.kernel.org/r/20250224154538.548028-1-john.g.garry@oracle.com Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/direct-io.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b521eb15759e8..0e47da82b0c24 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -427,12 +427,10 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, bio_put(bio); goto zero_tail; } - if (dio->flags & IOMAP_DIO_WRITE) { + if (dio->flags & IOMAP_DIO_WRITE) task_io_account_write(n); - } else { - if (dio->flags & IOMAP_DIO_DIRTY) - bio_set_pages_dirty(bio); - } + else if (dio->flags & IOMAP_DIO_DIRTY) + bio_set_pages_dirty(bio); dio->size += n; copied += n; -- GitLab From 423de5b5bc5b267586b449abd1c4fde562aa0cf9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 21 Feb 2025 17:57:11 +0100 Subject: [PATCH 868/989] thermal/of: Fix cdev lookup in thermal_of_should_bind() Since thermal_of_should_bind() terminates the loop after processing the first child found in cooling-maps, it will never match more than one cdev to a given trip point which is incorrect, as there may be cooling-maps associating one trip point with multiple cooling devices. Address this by letting the loop continue until either all children have been processed or a matching one has been found. 
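A standalone sketch of the loop shape (made-up entries, not the driver code) illustrates why breaking out after the first entry for a trip can miss a valid cooling device listed in a later cooling-maps entry:

#include <stdbool.h>
#include <stdio.h>

struct map_entry { int trip; int cdev; };

static bool lookup(const struct map_entry *map, int n, int trip, int cdev)
{
	for (int i = 0; i < n; i++) {
		if (map[i].trip != trip)
			continue;		/* wrong trip, keep scanning */
		if (map[i].cdev == cdev)
			return true;		/* trip and cdev both match */
		/* breaking here instead of continuing reproduces the bug:
		 * the later {trip 1, cdev 12} entry below would never be
		 * reached when the first trip-1 entry does not match */
	}
	return false;
}

int main(void)
{
	const struct map_entry map[] = { { 0, 10 }, { 1, 11 }, { 1, 12 } };

	printf("trip 1, cdev 12 -> %d\n", lookup(map, 3, 1, 12));	/* 1 */
	printf("trip 1, cdev 13 -> %d\n", lookup(map, 3, 1, 13));	/* 0 */
	return 0;
}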
To avoid adding conditionals or goto statements, put the loop in question into a separate function and make that function return right away after finding a matching cooling-maps entry. Fixes: 94c6110b0b13 ("thermal/of: Use the .should_bind() thermal zone callback") Link: https://lore.kernel.org/linux-pm/20250219-fix-thermal-of-v1-1-de36e7a590c4@chromium.org/ Reported-by: Yu-Che Cheng Signed-off-by: Rafael J. Wysocki Reviewed-by: Yu-Che Cheng Tested-by: Yu-Che Cheng Reviewed-by: Lukasz Luba Tested-by: Lukasz Luba Link: https://patch.msgid.link/2788228.mvXUDI8C0e@rjwysocki.net --- drivers/thermal/thermal_of.c | 50 +++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c index 5ab4ce4daaebd..5401f03d6b6c1 100644 --- a/drivers/thermal/thermal_of.c +++ b/drivers/thermal/thermal_of.c @@ -274,6 +274,34 @@ static bool thermal_of_get_cooling_spec(struct device_node *map_np, int index, return true; } +static bool thermal_of_cm_lookup(struct device_node *cm_np, + const struct thermal_trip *trip, + struct thermal_cooling_device *cdev, + struct cooling_spec *c) +{ + for_each_child_of_node_scoped(cm_np, child) { + struct device_node *tr_np; + int count, i; + + tr_np = of_parse_phandle(child, "trip", 0); + if (tr_np != trip->priv) + continue; + + /* The trip has been found, look up the cdev. */ + count = of_count_phandle_with_args(child, "cooling-device", + "#cooling-cells"); + if (count <= 0) + pr_err("Add a cooling_device property with at least one device\n"); + + for (i = 0; i < count; i++) { + if (thermal_of_get_cooling_spec(child, i, cdev, c)) + return true; + } + } + + return false; +} + static bool thermal_of_should_bind(struct thermal_zone_device *tz, const struct thermal_trip *trip, struct thermal_cooling_device *cdev, @@ -293,27 +321,7 @@ static bool thermal_of_should_bind(struct thermal_zone_device *tz, goto out; /* Look up the trip and the cdev in the cooling maps. */ - for_each_child_of_node_scoped(cm_np, child) { - struct device_node *tr_np; - int count, i; - - tr_np = of_parse_phandle(child, "trip", 0); - if (tr_np != trip->priv) - continue; - - /* The trip has been found, look up the cdev. */ - count = of_count_phandle_with_args(child, "cooling-device", "#cooling-cells"); - if (count <= 0) - pr_err("Add a cooling_device property with at least one device\n"); - - for (i = 0; i < count; i++) { - result = thermal_of_get_cooling_spec(child, i, cdev, c); - if (result) - break; - } - - break; - } + result = thermal_of_cm_lookup(cm_np, trip, cdev, c); of_node_put(cm_np); out: -- GitLab From 0cde378a10c1cbfaa8dd2b89672d42f36c2809c3 Mon Sep 17 00:00:00 2001 From: Yu-Che Cheng Date: Sat, 22 Feb 2025 11:20:34 +0800 Subject: [PATCH 869/989] thermal: gov_power_allocator: Update total_weight on bind and cdev updates params->total_weight is not initialized during bind and not updated when the bound cdev changes. The cooling device weight will not be used due to the uninitialized total_weight, until an update via sysfs is triggered. The bound cdevs are updated during thermal zone registration, where each cooling device will be bound to the thermal zone one by one, but power_allocator_bind() can be called without an additional cdev update when manually changing the policy of a thermal zone via sysfs. Add a new function to handle weight update logic, including updating total_weight, and call it when bind, weight changes, and cdev updates to ensure total_weight is always correct. 
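The shape of the fix is a plain recompute-the-cache helper; a self-contained sketch of that pattern (illustrative names and integer weights, not the governor code) is:

#include <stdio.h>

struct actor { int weight; int valid; };
struct params { int total_weight; };	/* cached sum of valid weights */

static void update_weight(struct params *p, const struct actor *a, int n)
{
	p->total_weight = 0;
	for (int i = 0; i < n; i++)
		if (a[i].valid)
			p->total_weight += a[i].weight;
}

int main(void)
{
	struct actor actors[] = { { 256, 1 }, { 768, 1 }, { 512, 0 } };
	struct params p = { 0 };

	update_weight(&p, actors, 3);			/* on bind */
	printf("total_weight = %d\n", p.total_weight);	/* 1024 */

	actors[2].valid = 1;				/* a cooling device update */
	update_weight(&p, actors, 3);			/* recompute the cache */
	printf("total_weight = %d\n", p.total_weight);	/* 1536 */
	return 0;
}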
Fixes: a3cd6db4cc2e ("thermal: gov_power_allocator: Support new update callback of weights") Signed-off-by: Yu-Che Cheng Link: https://patch.msgid.link/20250222-fix-power-allocator-weight-v2-1-a94de86b685a@chromium.org [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/thermal/gov_power_allocator.c | 30 ++++++++++++++++++++------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c index 3b626db55b2b9..0d9f636c80f4d 100644 --- a/drivers/thermal/gov_power_allocator.c +++ b/drivers/thermal/gov_power_allocator.c @@ -641,6 +641,22 @@ static int allocate_actors_buffer(struct power_allocator_params *params, return ret; } +static void power_allocator_update_weight(struct power_allocator_params *params) +{ + const struct thermal_trip_desc *td; + struct thermal_instance *instance; + + if (!params->trip_max) + return; + + td = trip_to_trip_desc(params->trip_max); + + params->total_weight = 0; + list_for_each_entry(instance, &td->thermal_instances, trip_node) + if (power_actor_is_valid(instance)) + params->total_weight += instance->weight; +} + static void power_allocator_update_tz(struct thermal_zone_device *tz, enum thermal_notify_event reason) { @@ -656,16 +672,12 @@ static void power_allocator_update_tz(struct thermal_zone_device *tz, if (power_actor_is_valid(instance)) num_actors++; - if (num_actors == params->num_actors) - return; + if (num_actors != params->num_actors) + allocate_actors_buffer(params, num_actors); - allocate_actors_buffer(params, num_actors); - break; + fallthrough; case THERMAL_INSTANCE_WEIGHT_CHANGED: - params->total_weight = 0; - list_for_each_entry(instance, &td->thermal_instances, trip_node) - if (power_actor_is_valid(instance)) - params->total_weight += instance->weight; + power_allocator_update_weight(params); break; default: break; @@ -731,6 +743,8 @@ static int power_allocator_bind(struct thermal_zone_device *tz) tz->governor_data = params; + power_allocator_update_weight(params); + return 0; free_params: -- GitLab From de2c211868b9424f9aa9b3432c4430825bafb41b Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Sat, 22 Feb 2025 11:35:18 +0800 Subject: [PATCH 870/989] ipvs: Always clear ipvs_property flag in skb_scrub_packet() We found an issue when using bpf_redirect with ipvs NAT mode after commit ff70202b2d1a ("dev_forward_skb: do not scrub skb mark within the same name space"). Particularly, we use bpf_redirect to return the skb directly back to the netif it comes from, i.e., xnet is false in skb_scrub_packet(), and then ipvs_property is preserved and SNAT is skipped in the rx path. ipvs_property has been already cleared when netns is changed in commit 2b5ec1a5f973 ("netfilter/ipvs: clear ipvs_property flag when SKB net namespace changed"). This patch just clears it in spite of netns. 
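The essence of the change is moving an unconditional reset ahead of the same-namespace early return; a small standalone model (field names only mimic the skb, this is not the networking code):

#include <stdbool.h>
#include <stdio.h>

struct pkt { bool ipvs_property; unsigned int mark; };

static void scrub(struct pkt *p, bool xnet)
{
	p->ipvs_property = false;	/* always cleared, as in the fix */

	if (!xnet)
		return;			/* same netns: keep mark and tstamp */

	p->mark = 0;			/* cross-netns-only scrubbing */
}

int main(void)
{
	struct pkt p = { .ipvs_property = true, .mark = 42 };

	scrub(&p, false);		/* redirect within the same netns */
	printf("ipvs_property=%d mark=%u\n", p.ipvs_property, p.mark);
	return 0;
}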
Fixes: 2b5ec1a5f973 ("netfilter/ipvs: clear ipvs_property flag when SKB net namespace changed") Signed-off-by: Philo Lu Acked-by: Julian Anastasov Link: https://patch.msgid.link/20250222033518.126087-1-lulie@linux.alibaba.com Signed-off-by: Paolo Abeni --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7b03b64fdcb27..b1c81687e9d82 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -6033,11 +6033,11 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->offload_fwd_mark = 0; skb->offload_l3_fwd_mark = 0; #endif + ipvs_reset(skb); if (!xnet) return; - ipvs_reset(skb); skb->mark = 0; skb_clear_tstamp(skb); } -- GitLab From bc50682128bde778a1ddc457a02d92a637c20c6f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sat, 22 Feb 2025 12:28:04 -0500 Subject: [PATCH 871/989] MAINTAINERS: socket timestamping: add Jason Xing as reviewer Jason has been helping as reviewer for this area already, and has contributed various features directly, notably BPF timestamping. Also extend coverage to all timestamping tests, including those new with BPF timestamping. Link: https://lore.kernel.org/netdev/20250220072940.99994-1-kerneljasonxing@gmail.com/ Signed-off-by: Willem de Bruijn Reviewed-by: Jason Xing Link: https://patch.msgid.link/20250222172839.642079-1-willemdebruijn.kernel@gmail.com Signed-off-by: Paolo Abeni --- MAINTAINERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index ac15093537c6b..c92bcd02049e6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21923,10 +21923,13 @@ F: sound/soc/uniphier/ SOCKET TIMESTAMPING M: Willem de Bruijn +R: Jason Xing S: Maintained F: Documentation/networking/timestamping.rst F: include/linux/net_tstamp.h F: include/uapi/linux/net_tstamp.h +F: tools/testing/selftests/bpf/*/net_timestamping* +F: tools/testing/selftests/net/*timestamp* F: tools/testing/selftests/net/so_txtime.c SOEKRIS NET48XX LED SUPPORT -- GitLab From 88ec7eedbbd21cad38707620ad6c48a4e9a87c18 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 17 Jan 2025 07:19:11 -0800 Subject: [PATCH 872/989] perf/x86: Fix low freqency setting issue Perf doesn't work at low frequencies: $ perf record -e cpu_core/instructions/ppp -F 120 Error: The sys_perf_event_open() syscall returned with 22 (Invalid argument) for event (cpu_core/instructions/ppp). "dmesg | grep -i perf" may provide additional information. The limit_period() check avoids a low sampling period on a counter. It doesn't intend to limit the frequency. The check in the x86_pmu_hw_config() should be limited to non-freq mode. The attr.sample_period and attr.sample_freq are union. The attr.sample_period should not be used to indicate the frequency mode. 
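For reference, the failing case from the message can be reproduced from userspace with a plain perf_event_open() call in frequency mode. This is an untested sketch, but the attr fields used here (the freq bit, and sample_freq sharing a union with sample_period) are the ones the commit is describing:

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <string.h>
    #include <stdio.h>

    int main(void)
    {
            struct perf_event_attr attr;
            int fd;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_INSTRUCTIONS;
            attr.freq = 1;                  /* frequency mode, not period mode */
            attr.sample_freq = 120;         /* shares the union with sample_period */
            attr.disabled = 1;
            attr.exclude_kernel = 1;

            fd = syscall(SYS_perf_event_open, &attr, 0 /* this task */,
                         -1 /* any cpu */, -1, 0);
            if (fd < 0)
                    perror("perf_event_open");
            else
                    puts("opened 120 Hz sampling event");
            return 0;
    }

Before the fix, an open like this could come back with EINVAL on hardware that defines limit_period; with the fix, the limit_period check only applies when a fixed sample_period is requested.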
Fixes: c46e665f0377 ("perf/x86: Add INST_RETIRED.ALL workarounds") Signed-off-by: Kan Liang Signed-off-by: Ingo Molnar Reviewed-by: Ravi Bangoria Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250117151913.3043942-1-kan.liang@linux.intel.com Closes: https://lore.kernel.org/lkml/20250115154949.3147-1-ravi.bangoria@amd.com/ --- arch/x86/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 8f218ac0d445c..2092d615333da 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -628,7 +628,7 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.type == event->pmu->type) event->hw.config |= x86_pmu_get_event_config(event); - if (event->attr.sample_period && x86_pmu.limit_period) { + if (!event->attr.freq && x86_pmu.limit_period) { s64 left = event->attr.sample_period; x86_pmu.limit_period(event, &left); if (left > event->attr.sample_period) -- GitLab From 0d39844150546fa1415127c5fbae26db64070dd3 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 17 Jan 2025 07:19:12 -0800 Subject: [PATCH 873/989] perf/core: Fix low freq setting via IOC_PERIOD A low attr::freq value cannot be set via IOC_PERIOD on some platforms. The perf_event_check_period() introduced in: 81ec3f3c4c4d ("perf/x86: Add check_period PMU callback") was intended to check the period, rather than the frequency. A low frequency may be mistakenly rejected by limit_period(). Fix it. Fixes: 81ec3f3c4c4d ("perf/x86: Add check_period PMU callback") Signed-off-by: Kan Liang Signed-off-by: Ingo Molnar Reviewed-by: Ravi Bangoria Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250117151913.3043942-2-kan.liang@linux.intel.com Closes: https://lore.kernel.org/lkml/20250115154949.3147-1-ravi.bangoria@amd.com/ --- kernel/events/core.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 086d46d096963..6364319e2f888 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5969,14 +5969,15 @@ static int _perf_event_period(struct perf_event *event, u64 value) if (!value) return -EINVAL; - if (event->attr.freq && value > sysctl_perf_event_sample_rate) - return -EINVAL; - - if (perf_event_check_period(event, value)) - return -EINVAL; - - if (!event->attr.freq && (value & (1ULL << 63))) - return -EINVAL; + if (event->attr.freq) { + if (value > sysctl_perf_event_sample_rate) + return -EINVAL; + } else { + if (perf_event_check_period(event, value)) + return -EINVAL; + if (value & (1ULL << 63)) + return -EINVAL; + } event_function_call(event, __perf_event_period, &value); -- GitLab From 5bd566703e16b17d17f4fb648440d54f8967462c Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Fri, 21 Feb 2025 13:33:52 -0800 Subject: [PATCH 874/989] drm/xe/oa: Allow oa_exponent value of 0 OA exponent value of 0 is a valid value for periodic reports. Allow user to pass 0 for the OA sampling interval since it gets converted to 2 gt clock ticks. 
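As a side note on the numbers involved: the sentinel becomes -1 and any exponent >= 0 is accepted. Assuming the power-of-two relationship implied by the message (exponent 0 maps to 2 ticks, i.e. period = 2^(exponent + 1) GT clock ticks), a toy calculation looks like this; the names are illustrative, not the driver's:

    #include <stdio.h>
    #include <stdint.h>

    #define OA_EXPONENT_UNSET       (-1)    /* sentinel used by the fix */

    /* assumption: period = 2^(exponent + 1) GT clock ticks, matching
     * "0 ... gets converted to 2 gt clock ticks" above */
    static uint64_t oa_period_ticks(int exponent)
    {
            return 1ull << (exponent + 1);
    }

    int main(void)
    {
            int exponent = OA_EXPONENT_UNSET;

            exponent = 0;   /* pretend the open ioctl parsed this from the user */

            if (exponent >= 0)      /* the corrected check, instead of "> 0" */
                    printf("periodic reports every %llu ticks\n",
                           (unsigned long long)oa_period_ticks(exponent));
            else
                    puts("periodic sampling disabled");
            return 0;
    }
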
v2: Update the check in xe_oa_stream_init as well (Ashutosh) v3: Fix mi-rpc failure by setting default exponent to -1 (CI) v4: Add the Fixes tag Fixes: b6fd51c62119 ("drm/xe/oa/uapi: Define and parse OA stream properties") Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Ashutosh Dixit Link: https://patchwork.freedesktop.org/patch/msgid/20250221213352.1712932-1-umesh.nerlige.ramappa@intel.com (cherry picked from commit 30341f0b8ea71725cc4ab2c43e3a3b749892fc92) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_oa.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c index fa873f3d0a9d1..eb6cd91e1e226 100644 --- a/drivers/gpu/drm/xe/xe_oa.c +++ b/drivers/gpu/drm/xe/xe_oa.c @@ -1689,7 +1689,7 @@ static int xe_oa_stream_init(struct xe_oa_stream *stream, stream->oa_buffer.format = &stream->oa->oa_formats[param->oa_format]; stream->sample = param->sample; - stream->periodic = param->period_exponent > 0; + stream->periodic = param->period_exponent >= 0; stream->period_exponent = param->period_exponent; stream->no_preempt = param->no_preempt; stream->wait_num_reports = param->wait_num_reports; @@ -1970,6 +1970,7 @@ int xe_oa_stream_open_ioctl(struct drm_device *dev, u64 data, struct drm_file *f } param.xef = xef; + param.period_exponent = -1; ret = xe_oa_user_extensions(oa, XE_OA_USER_EXTN_FROM_OPEN, data, 0, ¶m); if (ret) return ret; @@ -2024,7 +2025,7 @@ int xe_oa_stream_open_ioctl(struct drm_device *dev, u64 data, struct drm_file *f goto err_exec_q; } - if (param.period_exponent > 0) { + if (param.period_exponent >= 0) { u64 oa_period, oa_freq_hz; /* Requesting samples from OAG buffer is a privileged operation */ -- GitLab From 8ec43c58d3be615a71548bc09148212013fb7e5f Mon Sep 17 00:00:00 2001 From: Harry Wentland Date: Thu, 19 Dec 2024 21:33:08 -0700 Subject: [PATCH 875/989] drm/vkms: Round fixp2int conversion in lerp_u16 fixp2int always rounds down, fixp2int_ceil rounds up. We need the new fixp2int_round. Signed-off-by: Alex Hung Signed-off-by: Harry Wentland Reviewed-by: Louis Chauvet Link: https://patchwork.freedesktop.org/patch/msgid/20241220043410.416867-3-alex.hung@amd.com Signed-off-by: Louis Chauvet --- drivers/gpu/drm/vkms/vkms_composer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/vkms/vkms_composer.c b/drivers/gpu/drm/vkms/vkms_composer.c index b20ac17057262..fa269d279e257 100644 --- a/drivers/gpu/drm/vkms/vkms_composer.c +++ b/drivers/gpu/drm/vkms/vkms_composer.c @@ -67,7 +67,7 @@ static u16 lerp_u16(u16 a, u16 b, s64 t) s64 delta = drm_fixp_mul(b_fp - a_fp, t); - return drm_fixp2int(a_fp + delta); + return drm_fixp2int_round(a_fp + delta); } static s64 get_lut_index(const struct vkms_color_lut *lut, u16 channel_value) -- GitLab From 889c57066ceee5e9172232da0608a8ac053bb6e5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 25 Feb 2025 10:21:41 +0800 Subject: [PATCH 876/989] block: make segment size limit workable for > 4K PAGE_SIZE Using PAGE_SIZE as a minimum expected DMA segment size in consideration of devices which have a max DMA segment size of < 64k when used on 64k PAGE_SIZE systems leads to devices not being able to probe such as eMMC and Exynos UFS controller [0] [1] you can end up with a probe failure as follows: WARNING: CPU: 2 PID: 397 at block/blk-settings.c:339 blk_validate_limits+0x364/0x3c0 Ensure we use min(max_seg_size, seg_boundary_mask + 1) as the new min segment size when max segment size is < PAGE_SIZE for 16k and 64k base page size systems. 
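The derivation of the new limit can be written out in a few lines. This mirrors the blk_validate_limits() hunk below, but as a self-contained userspace function with plain integer parameters instead of struct queue_limits:

    #include <stdio.h>

    #define MIN(a, b)       ((a) < (b) ? (a) : (b))

    static unsigned long min_segment_size(unsigned long seg_boundary_mask,
                                          unsigned long max_segment_size,
                                          unsigned long page_size)
    {
            unsigned long seg_size;

            if (seg_boundary_mask > max_segment_size - 1)
                    seg_size = max_segment_size;
            else
                    seg_size = seg_boundary_mask + 1;

            return MIN(seg_size, page_size);
    }

    int main(void)
    {
            /* device limited to 32 KiB segments on a 64 KiB page kernel */
            printf("%lu\n", min_segment_size(0xffffffffUL, 32 * 1024, 64 * 1024));
            /* ordinary 4 KiB page system stays capped at PAGE_SIZE */
            printf("%lu\n", min_segment_size(0xffffffffUL, 65536, 4096));
            return 0;
    }
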
If anyone need to backport this patch, the following commits are depended: commit 6aeb4f836480 ("block: remove bio_add_pc_page") commit 02ee5d69e3ba ("block: remove blk_rq_bio_prep") commit b7175e24d6ac ("block: add a dma mapping iterator") Link: https://lore.kernel.org/linux-block/20230612203314.17820-1-bvanassche@acm.org/ # [0] Link: https://lore.kernel.org/linux-block/1d55e942-5150-de4c-3a02-c3d066f87028@acm.org/ # [1] Cc: Yi Zhang Cc: John Garry Cc: Keith Busch Tested-by: Paul Bunyan Reviewed-by: Daniel Gomez Reviewed-by: Luis Chamberlain Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250225022141.2154581-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 +- block/blk-settings.c | 14 +++++++++++--- block/blk.h | 9 +++++++-- include/linux/blkdev.h | 1 + 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index c7c85e10cf9cb..1d1589c352976 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -329,7 +329,7 @@ int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, if (nsegs < lim->max_segments && bytes + bv.bv_len <= max_bytes && - bv.bv_offset + bv.bv_len <= PAGE_SIZE) { + bv.bv_offset + bv.bv_len <= lim->min_segment_size) { nsegs++; bytes += bv.bv_len; } else { diff --git a/block/blk-settings.c b/block/blk-settings.c index c44dadc35e1ec..b9c6f0ec1c499 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -246,6 +246,7 @@ int blk_validate_limits(struct queue_limits *lim) { unsigned int max_hw_sectors; unsigned int logical_block_sectors; + unsigned long seg_size; int err; /* @@ -303,7 +304,7 @@ int blk_validate_limits(struct queue_limits *lim) max_hw_sectors = min_not_zero(lim->max_hw_sectors, lim->max_dev_sectors); if (lim->max_user_sectors) { - if (lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE) + if (lim->max_user_sectors < BLK_MIN_SEGMENT_SIZE / SECTOR_SIZE) return -EINVAL; lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors); } else if (lim->io_opt > (BLK_DEF_MAX_SECTORS_CAP << SECTOR_SHIFT)) { @@ -341,7 +342,7 @@ int blk_validate_limits(struct queue_limits *lim) */ if (!lim->seg_boundary_mask) lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; - if (WARN_ON_ONCE(lim->seg_boundary_mask < PAGE_SIZE - 1)) + if (WARN_ON_ONCE(lim->seg_boundary_mask < BLK_MIN_SEGMENT_SIZE - 1)) return -EINVAL; /* @@ -362,10 +363,17 @@ int blk_validate_limits(struct queue_limits *lim) */ if (!lim->max_segment_size) lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; - if (WARN_ON_ONCE(lim->max_segment_size < PAGE_SIZE)) + if (WARN_ON_ONCE(lim->max_segment_size < BLK_MIN_SEGMENT_SIZE)) return -EINVAL; } + /* setup min segment size for building new segment in fast path */ + if (lim->seg_boundary_mask > lim->max_segment_size - 1) + seg_size = lim->max_segment_size; + else + seg_size = lim->seg_boundary_mask + 1; + lim->min_segment_size = min_t(unsigned int, seg_size, PAGE_SIZE); + /* * We require drivers to at least do logical block aligned I/O, but * historically could not check for that due to the separate calls diff --git a/block/blk.h b/block/blk.h index 90fa5f28ccabf..9cf9a0099416d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -14,6 +14,7 @@ struct elevator_type; #define BLK_DEV_MAX_SECTORS (LLONG_MAX >> 9) +#define BLK_MIN_SEGMENT_SIZE 4096 /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) @@ -358,8 +359,12 @@ struct bio *bio_split_zone_append(struct bio *bio, static inline bool bio_may_need_split(struct bio *bio, const struct queue_limits 
*lim) { - return lim->chunk_sectors || bio->bi_vcnt != 1 || - bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE; + if (lim->chunk_sectors) + return true; + if (bio->bi_vcnt != 1) + return true; + return bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > + lim->min_segment_size; } /** diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 248416ecd01c9..58ff5aca83b67 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -367,6 +367,7 @@ struct queue_limits { unsigned int max_sectors; unsigned int max_user_sectors; unsigned int max_segment_size; + unsigned int min_segment_size; unsigned int physical_block_size; unsigned int logical_block_size; unsigned int alignment_offset; -- GitLab From 6ebf05189dfc6d0d597c99a6448a4d1064439a18 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 25 Feb 2025 15:59:02 +0000 Subject: [PATCH 877/989] io_uring/net: save msg_control for compat Match the compat part of io_sendmsg_copy_hdr() with its counterpart and save msg_control. Fixes: c55978024d123 ("io_uring/net: move receive multishot out of the generic msghdr path") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2a8418821fe83d3b64350ad2b3c0303e9b732bbd.1740498502.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/net.c b/io_uring/net.c index 17852a6616ffe..5d0b56ff50eed 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -322,7 +322,9 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req, if (unlikely(ret)) return ret; - return __get_compat_msghdr(&iomsg->msg, &cmsg, NULL); + ret = __get_compat_msghdr(&iomsg->msg, &cmsg, NULL); + sr->msg_control = iomsg->msg.msg_control_user; + return ret; } #endif -- GitLab From 91dcc66b34beb72dde8412421bdc1b4cd40e4fb8 Mon Sep 17 00:00:00 2001 From: "chr[]" Date: Wed, 12 Feb 2025 16:51:38 +0100 Subject: [PATCH 878/989] amdgpu/pm/legacy: fix suspend/resume issues resume and irq handler happily races in set_power_state() * amdgpu_legacy_dpm_compute_clocks() needs lock * protect irq work handler * fix dpm_enabled usage v2: fix clang build, integrate Lijo's comments (Alex) Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/2524 Fixes: 3712e7a49459 ("drm/amd/pm: unified lock protections in amdgpu_dpm.c") Reviewed-by: Lijo Lazar Tested-by: Maciej S. 
Szmigiero # on Oland PRO Signed-off-by: chr[] Signed-off-by: Alex Deucher (cherry picked from commit ee3dc9e204d271c9c7a8d4d38a0bce4745d33e71) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c | 25 +++++++++++++----- .../gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c | 8 ++++-- drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c | 26 ++++++++++++++----- 3 files changed, 45 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c b/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c index 67a8e22b1126d..e237ea1185a71 100644 --- a/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c +++ b/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c @@ -3042,6 +3042,7 @@ static int kv_dpm_hw_init(struct amdgpu_ip_block *ip_block) if (!amdgpu_dpm) return 0; + mutex_lock(&adev->pm.mutex); kv_dpm_setup_asic(adev); ret = kv_dpm_enable(adev); if (ret) @@ -3049,6 +3050,8 @@ static int kv_dpm_hw_init(struct amdgpu_ip_block *ip_block) else adev->pm.dpm_enabled = true; amdgpu_legacy_dpm_compute_clocks(adev); + mutex_unlock(&adev->pm.mutex); + return ret; } @@ -3066,32 +3069,42 @@ static int kv_dpm_suspend(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; + cancel_work_sync(&adev->pm.dpm.thermal.work); + if (adev->pm.dpm_enabled) { + mutex_lock(&adev->pm.mutex); + adev->pm.dpm_enabled = false; /* disable dpm */ kv_dpm_disable(adev); /* reset the power state */ adev->pm.dpm.current_ps = adev->pm.dpm.requested_ps = adev->pm.dpm.boot_ps; + mutex_unlock(&adev->pm.mutex); } return 0; } static int kv_dpm_resume(struct amdgpu_ip_block *ip_block) { - int ret; + int ret = 0; struct amdgpu_device *adev = ip_block->adev; - if (adev->pm.dpm_enabled) { + if (!amdgpu_dpm) + return 0; + + if (!adev->pm.dpm_enabled) { + mutex_lock(&adev->pm.mutex); /* asic init will reset to the boot state */ kv_dpm_setup_asic(adev); ret = kv_dpm_enable(adev); - if (ret) + if (ret) { adev->pm.dpm_enabled = false; - else + } else { adev->pm.dpm_enabled = true; - if (adev->pm.dpm_enabled) amdgpu_legacy_dpm_compute_clocks(adev); + } + mutex_unlock(&adev->pm.mutex); } - return 0; + return ret; } static bool kv_dpm_is_idle(void *handle) diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c b/drivers/gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c index e861355ebd75b..c7518b13e7879 100644 --- a/drivers/gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c +++ b/drivers/gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c @@ -1009,9 +1009,12 @@ void amdgpu_dpm_thermal_work_handler(struct work_struct *work) enum amd_pm_state_type dpm_state = POWER_STATE_TYPE_INTERNAL_THERMAL; int temp, size = sizeof(temp); - if (!adev->pm.dpm_enabled) - return; + mutex_lock(&adev->pm.mutex); + if (!adev->pm.dpm_enabled) { + mutex_unlock(&adev->pm.mutex); + return; + } if (!pp_funcs->read_sensor(adev->powerplay.pp_handle, AMDGPU_PP_SENSOR_GPU_TEMP, (void *)&temp, @@ -1033,4 +1036,5 @@ void amdgpu_dpm_thermal_work_handler(struct work_struct *work) adev->pm.dpm.state = dpm_state; amdgpu_legacy_dpm_compute_clocks(adev->powerplay.pp_handle); + mutex_unlock(&adev->pm.mutex); } diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c index a87dcf0974bc1..d6dfe2599ebea 100644 --- a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c +++ b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c @@ -7786,6 +7786,7 @@ static int si_dpm_hw_init(struct amdgpu_ip_block *ip_block) if (!amdgpu_dpm) return 0; + mutex_lock(&adev->pm.mutex); si_dpm_setup_asic(adev); ret = si_dpm_enable(adev); if (ret) @@ -7793,6 +7794,7 @@ static int si_dpm_hw_init(struct 
amdgpu_ip_block *ip_block) else adev->pm.dpm_enabled = true; amdgpu_legacy_dpm_compute_clocks(adev); + mutex_unlock(&adev->pm.mutex); return ret; } @@ -7810,32 +7812,44 @@ static int si_dpm_suspend(struct amdgpu_ip_block *ip_block) { struct amdgpu_device *adev = ip_block->adev; + cancel_work_sync(&adev->pm.dpm.thermal.work); + if (adev->pm.dpm_enabled) { + mutex_lock(&adev->pm.mutex); + adev->pm.dpm_enabled = false; /* disable dpm */ si_dpm_disable(adev); /* reset the power state */ adev->pm.dpm.current_ps = adev->pm.dpm.requested_ps = adev->pm.dpm.boot_ps; + mutex_unlock(&adev->pm.mutex); } + return 0; } static int si_dpm_resume(struct amdgpu_ip_block *ip_block) { - int ret; + int ret = 0; struct amdgpu_device *adev = ip_block->adev; - if (adev->pm.dpm_enabled) { + if (!amdgpu_dpm) + return 0; + + if (!adev->pm.dpm_enabled) { /* asic init will reset to the boot state */ + mutex_lock(&adev->pm.mutex); si_dpm_setup_asic(adev); ret = si_dpm_enable(adev); - if (ret) + if (ret) { adev->pm.dpm_enabled = false; - else + } else { adev->pm.dpm_enabled = true; - if (adev->pm.dpm_enabled) amdgpu_legacy_dpm_compute_clocks(adev); + } + mutex_unlock(&adev->pm.mutex); } - return 0; + + return ret; } static bool si_dpm_is_idle(void *handle) -- GitLab From 3502ab5022bb5ef1edd063bdb6465a8bf3b46e66 Mon Sep 17 00:00:00 2001 From: David Yat Sin Date: Wed, 19 Feb 2025 17:34:38 -0500 Subject: [PATCH 879/989] drm/amdkfd: Preserve cp_hqd_pq_control on update_mqd When userspace applications call AMDKFD_IOC_UPDATE_QUEUE. Preserve bitfields that do not need to be modified as they contain flags to track queue states that are used by CP FW. Signed-off-by: David Yat Sin Reviewed-by: Jay Cornwall Signed-off-by: Alex Deucher (cherry picked from commit 8150827990b709ab5a40c46c30d21b7f7b9e9440) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c | 6 ++++-- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c | 5 +++-- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c | 5 +++-- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 5 ++++- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c index 2eff37aaf8273..1695dd78ede8e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c @@ -107,6 +107,8 @@ static void init_mqd(struct mqd_manager *mm, void **mqd, m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; m->cp_mqd_base_addr_lo = lower_32_bits(addr); @@ -167,10 +169,10 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, m = get_mqd(mqd); - m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control &= ~CP_HQD_PQ_CONTROL__QUEUE_SIZE_MASK; m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; - m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; + pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c index 68dbc0399c87a..3c0ae28c5923b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c +++ 
b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c @@ -154,6 +154,8 @@ static void init_mqd(struct mqd_manager *mm, void **mqd, m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | 0x55 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; m->cp_mqd_base_addr_lo = lower_32_bits(addr); @@ -221,10 +223,9 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, m = get_mqd(mqd); - m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control &= ~CP_HQD_PQ_CONTROL__QUEUE_SIZE_MASK; m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; - m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c index 2b72d5b4949b6..565858b9044d4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c @@ -121,6 +121,8 @@ static void init_mqd(struct mqd_manager *mm, void **mqd, m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | 0x55 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; m->cp_mqd_base_addr_lo = lower_32_bits(addr); @@ -184,10 +186,9 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, m = get_mqd(mqd); - m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control &= ~CP_HQD_PQ_CONTROL__QUEUE_SIZE_MASK; m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; - m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c index ff417d5361c42..3014925d95ffc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -183,6 +183,9 @@ static void init_mqd(struct mqd_manager *mm, void **mqd, m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__UNORD_DISPATCH_MASK; + m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; m->cp_mqd_base_addr_lo = lower_32_bits(addr); @@ -245,7 +248,7 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, m = get_mqd(mqd); - m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control &= ~CP_HQD_PQ_CONTROL__QUEUE_SIZE_MASK; m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); -- GitLab From 099bffc7cadff40bfab1517c3461c53a7a38a0d7 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 17 Feb 2025 10:55:05 -0500 Subject: [PATCH 880/989] drm/amdgpu: disable BAR resize on Dell G5 SE There was a quirk added to add a workaround for a 
Sapphire RX 5600 XT Pulse that didn't allow BAR resizing. However, the quirk caused a regression with runtime pm on Dell laptops using those chips, rather than narrowing the scope of the resizing quirk, add a quirk to prevent amdgpu from resizing the BAR on those Dell platforms unless runtime pm is disabled. v2: update commit message, add runpm check Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/1707 Fixes: 907830b0fc9e ("PCI: Add a REBAR size quirk for Sapphire RX 5600 XT Pulse") Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher (cherry picked from commit 5235053f443cef4210606e5fb71f99b915a9723d) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index d100bb7a137cd..018dfccd771ba 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1638,6 +1638,13 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) if (amdgpu_sriov_vf(adev)) return 0; + /* resizing on Dell G5 SE platforms causes problems with runtime pm */ + if ((amdgpu_runtime_pm != 0) && + adev->pdev->vendor == PCI_VENDOR_ID_ATI && + adev->pdev->device == 0x731f && + adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL) + return 0; + /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) DRM_WARN("System can't access extended configuration space, please check!!\n"); -- GitLab From 8005351c7d53c31fb7eb5a423da7ab4bc3ad7639 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 11 Feb 2025 15:38:20 -0500 Subject: [PATCH 881/989] MAINTAINERS: update amdgpu maintainers list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Xinhui's email is no longer valid. Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit c19390ca9094dfcbc16d96b233a409c01e21d85b) Cc: stable@vger.kernel.org --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4ff26fa94895d..d6ea828345fdb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19657,7 +19657,6 @@ F: drivers/net/wireless/quantenna RADEON and AMDGPU DRM DRIVERS M: Alex Deucher M: Christian König -M: Xinhui Pan L: amd-gfx@lists.freedesktop.org S: Supported B: https://gitlab.freedesktop.org/drm/amd/-/issues -- GitLab From e7ea88207cef513514e706aacc534527ac88b9b8 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 13 Feb 2025 13:37:01 -0500 Subject: [PATCH 882/989] drm/amdgpu/gfx: only call mes for enforce isolation if supported This should not be called on chips without MES so check if MES is enabled and if the cleaner shader is supported. 
Fixes: 8521e3c5f058 ("drm/amd/amdgpu: limit single process inside MES") Reviewed-by: Srinivasan Shanmugam Signed-off-by: Alex Deucher Cc: Shaoyun Liu Cc: Srinivasan Shanmugam (cherry picked from commit 80513e389765c8f9543b26d8fa4bbdf0e59ff8bc) --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 784b03abb3a43..c6aff3ddb42d7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1643,11 +1643,13 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev, if (adev->enforce_isolation[i] && !partition_values[i]) { /* Going from enabled to disabled */ amdgpu_vmid_free_reserved(adev, AMDGPU_GFXHUB(i)); - amdgpu_mes_set_enforce_isolation(adev, i, false); + if (adev->enable_mes && adev->gfx.enable_cleaner_shader) + amdgpu_mes_set_enforce_isolation(adev, i, false); } else if (!adev->enforce_isolation[i] && partition_values[i]) { /* Going from disabled to enabled */ amdgpu_vmid_alloc_reserved(adev, AMDGPU_GFXHUB(i)); - amdgpu_mes_set_enforce_isolation(adev, i, true); + if (adev->enable_mes && adev->gfx.enable_cleaner_shader) + amdgpu_mes_set_enforce_isolation(adev, i, true); } adev->enforce_isolation[i] = partition_values[i]; } -- GitLab From 748a1f51bb74453f1fe22d3ca68a717cb31f02e5 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 14 Feb 2025 12:32:30 -0500 Subject: [PATCH 883/989] drm/amdgpu/mes: keep enforce isolation up to date Re-send the mes message on resume to make sure the mes state is up to date. Fixes: 8521e3c5f058 ("drm/amd/amdgpu: limit single process inside MES") Acked-by: Srinivasan Shanmugam Signed-off-by: Alex Deucher Cc: Shaoyun Liu Cc: Srinivasan Shanmugam Signed-off-by: Alex Deucher (cherry picked from commit 27b791514789844e80da990c456c2465325e0851) --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 13 ++++--------- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 20 +++++++++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 2 +- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 4 ++++ drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 4 ++++ 5 files changed, 32 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index c6aff3ddb42d7..c1f35ded684e8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1638,24 +1638,19 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev, } mutex_lock(&adev->enforce_isolation_mutex); - for (i = 0; i < num_partitions; i++) { - if (adev->enforce_isolation[i] && !partition_values[i]) { + if (adev->enforce_isolation[i] && !partition_values[i]) /* Going from enabled to disabled */ amdgpu_vmid_free_reserved(adev, AMDGPU_GFXHUB(i)); - if (adev->enable_mes && adev->gfx.enable_cleaner_shader) - amdgpu_mes_set_enforce_isolation(adev, i, false); - } else if (!adev->enforce_isolation[i] && partition_values[i]) { + else if (!adev->enforce_isolation[i] && partition_values[i]) /* Going from disabled to enabled */ amdgpu_vmid_alloc_reserved(adev, AMDGPU_GFXHUB(i)); - if (adev->enable_mes && adev->gfx.enable_cleaner_shader) - amdgpu_mes_set_enforce_isolation(adev, i, true); - } adev->enforce_isolation[i] = partition_values[i]; } - mutex_unlock(&adev->enforce_isolation_mutex); + amdgpu_mes_update_enforce_isolation(adev); + return count; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index 
32b27a1658e78..709c11cbeabd8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -1681,7 +1681,8 @@ bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev) } /* Fix me -- node_id is used to identify the correct MES instances in the future */ -int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, uint32_t node_id, bool enable) +static int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, + uint32_t node_id, bool enable) { struct mes_misc_op_input op_input = {0}; int r; @@ -1703,6 +1704,23 @@ int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, uint32_t node_i return r; } +int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev) +{ + int i, r = 0; + + if (adev->enable_mes && adev->gfx.enable_cleaner_shader) { + mutex_lock(&adev->enforce_isolation_mutex); + for (i = 0; i < (adev->xcp_mgr ? adev->xcp_mgr->num_xcps : 1); i++) { + if (adev->enforce_isolation[i]) + r |= amdgpu_mes_set_enforce_isolation(adev, i, true); + else + r |= amdgpu_mes_set_enforce_isolation(adev, i, false); + } + mutex_unlock(&adev->enforce_isolation_mutex); + } + return r; +} + #if defined(CONFIG_DEBUG_FS) static int amdgpu_debugfs_mes_event_log_show(struct seq_file *m, void *unused) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index 2df2444ee892c..e98ea7ede1bab 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -534,6 +534,6 @@ static inline void amdgpu_mes_unlock(struct amdgpu_mes *mes) bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev); -int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, uint32_t node_id, bool enable); +int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev); #endif /* __AMDGPU_MES_H__ */ diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 65f389eb65e5f..f9a4d08eef925 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -1633,6 +1633,10 @@ static int mes_v11_0_hw_init(struct amdgpu_ip_block *ip_block) goto failure; } + r = amdgpu_mes_update_enforce_isolation(adev); + if (r) + goto failure; + out: /* * Disable KIQ ring usage from the driver once MES is enabled. diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index 901e924e69ad9..0fd0fa6ed5184 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -1743,6 +1743,10 @@ static int mes_v12_0_hw_init(struct amdgpu_ip_block *ip_block) goto failure; } + r = amdgpu_mes_update_enforce_isolation(adev); + if (r) + goto failure; + out: /* * Disable KIQ ring usage from the driver once MES is enabled. 
-- GitLab From 733d675c2a436b416107893db87eb182585c1b39 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Wed, 19 Feb 2025 11:46:19 -0700 Subject: [PATCH 884/989] MAINTAINERS: Change my role from Maintainer to Reviewer Reviewed-by: Harry Wentland Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher (cherry picked from commit 9b3ef540397cfc356f10f504841b2e9d16e31286) Cc: stable@vger.kernel.org --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index d6ea828345fdb..f3fdc43bdd497 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1046,7 +1046,7 @@ F: drivers/crypto/ccp/hsti.* AMD DISPLAY CORE M: Harry Wentland M: Leo Li -M: Rodrigo Siqueira +R: Rodrigo Siqueira L: amd-gfx@lists.freedesktop.org S: Supported T: git https://gitlab.freedesktop.org/agd5f/linux.git -- GitLab From 96989f3dca6f51f202b6dbc92c37e17df6ca12f4 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Wed, 19 Feb 2025 11:46:20 -0700 Subject: [PATCH 885/989] mailmap: Add entry for Rodrigo Siqueira Map all of my previously used email addresses to my @igalia.com address. Acked-by: Harry Wentland Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher (cherry picked from commit 289387d0dbf806bd59063ab93d94f48cd4c75c7c) Cc: stable@vger.kernel.org --- .mailmap | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index a897c16d3baef..ec18b01f0be2a 100644 --- a/.mailmap +++ b/.mailmap @@ -613,6 +613,8 @@ Richard Leitner Richard Leitner Robert Foss Rocky Liao +Rodrigo Siqueira +Rodrigo Siqueira Roman Gushchin Roman Gushchin Roman Gushchin -- GitLab From 12f3b92d1cfa5526715fff93a6d6fe29300d5e2a Mon Sep 17 00:00:00 2001 From: Melissa Wen Date: Sat, 15 Feb 2025 18:15:47 -0300 Subject: [PATCH 886/989] drm/amd/display: restore edid reading from a given i2c adapter When switching to drm_edid, we slightly changed how to get edid by removing the possibility of getting them from dc_link when in aux transaction mode. As MST doesn't initialize the connector with `drm_connector_init_with_ddc()`, restore the original behavior to avoid functional changes. 
v2: - Fix build warning of unchecked dereference (kernel test bot) CC: Alex Hung CC: Mario Limonciello CC: Roman Li CC: Aurabindo Pillai Fixes: 48edb2a4256e ("drm/amd/display: switch amdgpu_dm_connector to use struct drm_edid") Reviewed-by: Alex Hung Signed-off-by: Melissa Wen Signed-off-by: Alex Deucher (cherry picked from commit 81262b1656feb3813e3d917ab78824df6831e69e) --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index ac3fd81fecef2..5ddd21466e22f 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -7240,8 +7240,14 @@ static void amdgpu_dm_connector_funcs_force(struct drm_connector *connector) struct dc_link *dc_link = aconnector->dc_link; struct dc_sink *dc_em_sink = aconnector->dc_em_sink; const struct drm_edid *drm_edid; + struct i2c_adapter *ddc; - drm_edid = drm_edid_read(connector); + if (dc_link && dc_link->aux_mode) + ddc = &aconnector->dm_dp_aux.aux.ddc; + else + ddc = &aconnector->i2c->base; + + drm_edid = drm_edid_read_ddc(connector, ddc); drm_edid_connector_update(connector, drm_edid); if (!drm_edid) { DRM_ERROR("No EDID found on connector: %s.\n", connector->name); @@ -7286,14 +7292,21 @@ static int get_modes(struct drm_connector *connector) static void create_eml_sink(struct amdgpu_dm_connector *aconnector) { struct drm_connector *connector = &aconnector->base; + struct dc_link *dc_link = aconnector->dc_link; struct dc_sink_init_data init_params = { .link = aconnector->dc_link, .sink_signal = SIGNAL_TYPE_VIRTUAL }; const struct drm_edid *drm_edid; const struct edid *edid; + struct i2c_adapter *ddc; + + if (dc_link && dc_link->aux_mode) + ddc = &aconnector->dm_dp_aux.aux.ddc; + else + ddc = &aconnector->i2c->base; - drm_edid = drm_edid_read(connector); + drm_edid = drm_edid_read_ddc(connector, ddc); drm_edid_connector_update(connector, drm_edid); if (!drm_edid) { DRM_ERROR("No EDID found on connector: %s.\n", connector->name); -- GitLab From a04bf34e0829f2c5d5f1ea7317daae2efa560fd1 Mon Sep 17 00:00:00 2001 From: Aurabindo Pillai Date: Fri, 21 Feb 2025 14:19:12 -0500 Subject: [PATCH 887/989] MAINTAINERS: Update AMDGPU DML maintainers info Chaitanya is no longer with AMD, and the responsibility has been taken over by Austin. Signed-off-by: Aurabindo Pillai Acked-by: Alex Deucher Reviewed-by: Harry Wentland Signed-off-by: Alex Deucher (cherry picked from commit a101fa705d016d46463dd4ce488671369c922bc2) Cc: stable@vger.kernel.org --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index f3fdc43bdd497..95bce73f3ef44 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1053,7 +1053,7 @@ T: git https://gitlab.freedesktop.org/agd5f/linux.git F: drivers/gpu/drm/amd/display/ AMD DISPLAY CORE - DML -M: Chaitanya Dhere +M: Austin Zheng M: Jun Lei S: Supported F: drivers/gpu/drm/amd/display/dc/dml/ -- GitLab From e8863f8b0316d8ee1e7e5291e8f2f72c91ac967d Mon Sep 17 00:00:00 2001 From: Tom Chung Date: Thu, 6 Feb 2025 11:31:23 +0800 Subject: [PATCH 888/989] drm/amd/display: Disable PSR-SU on eDP panels [Why] PSR-SU may cause some glitching randomly on several panels. [How] Temporarily disable the PSR-SU and fallback to PSR1 for all eDP panels. 
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3388 Cc: Mario Limonciello Cc: Alex Deucher Reviewed-by: Sun peng Li Signed-off-by: Tom Chung Signed-off-by: Roman Li Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 6deeefb820d0efb0b36753622fb982d03b37b3ad) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c index 45858bf1523d8..e140b7a04d724 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c @@ -54,7 +54,8 @@ static bool link_supports_psrsu(struct dc_link *link) if (amdgpu_dc_debug_mask & DC_DISABLE_PSR_SU) return false; - return dc_dmub_check_min_version(dc->ctx->dmub_srv->dmub); + /* Temporarily disable PSR-SU to avoid glitches */ + return false; } /* -- GitLab From b5f7242e49b927cfe488b369fa552f2eff579ef1 Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Fri, 7 Feb 2025 15:26:19 -0500 Subject: [PATCH 889/989] drm/amd/display: add a quirk to enable eDP0 on DP1 [why] some board designs have eDP0 connected to DP1, need a way to enable support_edp0_on_dp1 flag, otherwise edp related features cannot work [how] do a dmi check during dm initialization to identify systems that require support_edp0_on_dp1. Optimize quirk table with callback functions to set quirk entries, retrieve_dmi_info can set quirks according to quirk entries Cc: Mario Limonciello Reviewed-by: Mario Limonciello Reviewed-by: Nicholas Kazlauskas Signed-off-by: Yilin Chen Signed-off-by: Zaeem Mohamed Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit f6d17270d18a6a6753fff046330483d43f8405e4) Cc: stable@vger.kernel.org --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 69 +++++++++++++++++-- 1 file changed, 62 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 5ddd21466e22f..9d9645a2d18ef 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -1618,75 +1618,130 @@ static bool dm_should_disable_stutter(struct pci_dev *pdev) return false; } -static const struct dmi_system_id hpd_disconnect_quirk_table[] = { +struct amdgpu_dm_quirks { + bool aux_hpd_discon; + bool support_edp0_on_dp1; +}; + +static struct amdgpu_dm_quirks quirk_entries = { + .aux_hpd_discon = false, + .support_edp0_on_dp1 = false +}; + +static int edp0_on_dp1_callback(const struct dmi_system_id *id) +{ + quirk_entries.support_edp0_on_dp1 = true; + return 0; +} + +static int aux_hpd_discon_callback(const struct dmi_system_id *id) +{ + quirk_entries.aux_hpd_discon = true; + return 0; +} + +static const struct dmi_system_id dmi_quirk_table[] = { { + .callback = aux_hpd_discon_callback, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "Precision 3660"), }, }, { + .callback = aux_hpd_discon_callback, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "Precision 3260"), }, }, { + .callback = aux_hpd_discon_callback, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "Precision 3460"), }, }, { + .callback = aux_hpd_discon_callback, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex Tower Plus 7010"), }, }, { + .callback = 
aux_hpd_discon_callback, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex Tower 7010"), }, }, { + .callback = aux_hpd_discon_callback, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex SFF Plus 7010"), }, }, { + .callback = aux_hpd_discon_callback, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex SFF 7010"), }, }, { + .callback = aux_hpd_discon_callback, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex Micro Plus 7010"), }, }, { + .callback = aux_hpd_discon_callback, .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex Micro 7010"), }, }, + { + .callback = edp0_on_dp1_callback, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "HP"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Elite mt645 G8 Mobile Thin Client"), + }, + }, + { + .callback = edp0_on_dp1_callback, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "HP"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP EliteBook 665 16 inch G11 Notebook PC"), + }, + }, {} /* TODO: refactor this from a fixed table to a dynamic option */ }; -static void retrieve_dmi_info(struct amdgpu_display_manager *dm) +static void retrieve_dmi_info(struct amdgpu_display_manager *dm, struct dc_init_data *init_data) { - const struct dmi_system_id *dmi_id; + int dmi_id; + struct drm_device *dev = dm->ddev; dm->aux_hpd_discon_quirk = false; + init_data->flags.support_edp0_on_dp1 = false; + + dmi_id = dmi_check_system(dmi_quirk_table); - dmi_id = dmi_first_match(hpd_disconnect_quirk_table); - if (dmi_id) { + if (!dmi_id) + return; + + if (quirk_entries.aux_hpd_discon) { dm->aux_hpd_discon_quirk = true; - DRM_INFO("aux_hpd_discon_quirk attached\n"); + drm_info(dev, "aux_hpd_discon_quirk attached\n"); + } + if (quirk_entries.support_edp0_on_dp1) { + init_data->flags.support_edp0_on_dp1 = true; + drm_info(dev, "aux_hpd_discon_quirk attached\n"); } } @@ -1994,7 +2049,7 @@ static int amdgpu_dm_init(struct amdgpu_device *adev) if (amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0)) init_data.num_virtual_links = 1; - retrieve_dmi_info(&adev->dm); + retrieve_dmi_info(&adev->dm, &init_data); if (adev->dm.bb_from_dmub) init_data.bb_from_dmub = adev->dm.bb_from_dmub; -- GitLab From 4de141b8b1b7991b607f77e5f4580e1c67c24717 Mon Sep 17 00:00:00 2001 From: Roman Li Date: Wed, 12 Feb 2025 14:49:36 -0500 Subject: [PATCH 890/989] drm/amd/display: Fix HPD after gpu reset [Why] DC is not using amdgpu_irq_get/put to manage the HPD interrupt refcounts. So when amdgpu_irq_gpu_reset_resume_helper() reprograms all of the IRQs, HPD gets disabled. 
[How] Use amdgpu_irq_get/put() for HPD init/fini in DM in order to sync refcounts Cc: Mario Limonciello Cc: Alex Deucher Reviewed-by: Mario Limonciello Reviewed-by: Aurabindo Pillai Signed-off-by: Roman Li Signed-off-by: Zaeem Mohamed Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit f3dde2ff7fcaacd77884502e8f572f2328e9c745) Cc: stable@vger.kernel.org --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c index 3390f0d8420a0..c4a7fd453e5fc 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c @@ -894,6 +894,7 @@ void amdgpu_dm_hpd_init(struct amdgpu_device *adev) struct drm_device *dev = adev_to_drm(adev); struct drm_connector *connector; struct drm_connector_list_iter iter; + int i; drm_connector_list_iter_begin(dev, &iter); drm_for_each_connector_iter(connector, &iter) { @@ -920,6 +921,12 @@ void amdgpu_dm_hpd_init(struct amdgpu_device *adev) } } drm_connector_list_iter_end(&iter); + + /* Update reference counts for HPDs */ + for (i = DC_IRQ_SOURCE_HPD1; i <= adev->mode_info.num_hpd; i++) { + if (amdgpu_irq_get(adev, &adev->hpd_irq, i - DC_IRQ_SOURCE_HPD1)) + drm_err(dev, "DM_IRQ: Failed get HPD for source=%d)!\n", i); + } } /** @@ -935,6 +942,7 @@ void amdgpu_dm_hpd_fini(struct amdgpu_device *adev) struct drm_device *dev = adev_to_drm(adev); struct drm_connector *connector; struct drm_connector_list_iter iter; + int i; drm_connector_list_iter_begin(dev, &iter); drm_for_each_connector_iter(connector, &iter) { @@ -960,4 +968,10 @@ void amdgpu_dm_hpd_fini(struct amdgpu_device *adev) } } drm_connector_list_iter_end(&iter); + + /* Update reference counts for HPDs */ + for (i = DC_IRQ_SOURCE_HPD1; i <= adev->mode_info.num_hpd; i++) { + if (amdgpu_irq_put(adev, &adev->hpd_irq, i - DC_IRQ_SOURCE_HPD1)) + drm_err(dev, "DM_IRQ: Failed put HPD for source=%d!\n", i); + } } -- GitLab From d3c7059b6a8600fc62cd863f1ea203b8675e63e1 Mon Sep 17 00:00:00 2001 From: Pierre-Eric Pelloux-Prayer Date: Thu, 20 Feb 2025 14:41:59 +0100 Subject: [PATCH 891/989] drm/amdgpu: init return value in amdgpu_ttm_clear_buffer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Otherwise an uninitialized value can be returned if amdgpu_res_cleared returns true for all regions. 
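The bug is the classic "result only assigned inside the loop" pattern; a minimal standalone illustration of the same shape, with stand-in names:

    #include <stdio.h>
    #include <stdbool.h>

    static bool region_already_cleared(int i)
    {
            (void)i;
            return true;    /* pretend every region was cleared on allocation */
    }

    static int clear_regions(int nregions)
    {
            int r = 0;      /* the fix: without "= 0" this is indeterminate below */

            for (int i = 0; i < nregions; i++) {
                    if (region_already_cleared(i))
                            continue;       /* skip, nothing ever written to r */
                    r = -1;                 /* stand-in for a fill/copy that may fail */
                    if (r)
                            break;
            }
            return r;       /* previously garbage when every region was skipped */
    }

    int main(void)
    {
            printf("clear_regions() = %d\n", clear_regions(4));
            return 0;
    }
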
Possibly closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3812 Fixes: a68c7eaa7a8f ("drm/amdgpu: Enable clear page functionality") Signed-off-by: Pierre-Eric Pelloux-Prayer Acked-by: Alex Deucher Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit 7c62aacc3b452f73a1284198c81551035fac6d71) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 01ae2f88dec8c..262bd010a283d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -2281,7 +2281,7 @@ int amdgpu_ttm_clear_buffer(struct amdgpu_bo *bo, struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring; struct amdgpu_res_cursor cursor; u64 addr; - int r; + int r = 0; if (!adev->mman.buffer_funcs_enabled) return -EINVAL; -- GitLab From 68f3ea7ee199ef77551e090dfef5a49046ea8443 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 21 Feb 2025 14:57:06 +0100 Subject: [PATCH 892/989] vmlinux.lds: Ensure that const vars with relocations are mapped R/O In the kernel, there are architectures (x86, arm64) that perform boot-time relocation (for KASLR) without relying on PIE codegen. In this case, all const global objects are emitted into .rodata, including const objects with fields that will be fixed up by the boot-time relocation code. This implies that .rodata (and .text in some cases) need to be writable at boot, but they will usually be mapped read-only as soon as the boot completes. When using PIE codegen, the compiler will emit const global objects into .data.rel.ro rather than .rodata if the object contains fields that need such fixups at boot-time. This permits the linker to annotate such regions as requiring read-write access only at load time, but not at execution time (in user space), while keeping .rodata truly const (in user space, this is important for reducing the CoW footprint of dynamic executables). This distinction does not matter for the kernel, but it does imply that const data will end up in writable memory if the .data.rel.ro sections are not treated in a special way, as they will end up in the writable .data segment by default. So emit .data.rel.ro into the .rodata segment. Cc: stable@vger.kernel.org Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250221135704.431269-5-ardb+git@google.com Signed-off-by: Josh Poimboeuf --- include/asm-generic/vmlinux.lds.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 54504013c7491..337d3336e1756 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -457,7 +457,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) . = ALIGN((align)); \ .rodata : AT(ADDR(.rodata) - LOAD_OFFSET) { \ __start_rodata = .; \ - *(.rodata) *(.rodata.*) \ + *(.rodata) *(.rodata.*) *(.data.rel.ro*) \ SCHED_DATA \ RO_AFTER_INIT_DATA /* Read only after init */ \ . 
= ALIGN(8); \ -- GitLab From 73cfc53cc3b6380eccf013049574485f64cb83ca Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 21 Feb 2025 14:57:07 +0100 Subject: [PATCH 893/989] objtool: Fix C jump table annotations for Clang A C jump table (such as the one used by the BPF interpreter) is a const global array of absolute code addresses, and this means that the actual values in the table may not be known until the kernel is booted (e.g., when using KASLR or when the kernel VA space is sized dynamically). When using PIE codegen, the compiler will default to placing such const global objects in .data.rel.ro (which is annotated as writable), rather than .rodata (which is annotated as read-only). As C jump tables are explicitly emitted into .rodata, this used to result in warnings for LoongArch builds (which uses PIE codegen for the entire kernel) like Warning: setting incorrect section attributes for .rodata..c_jump_table due to the fact that the explicitly specified .rodata section inherited the read-write annotation that the compiler uses for such objects when using PIE codegen. This warning was suppressed by explicitly adding the read-only annotation to the __attribute__((section(""))) string, by commit c5b1184decc8 ("compiler.h: specify correct attribute for .rodata..c_jump_table") Unfortunately, this hack does not work on Clang's integrated assembler, which happily interprets the appended section type and permission specifiers as part of the section name, which therefore no longer matches the hard-coded pattern '.rodata..c_jump_table' that objtool expects, causing it to emit a warning kernel/bpf/core.o: warning: objtool: ___bpf_prog_run+0x20: sibling call from callable instruction with modified stack frame Work around this, by emitting C jump tables into .data.rel.ro instead, which is treated as .rodata by the linker script for all builds, not just PIE based ones. Fixes: c5b1184decc8 ("compiler.h: specify correct attribute for .rodata..c_jump_table") Tested-by: Tiezhu Yang # on LoongArch Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250221135704.431269-6-ardb+git@google.com Signed-off-by: Josh Poimboeuf --- include/linux/compiler.h | 2 +- tools/objtool/check.c | 7 ++++--- tools/objtool/include/objtool/special.h | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index b087de2f3e94b..0c25f3e429bba 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -110,7 +110,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, /* Unreachable code */ #ifdef CONFIG_OBJTOOL /* Annotate a C jump table to allow objtool to follow the code flow */ -#define __annotate_jump_table __section(".rodata..c_jump_table,\"a\",@progbits #") +#define __annotate_jump_table __section(".data.rel.ro.c_jump_table") #else /* !CONFIG_OBJTOOL */ #define __annotate_jump_table #endif /* CONFIG_OBJTOOL */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 497cb8dfb3eb3..1b5a1b3ea7a9f 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2471,13 +2471,14 @@ static void mark_rodata(struct objtool_file *file) * * - .rodata: can contain GCC switch tables * - .rodata.: same, if -fdata-sections is being used - * - .rodata..c_jump_table: contains C annotated jump tables + * - .data.rel.ro.c_jump_table: contains C annotated jump tables * * .rodata.str1.* sections are ignored; they don't contain jump tables. 
*/ for_each_sec(file, sec) { - if (!strncmp(sec->name, ".rodata", 7) && - !strstr(sec->name, ".str1.")) { + if ((!strncmp(sec->name, ".rodata", 7) && + !strstr(sec->name, ".str1.")) || + !strncmp(sec->name, ".data.rel.ro", 12)) { sec->rodata = true; found = true; } diff --git a/tools/objtool/include/objtool/special.h b/tools/objtool/include/objtool/special.h index e7ee7ffccefd4..e049679bb17b2 100644 --- a/tools/objtool/include/objtool/special.h +++ b/tools/objtool/include/objtool/special.h @@ -10,7 +10,7 @@ #include #include -#define C_JUMP_TABLE_SECTION ".rodata..c_jump_table" +#define C_JUMP_TABLE_SECTION ".data.rel.ro.c_jump_table" struct special_alt { struct list_head list; -- GitLab From b4ae43b053537ec28f430c0ddb9b916ab296dbe5 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Tue, 18 Feb 2025 14:42:30 +0800 Subject: [PATCH 894/989] objtool: Add bch2_trans_unlocked_or_in_restart_error() to bcachefs noreturns Fix the following objtool warning during build time: fs/bcachefs/btree_cache.o: warning: objtool: btree_node_lock.constprop.0() falls through to next function bch2_recalc_btree_reserve() fs/bcachefs/btree_update.o: warning: objtool: bch2_trans_update_get_key_cache() falls through to next function need_whiteout_for_snapshot() bch2_trans_unlocked_or_in_restart_error() is an Obviously Correct (tm) panic() wrapper, add it to the list of known noreturns. Fixes: b318882022a8 ("bcachefs: bch2_trans_verify_not_unlocked_or_in_restart()") Reported-by: k2ci Signed-off-by: Youling Tang Reviewed-by: Kent Overstreet Link: https://lore.kernel.org/r/20250218064230.219997-1-youling.tang@linux.dev Signed-off-by: Josh Poimboeuf --- tools/objtool/noreturns.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h index b2174894f9f71..6bb7edda3094d 100644 --- a/tools/objtool/noreturns.h +++ b/tools/objtool/noreturns.h @@ -19,7 +19,7 @@ NORETURN(__x64_sys_exit_group) NORETURN(arch_cpu_idle_dead) NORETURN(bch2_trans_in_restart_error) NORETURN(bch2_trans_restart_error) -NORETURN(bch2_trans_unlocked_error) +NORETURN(bch2_trans_unlocked_or_in_restart_error) NORETURN(cpu_bringup_and_idle) NORETURN(cpu_startup_entry) NORETURN(do_exit) -- GitLab From 8fef0a3b17bb258130a4fcbcb5addf94b25e9ec5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Feb 2025 06:02:23 -1000 Subject: [PATCH 895/989] sched_ext: Fix pick_task_scx() picking non-queued tasks when it's called without balance() a6250aa251ea ("sched_ext: Handle cases where pick_task_scx() is called without preceding balance_scx()") added a workaround to handle the cases where pick_task_scx() is called without prececing balance_scx() which is due to a fair class bug where pick_taks_fair() may return NULL after a true return from balance_fair(). The workaround detects when pick_task_scx() is called without preceding balance_scx() and emulates SCX_RQ_BAL_KEEP and triggers kicking to avoid stalling. Unfortunately, the workaround code was testing whether @prev was on SCX to decide whether to keep the task running. This is incorrect as the task may be on SCX but no longer runnable. This could lead to a non-runnable task to be returned from pick_task_scx() which cause interesting confusions and failures. e.g. A common failure mode is the task ending up with (!on_rq && on_cpu) state which can cause potential wakers to busy loop, which can easily lead to deadlocks. Fix it by testing whether @prev has SCX_TASK_QUEUED set. This makes @prev_on_scx only used in one place. 
Open code the usage and improve the comment while at it. Signed-off-by: Tejun Heo Reported-by: Pat Cody Fixes: a6250aa251ea ("sched_ext: Handle cases where pick_task_scx() is called without preceding balance_scx()") Cc: stable@vger.kernel.org # v6.12+ Acked-by: Andrea Righi --- kernel/sched/ext.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 5a81d9a1e31f2..0f1da199cfc7c 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3117,7 +3117,6 @@ static struct task_struct *pick_task_scx(struct rq *rq) { struct task_struct *prev = rq->curr; struct task_struct *p; - bool prev_on_scx = prev->sched_class == &ext_sched_class; bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; bool kick_idle = false; @@ -3137,14 +3136,18 @@ static struct task_struct *pick_task_scx(struct rq *rq) * if pick_task_scx() is called without preceding balance_scx(). */ if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) { - if (prev_on_scx) { + if (prev->scx.flags & SCX_TASK_QUEUED) { keep_prev = true; } else { keep_prev = false; kick_idle = true; } - } else if (unlikely(keep_prev && !prev_on_scx)) { - /* only allowed during transitions */ + } else if (unlikely(keep_prev && + prev->sched_class != &ext_sched_class)) { + /* + * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is + * conditional on scx_enabled() and may have been skipped. + */ WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED); keep_prev = false; } -- GitLab From a26b24b2e21f6222635a95426b9ef9eec63d69b1 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 17 Jan 2025 07:19:13 -0800 Subject: [PATCH 896/989] perf/x86/intel: Use better start period for frequency mode Freqency mode is the current default mode of Linux perf. A period of 1 is used as a starting period. The period is auto-adjusted on each tick or an overflow, to meet the frequency target. The start period of 1 is too low and may trigger some issues: - Many HWs do not support period 1 well. https://lore.kernel.org/lkml/875xs2oh69.ffs@tglx/ - For an event that occurs frequently, period 1 is too far away from the real period. Lots of samples are generated at the beginning. The distribution of samples may not be even. - A low starting period for frequently occurring events also challenges virtualization, which has a longer path to handle a PMI. The limit_period value only checks the minimum acceptable value for HW. It cannot be used to set the start period, because some events may need a very low period. The limit_period cannot be set too high. It doesn't help with the events that occur frequently. It's hard to find a universal starting period for all events. The idea implemented by this patch is to only give an estimate for the popular HW and HW cache events. For the rest of the events, start from the lowest possible recommended value. 
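To make the estimate concrete, a worked example for the default 4000 freq,
using the factors introduced in the patch below (illustration only):

        cycles/instructions:   500,000,000 / 4000 - 1 = 124,999
        branch events:         125,000,000 / 4000 - 1 =  31,249
        cache events:           25,000,000 / 4000 - 1 =   6,249
        all other events:     (64 * 4000) / 4000 - 1  =      63

So the popular events no longer start from a period of 1, while the rest
still start from a low value.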
Signed-off-by: Kan Liang Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250117151913.3043942-3-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 85 ++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index cdcebf30468a0..cdb19e3ba3aa3 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3952,6 +3952,85 @@ static inline bool intel_pmu_has_cap(struct perf_event *event, int idx) return test_bit(idx, (unsigned long *)&intel_cap->capabilities); } +static u64 intel_pmu_freq_start_period(struct perf_event *event) +{ + int type = event->attr.type; + u64 config, factor; + s64 start; + + /* + * The 127 is the lowest possible recommended SAV (sample after value) + * for a 4000 freq (default freq), according to the event list JSON file. + * Also, assume the workload is idle 50% time. + */ + factor = 64 * 4000; + if (type != PERF_TYPE_HARDWARE && type != PERF_TYPE_HW_CACHE) + goto end; + + /* + * The estimation of the start period in the freq mode is + * based on the below assumption. + * + * For a cycles or an instructions event, 1GHZ of the + * underlying platform, 1 IPC. The workload is idle 50% time. + * The start period = 1,000,000,000 * 1 / freq / 2. + * = 500,000,000 / freq + * + * Usually, the branch-related events occur less than the + * instructions event. According to the Intel event list JSON + * file, the SAV (sample after value) of a branch-related event + * is usually 1/4 of an instruction event. + * The start period of branch-related events = 125,000,000 / freq. + * + * The cache-related events occurs even less. The SAV is usually + * 1/20 of an instruction event. + * The start period of cache-related events = 25,000,000 / freq. + */ + config = event->attr.config & PERF_HW_EVENT_MASK; + if (type == PERF_TYPE_HARDWARE) { + switch (config) { + case PERF_COUNT_HW_CPU_CYCLES: + case PERF_COUNT_HW_INSTRUCTIONS: + case PERF_COUNT_HW_BUS_CYCLES: + case PERF_COUNT_HW_STALLED_CYCLES_FRONTEND: + case PERF_COUNT_HW_STALLED_CYCLES_BACKEND: + case PERF_COUNT_HW_REF_CPU_CYCLES: + factor = 500000000; + break; + case PERF_COUNT_HW_BRANCH_INSTRUCTIONS: + case PERF_COUNT_HW_BRANCH_MISSES: + factor = 125000000; + break; + case PERF_COUNT_HW_CACHE_REFERENCES: + case PERF_COUNT_HW_CACHE_MISSES: + factor = 25000000; + break; + default: + goto end; + } + } + + if (type == PERF_TYPE_HW_CACHE) + factor = 25000000; +end: + /* + * Usually, a prime or a number with less factors (close to prime) + * is chosen as an SAV, which makes it less likely that the sampling + * period synchronizes with some periodic event in the workload. + * Minus 1 to make it at least avoiding values near power of twos + * for the default freq. 
+ */ + start = DIV_ROUND_UP_ULL(factor, event->attr.sample_freq) - 1; + + if (start > x86_pmu.max_period) + start = x86_pmu.max_period; + + if (x86_pmu.limit_period) + x86_pmu.limit_period(event, &start); + + return start; +} + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -3963,6 +4042,12 @@ static int intel_pmu_hw_config(struct perf_event *event) if (ret) return ret; + if (event->attr.freq && event->attr.sample_freq) { + event->hw.sample_period = intel_pmu_freq_start_period(event); + event->hw.last_period = event->hw.sample_period; + local64_set(&event->hw.period_left, event->hw.sample_period); + } + if (event->attr.precise_ip) { if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_FIXED_VLBR_EVENT) return -EINVAL; -- GitLab From 1f7a4f98c11fbeb18ed21f3b3a497e90a50ad2e0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 25 Feb 2025 15:52:21 +0100 Subject: [PATCH 897/989] sunrpc: suppress warnings for unused procfs functions There is a warning about unused variables when building with W=1 and no procfs: net/sunrpc/cache.c:1660:30: error: 'cache_flush_proc_ops' defined but not used [-Werror=unused-const-variable=] 1660 | static const struct proc_ops cache_flush_proc_ops = { | ^~~~~~~~~~~~~~~~~~~~ net/sunrpc/cache.c:1622:30: error: 'content_proc_ops' defined but not used [-Werror=unused-const-variable=] 1622 | static const struct proc_ops content_proc_ops = { | ^~~~~~~~~~~~~~~~ net/sunrpc/cache.c:1598:30: error: 'cache_channel_proc_ops' defined but not used [-Werror=unused-const-variable=] 1598 | static const struct proc_ops cache_channel_proc_ops = { | ^~~~~~~~~~~~~~~~~~~~~~ These are used inside of an #ifdef, so replacing that with an IS_ENABLED() check lets the compiler see how they are used while still dropping them during dead code elimination. Fixes: dbf847ecb631 ("knfsd: allow cache_register to return error on failure") Reviewed-by: Jeff Layton Acked-by: Chuck Lever Signed-off-by: Arnd Bergmann Signed-off-by: Anna Schumaker --- net/sunrpc/cache.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index cb279eb9ac4ba..7ce5e28a6c031 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1674,12 +1674,14 @@ static void remove_cache_proc_entries(struct cache_detail *cd) } } -#ifdef CONFIG_PROC_FS static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) { struct proc_dir_entry *p; struct sunrpc_net *sn; + if (!IS_ENABLED(CONFIG_PROC_FS)) + return 0; + sn = net_generic(net, sunrpc_net_id); cd->procfs = proc_mkdir(cd->name, sn->proc_net_rpc); if (cd->procfs == NULL) @@ -1707,12 +1709,6 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) remove_cache_proc_entries(cd); return -ENOMEM; } -#else /* CONFIG_PROC_FS */ -static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) -{ - return 0; -} -#endif void __init cache_initialize(void) { -- GitLab From 9084ed79ddaaaa1ec01cd304af9fb532c26252db Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Thu, 20 Feb 2025 14:29:36 -0500 Subject: [PATCH 898/989] lsm,nfs: fix memory leak of lsm_context commit b530104f50e8 ("lsm: lsm_context in security_dentry_init_security") did not preserve the lsm id for subsequent release calls, which results in a memory leak. Fix it by saving the lsm id in the nfs4_label and providing it on the subsequent release call. 
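The intended pairing, sketched from the hunks below (illustration only):

        /* init: remember which LSM allocated the context */
        label->lsmid = shim.id;
        label->label = shim.context;
        label->len   = shim.len;

        /* release: hand the context back to that same LSM */
        shim.context = label->label;
        shim.len     = label->len;
        shim.id      = label->lsmid;    /* was LSM_ID_UNDEF before this fix */
        security_release_secctx(&shim);

The only functional change is carrying shim.id across the two calls.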
Fixes: b530104f50e8 ("lsm: lsm_context in security_dentry_init_security") Signed-off-by: Stephen Smalley Acked-by: Paul Moore Acked-by: Casey Schaufler Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 7 ++++--- include/linux/nfs4.h | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c25ecdb76d304..6e95db6c17e92 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -133,6 +133,7 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry, if (err) return NULL; + label->lsmid = shim.id; label->label = shim.context; label->len = shim.len; return label; @@ -145,7 +146,7 @@ nfs4_label_release_security(struct nfs4_label *label) if (label) { shim.context = label->label; shim.len = label->len; - shim.id = LSM_ID_UNDEF; + shim.id = label->lsmid; security_release_secctx(&shim); } } @@ -6272,7 +6273,7 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf, size_t buflen) { struct nfs_server *server = NFS_SERVER(inode); - struct nfs4_label label = {0, 0, buflen, buf}; + struct nfs4_label label = {0, 0, 0, buflen, buf}; u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; struct nfs_fattr fattr = { @@ -6377,7 +6378,7 @@ static int nfs4_do_set_security_label(struct inode *inode, static int nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen) { - struct nfs4_label ilabel = {0, 0, buflen, (char *)buf }; + struct nfs4_label ilabel = {0, 0, 0, buflen, (char *)buf }; struct nfs_fattr *fattr; int status; diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 71fbebfa43c7e..9ac83ca883266 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -47,6 +47,7 @@ struct nfs4_acl { struct nfs4_label { uint32_t lfs; uint32_t pi; + u32 lsmid; u32 len; char *label; }; -- GitLab From 96f41f644c4885761b0d117fc36dc5dcf92e15ec Mon Sep 17 00:00:00 2001 From: Dmytro Maluka Date: Sun, 5 Jan 2025 17:27:40 +0000 Subject: [PATCH 899/989] x86/of: Don't use DTB for SMP setup if ACPI is enabled There are cases when it is useful to use both ACPI and DTB provided by the bootloader, however in such cases we should make sure to prevent conflicts between the two. Namely, don't try to use DTB for SMP setup if ACPI is enabled. Precisely, this prevents at least: - incorrectly calling register_lapic_address(APIC_DEFAULT_PHYS_BASE) after the LAPIC was already successfully enumerated via ACPI, causing noisy kernel warnings and probably potential real issues as well - failed IOAPIC setup in the case when IOAPIC is enumerated via mptable instead of ACPI (e.g. with acpi=noirq), due to mpparse_parse_smp_config() overridden by x86_dtb_parse_smp_config() Signed-off-by: Dmytro Maluka Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250105172741.3476758-2-dmaluka@chromium.org --- arch/x86/kernel/devicetree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 59d23cdf4ed0f..dd8748c45529a 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -2,6 +2,7 @@ /* * Architecture specific OF callbacks. 
*/ +#include #include #include #include @@ -313,6 +314,6 @@ void __init x86_flattree_get_config(void) if (initial_dtb) early_memunmap(dt, map_len); #endif - if (of_have_populated_dt()) + if (acpi_disabled && of_have_populated_dt()) x86_init.mpparse.parse_smp_cfg = x86_dtb_parse_smp_config; } -- GitLab From bebe35bb738b573c32a5033499cd59f20293f2a3 Mon Sep 17 00:00:00 2001 From: Russell Senior Date: Tue, 25 Feb 2025 22:31:20 +0100 Subject: [PATCH 900/989] x86/CPU: Fix warm boot hang regression on AMD SC1100 SoC systems I still have some Soekris net4826 in a Community Wireless Network I volunteer with. These devices use an AMD SC1100 SoC. I am running OpenWrt on them, which uses a patched kernel, that naturally has evolved over time. I haven't updated the ones in the field in a number of years (circa 2017), but have one in a test bed, where I have intermittently tried out test builds. A few years ago, I noticed some trouble, particularly when "warm booting", that is, doing a reboot without removing power, and noticed the device was hanging after the kernel message: [ 0.081615] Working around Cyrix MediaGX virtual DMA bugs. If I removed power and then restarted, it would boot fine, continuing through the message above, thusly: [ 0.081615] Working around Cyrix MediaGX virtual DMA bugs. [ 0.090076] Enable Memory-Write-back mode on Cyrix/NSC processor. [ 0.100000] Enable Memory access reorder on Cyrix/NSC processor. [ 0.100070] Last level iTLB entries: 4KB 0, 2MB 0, 4MB 0 [ 0.110058] Last level dTLB entries: 4KB 0, 2MB 0, 4MB 0, 1GB 0 [ 0.120037] CPU: NSC Geode(TM) Integrated Processor by National Semi (family: 0x5, model: 0x9, stepping: 0x1) [...] In order to continue using modern tools, like ssh, to interact with the software on these old devices, I need modern builds of the OpenWrt firmware on the devices. I confirmed that the warm boot hang was still an issue in modern OpenWrt builds (currently using a patched linux v6.6.65). Last night, I decided it was time to get to the bottom of the warm boot hang, and began bisecting. From preserved builds, I narrowed down the bisection window from late February to late May 2019. During this period, the OpenWrt builds were using 4.14.x. I was able to build using period-correct Ubuntu 18.04.6. After a number of bisection iterations, I identified a kernel bump from 4.14.112 to 4.14.113 as the commit that introduced the warm boot hang. https://github.com/openwrt/openwrt/commit/07aaa7e3d62ad32767d7067107db64b6ade81537 Looking at the upstream changes in the stable kernel between 4.14.112 and 4.14.113 (tig v4.14.112..v4.14.113), I spotted a likely suspect: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=20afb90f730982882e65b01fb8bdfe83914339c5 So, I tried reverting just that kernel change on top of the breaking OpenWrt commit, and my warm boot hang went away. Presumably, the warm boot hang is due to some register not getting cleared in the same way that a loss of power does. That is approximately as much as I understand about the problem. More poking/prodding and coaching from Jonas Gorski, it looks like this test patch fixes the problem on my board: Tested against v6.6.67 and v4.14.113. 
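For reference, the CCR2 bits involved, using the names from the driver's
own comments (illustration only):

        0x88 = 0x80 | 0x08      /* enable #SUSP pin + suspend on halt */
        0x08                    /* suspend on halt only */

so the fix stops setting the #SUSP pin enable bit (0x80) on these boards,
which appears to be what triggered the warm boot hang.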
Fixes: 18fb053f9b82 ("x86/cpu/cyrix: Use correct macros for Cyrix calls on Geode processors") Debugged-by: Jonas Gorski Signed-off-by: Russell Senior Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/CAHP3WfOgs3Ms4Z+L9i0-iBOE21sdMk5erAiJurPjnrL9LSsgRA@mail.gmail.com Cc: Matthew Whitehead Cc: Thomas Gleixner --- arch/x86/kernel/cpu/cyrix.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 9651275aecd1b..dfec2c61e3547 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -153,8 +153,8 @@ static void geode_configure(void) u8 ccr3; local_irq_save(flags); - /* Suspend on halt power saving and enable #SUSP pin */ - setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); + /* Suspend on halt power saving */ + setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x08); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ -- GitLab From 9de7695925d5d2d2085681ba935857246eb2817d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 25 Feb 2025 22:32:33 +0100 Subject: [PATCH 901/989] x86/irq: Define trace events conditionally When both of X86_LOCAL_APIC and X86_THERMAL_VECTOR are disabled, the irq tracing produces a W=1 build warning for the tracing definitions: In file included from include/trace/trace_events.h:27, from include/trace/define_trace.h:113, from arch/x86/include/asm/trace/irq_vectors.h:383, from arch/x86/kernel/irq.c:29: include/trace/stages/init.h:2:23: error: 'str__irq_vectors__trace_system_name' defined but not used [-Werror=unused-const-variable=] Make the tracepoints conditional on the same symbosl that guard their usage. Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250225213236.3141752-1-arnd@kernel.org --- arch/x86/kernel/irq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 385e3a5fc3045..feca4f20b06aa 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -25,8 +25,10 @@ #include #include +#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_THERMAL_VECTOR) #define CREATE_TRACE_POINTS #include +#endif DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); EXPORT_PER_CPU_SYMBOL(irq_stat); -- GitLab From 68a9b0e313302451468c0b0eda53c383fa51a8f4 Mon Sep 17 00:00:00 2001 From: Aaron Ma Date: Tue, 24 Dec 2024 22:55:16 +0800 Subject: [PATCH 902/989] perf/x86/rapl: Add support for Intel Arrow Lake U Add Arrow Lake U model for RAPL: $ ls -1 /sys/devices/power/events/ energy-cores energy-cores.scale energy-cores.unit energy-gpu energy-gpu.scale energy-gpu.unit energy-pkg energy-pkg.scale energy-pkg.unit energy-psys energy-psys.scale energy-psys.unit The same output as ArrowLake: $ perf stat -a -I 1000 --per-socket -e power/energy-pkg/ Signed-off-by: Aaron Ma Signed-off-by: Ingo Molnar Acked-by: Zhang Rui Link: https://lore.kernel.org/r/20241224145516.349028-1-aaron.ma@canonical.com --- arch/x86/events/rapl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 4952faf03e82d..6941f4811bec1 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -879,6 +879,7 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_VFM(INTEL_METEORLAKE_L, &model_skl), X86_MATCH_VFM(INTEL_ARROWLAKE_H, &model_skl), X86_MATCH_VFM(INTEL_ARROWLAKE, &model_skl), + X86_MATCH_VFM(INTEL_ARROWLAKE_U, &model_skl), X86_MATCH_VFM(INTEL_LUNARLAKE_M, &model_skl), {}, }; -- GitLab From 
0f6750b15ffdf274668b12824b09bd49ea854e18 Mon Sep 17 00:00:00 2001 From: Daniel Sneddon Date: Thu, 19 Dec 2024 08:52:27 -0700 Subject: [PATCH 903/989] x86/entry: Fix kernel-doc warning The do_int80_emulation() function is missing a kernel-doc formatted description of its argument. This is causing a warning when building with W=1. Add a brief description of the argument to satisfy kernel-doc. Reported-by: kernel test robot Signed-off-by: Daniel Sneddon Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20241219155227.685692-1-daniel.sneddon@linux.intel.com Closes: https://lore.kernel.org/oe-kbuild-all/202412131236.a5HhOqXo-lkp@intel.com/ --- arch/x86/entry/common.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 94941c5a10ac1..14db5b85114c1 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -190,6 +190,7 @@ static __always_inline bool int80_is_external(void) /** * do_int80_emulation - 32-bit legacy syscall C entry from asm + * @regs: syscall arguments in struct pt_args on the stack. * * This entry point can be used by 32-bit and 64-bit programs to perform * 32-bit system calls. Instances of INT $0x80 can be found inline in -- GitLab From f8c857238a392f21d5726d07966f6061007c8d4f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 25 Feb 2025 14:32:14 -0800 Subject: [PATCH 904/989] uprobes: Remove too strict lockdep_assert() condition in hprobe_expire() hprobe_expire() is used to atomically switch pending uretprobe instance (struct return_instance) from being SRCU protected to be refcounted. This can be done from background timer thread, or synchronously within current thread when task is forked. In the former case, return_instance has to be protected through RCU read lock, and that's what hprobe_expire() used to check with lockdep_assert(rcu_read_lock_held()). But in the latter case (hprobe_expire() called from dup_utask()) there is no RCU lock being held, and it's both unnecessary and incovenient. Inconvenient due to the intervening memory allocations inside dup_return_instance()'s loop. Unnecessary because dup_utask() is called synchronously in current thread, and no uretprobe can run at that point, so return_instance can't be freed either. So drop rcu_read_lock_held() condition, and expand corresponding comment to explain necessary lifetime guarantees. lockdep_assert()-detected issue is a false positive. Fixes: dd1a7567784e ("uprobes: SRCU-protect uretprobe lifetime (with timeout)") Reported-by: Breno Leitao Signed-off-by: Andrii Nakryiko Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250225223214.2970740-1-andrii@kernel.org --- kernel/events/uprobes.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index af53fbd2d12c4..b4ca8898fe178 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -767,10 +767,14 @@ static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get) enum hprobe_state hstate; /* - * return_instance's hprobe is protected by RCU. - * Underlying uprobe is itself protected from reuse by SRCU. + * Caller should guarantee that return_instance is not going to be + * freed from under us. This can be achieved either through holding + * rcu_read_lock() or by owning return_instance in the first place. + * + * Underlying uprobe is itself protected from reuse by SRCU, so ensure + * SRCU lock is held properly. 
*/ - lockdep_assert(rcu_read_lock_held() && srcu_read_lock_held(&uretprobes_srcu)); + lockdep_assert(srcu_read_lock_held(&uretprobes_srcu)); hstate = READ_ONCE(hprobe->state); switch (hstate) { -- GitLab From 66cb85c441cd9c44b193ff75b4d0358fccdc6b9c Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 25 Feb 2025 22:25:00 +0000 Subject: [PATCH 905/989] cifs: Fix the smb1 readv callback to correctly call netfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix cifs_readv_callback() to call netfs_read_subreq_terminated() rather than queuing the subrequest work item (which is unset). Also call the I/O progress tracepoint. cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Fixes: e2d46f2ec332 ("netfs: Change the read result collector to only use one work item") Reported-by: Jean-Christophe Guillain Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219793 Tested-by: Jean-Christophe Guillain Tested-by: Pali Rohár Reviewed-by: Paulo Alcantara (Red Hat) Signed-off-by: David Howells Signed-off-by: Steve French --- fs/smb/client/cifssmb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 3feaa0f681699..d07682020c645 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1338,7 +1338,8 @@ cifs_readv_callback(struct mid_q_entry *mid) rdata->credits.value = 0; rdata->subreq.error = rdata->result; rdata->subreq.transferred += rdata->got_bytes; - queue_work(cifsiod_wq, &rdata->subreq.work); + trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_progress); + netfs_read_subreq_terminated(&rdata->subreq); release_mid(mid); add_credits(server, &credits, 0); } -- GitLab From bab3a6e9ffd600f9db0ebaf8f45e1c6111cf314c Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Mon, 24 Feb 2025 06:17:16 +0100 Subject: [PATCH 906/989] net: ethernet: ti: am65-cpsw: select PAGE_POOL am65-cpsw uses page_pool_dev_alloc_pages(), thus needs PAGE_POOL selected to avoid linker errors. This is missing since the driver started to use page_pool helpers in 8acacc40f733 ("net: ethernet: ti: am65-cpsw: Add minimal XDP support") Fixes: 8acacc40f733 ("net: ethernet: ti: am65-cpsw: Add minimal XDP support") Signed-off-by: Sascha Hauer Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20250224-net-am654-nuss-kconfig-v2-1-c124f4915c92@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/ti/Kconfig b/drivers/net/ethernet/ti/Kconfig index 0d5a862cd78a6..3a13d60a947a8 100644 --- a/drivers/net/ethernet/ti/Kconfig +++ b/drivers/net/ethernet/ti/Kconfig @@ -99,6 +99,7 @@ config TI_K3_AM65_CPSW_NUSS select NET_DEVLINK select TI_DAVINCI_MDIO select PHYLINK + select PAGE_POOL select TI_K3_CPPI_DESC_POOL imply PHY_TI_GMII_SEL depends on TI_K3_AM65_CPTS || !TI_K3_AM65_CPTS -- GitLab From 18912c520674ec4d920fe3826e7e4fefeecdf5ae Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 24 Feb 2025 09:44:01 -0800 Subject: [PATCH 907/989] tcp: devmem: don't write truncated dmabuf CMSGs to userspace Currently, we report -ETOOSMALL (err) only on the first iteration (!sent). When we get put_cmsg error after a bunch of successful put_cmsg calls, we don't signal the error at all. This might be confusing on the userspace side which will see truncated CMSGs but no MSG_CTRUNC signal. 
Consider the following case: - sizeof(struct cmsghdr) = 16 - sizeof(struct dmabuf_cmsg) = 24 - total cmsg size (CMSG_LEN) = 40 (16+24) When calling recvmsg with msg_controllen=60, the userspace will receive two(!) dmabuf_cmsg(s), the first one will be a valid one and the second one will be silently truncated. There is no easy way to discover the truncation besides doing something like "cm->cmsg_len != CMSG_LEN(sizeof(dmabuf_cmsg))". Introduce new put_devmem_cmsg wrapper that reports an error instead of doing the truncation. Mina suggests that it's the intended way this API should work. Note that we might now report MSG_CTRUNC when the users (incorrectly) call us with msg_control == NULL. Fixes: 8f0b3cc9a4c1 ("tcp: RX path for devmem TCP") Reviewed-by: Mina Almasry Signed-off-by: Stanislav Fomichev Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250224174401.3582695-1-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 2 ++ net/core/scm.c | 10 ++++++++++ net/ipv4/tcp.c | 26 ++++++++++---------------- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/include/linux/socket.h b/include/linux/socket.h index d18cc47e89bd0..c3322eb3d6865 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -392,6 +392,8 @@ struct ucred { extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr); extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); +extern int put_cmsg_notrunc(struct msghdr *msg, int level, int type, int len, + void *data); struct timespec64; struct __kernel_timespec; diff --git a/net/core/scm.c b/net/core/scm.c index 4f6a14babe5ae..733c0cbd393d2 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -282,6 +282,16 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) } EXPORT_SYMBOL(put_cmsg); +int put_cmsg_notrunc(struct msghdr *msg, int level, int type, int len, + void *data) +{ + /* Don't produce truncated CMSGs */ + if (!msg->msg_control || msg->msg_controllen < CMSG_LEN(len)) + return -ETOOSMALL; + + return put_cmsg(msg, level, type, len, data); +} + void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss_internal) { struct scm_timestamping64 tss; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0d704bda6c416..d74281eca14f0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2438,14 +2438,12 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, */ memset(&dmabuf_cmsg, 0, sizeof(dmabuf_cmsg)); dmabuf_cmsg.frag_size = copy; - err = put_cmsg(msg, SOL_SOCKET, SO_DEVMEM_LINEAR, - sizeof(dmabuf_cmsg), &dmabuf_cmsg); - if (err || msg->msg_flags & MSG_CTRUNC) { - msg->msg_flags &= ~MSG_CTRUNC; - if (!err) - err = -ETOOSMALL; + err = put_cmsg_notrunc(msg, SOL_SOCKET, + SO_DEVMEM_LINEAR, + sizeof(dmabuf_cmsg), + &dmabuf_cmsg); + if (err) goto out; - } sent += copy; @@ -2499,16 +2497,12 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, offset += copy; remaining_len -= copy; - err = put_cmsg(msg, SOL_SOCKET, - SO_DEVMEM_DMABUF, - sizeof(dmabuf_cmsg), - &dmabuf_cmsg); - if (err || msg->msg_flags & MSG_CTRUNC) { - msg->msg_flags &= ~MSG_CTRUNC; - if (!err) - err = -ETOOSMALL; + err = put_cmsg_notrunc(msg, SOL_SOCKET, + SO_DEVMEM_DMABUF, + sizeof(dmabuf_cmsg), + &dmabuf_cmsg); + if (err) goto out; - } atomic_long_inc(&niov->pp_ref_count); tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag); -- GitLab From f865c24bc55158313d5779fc81116023a6940ca3 Mon Sep 17 00:00:00 
2001 From: Paolo Abeni Date: Mon, 24 Feb 2025 19:11:50 +0100 Subject: [PATCH 908/989] mptcp: always handle address removal under msk socket lock Syzkaller reported a lockdep splat in the PM control path: WARNING: CPU: 0 PID: 6693 at ./include/net/sock.h:1711 sock_owned_by_me include/net/sock.h:1711 [inline] WARNING: CPU: 0 PID: 6693 at ./include/net/sock.h:1711 msk_owned_by_me net/mptcp/protocol.h:363 [inline] WARNING: CPU: 0 PID: 6693 at ./include/net/sock.h:1711 mptcp_pm_nl_addr_send_ack+0x57c/0x610 net/mptcp/pm_netlink.c:788 Modules linked in: CPU: 0 UID: 0 PID: 6693 Comm: syz.0.205 Not tainted 6.14.0-rc2-syzkaller-00303-gad1b832bf1cf #0 Hardware name: Google Compute Engine/Google Compute Engine, BIOS Google 12/27/2024 RIP: 0010:sock_owned_by_me include/net/sock.h:1711 [inline] RIP: 0010:msk_owned_by_me net/mptcp/protocol.h:363 [inline] RIP: 0010:mptcp_pm_nl_addr_send_ack+0x57c/0x610 net/mptcp/pm_netlink.c:788 Code: 5b 41 5c 41 5d 41 5e 41 5f 5d c3 cc cc cc cc e8 ca 7b d3 f5 eb b9 e8 c3 7b d3 f5 90 0f 0b 90 e9 dd fb ff ff e8 b5 7b d3 f5 90 <0f> 0b 90 e9 3e fb ff ff 44 89 f1 80 e1 07 38 c1 0f 8c eb fb ff ff RSP: 0000:ffffc900034f6f60 EFLAGS: 00010283 RAX: ffffffff8bee3c2b RBX: 0000000000000001 RCX: 0000000000080000 RDX: ffffc90004d42000 RSI: 000000000000a407 RDI: 000000000000a408 RBP: ffffc900034f7030 R08: ffffffff8bee37f6 R09: 0100000000000000 R10: dffffc0000000000 R11: ffffed100bcc62e4 R12: ffff88805e6316e0 R13: ffff88805e630c00 R14: dffffc0000000000 R15: ffff88805e630c00 FS: 00007f7e9a7e96c0(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b2fd18ff8 CR3: 0000000032c24000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: mptcp_pm_remove_addr+0x103/0x1d0 net/mptcp/pm.c:59 mptcp_pm_remove_anno_addr+0x1f4/0x2f0 net/mptcp/pm_netlink.c:1486 mptcp_nl_remove_subflow_and_signal_addr net/mptcp/pm_netlink.c:1518 [inline] mptcp_pm_nl_del_addr_doit+0x118d/0x1af0 net/mptcp/pm_netlink.c:1629 genl_family_rcv_msg_doit net/netlink/genetlink.c:1115 [inline] genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0xb1f/0xec0 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x206/0x480 net/netlink/af_netlink.c:2543 genl_rcv+0x28/0x40 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1322 [inline] netlink_unicast+0x7f6/0x990 net/netlink/af_netlink.c:1348 netlink_sendmsg+0x8de/0xcb0 net/netlink/af_netlink.c:1892 sock_sendmsg_nosec net/socket.c:718 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:733 ____sys_sendmsg+0x53a/0x860 net/socket.c:2573 ___sys_sendmsg net/socket.c:2627 [inline] __sys_sendmsg+0x269/0x350 net/socket.c:2659 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f7e9998cde9 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f7e9a7e9038 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f7e99ba5fa0 RCX: 00007f7e9998cde9 RDX: 000000002000c094 RSI: 0000400000000000 RDI: 0000000000000007 RBP: 00007f7e99a0e2a0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 00007f7e99ba5fa0 R15: 00007fff49231088 Indeed 
the PM can try to send a RM_ADDR over a msk without acquiring first the msk socket lock. The bugged code-path comes from an early optimization: when there are no subflows, the PM should (usually) not send RM_ADDR notifications. The above statement is incorrect, as without locks another process could concurrent create a new subflow and cause the RM_ADDR generation. Additionally the supposed optimization is not very effective even performance-wise, as most mptcp sockets should have at least one subflow: the MPC one. Address the issue removing the buggy code path, the existing "slow-path" will handle correctly even the edge case. Fixes: b6c08380860b ("mptcp: remove addr and subflow in PM netlink") Cc: stable@vger.kernel.org Reported-by: syzbot+cd3ce3d03a3393ae9700@syzkaller.appspotmail.com Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/546 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250224-net-mptcp-misc-fixes-v1-1-f550f636b435@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 572d160edca33..c0e47f4f7b1aa 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1514,11 +1514,6 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, if (mptcp_pm_is_userspace(msk)) goto next; - if (list_empty(&msk->conn_list)) { - mptcp_pm_remove_anno_addr(msk, addr, false); - goto next; - } - lock_sock(sk); remove_subflow = mptcp_lookup_subflow_by_saddr(&msk->conn_list, addr); mptcp_pm_remove_anno_addr(msk, addr, remove_subflow && -- GitLab From 8668860b0ad32a13fcd6c94a0995b7aa7638c9ef Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 24 Feb 2025 19:11:51 +0100 Subject: [PATCH 909/989] mptcp: reset when MPTCP opts are dropped after join Before this patch, if the checksum was not used, the subflow was only reset if map_data_len was != 0. If there were no MPTCP options or an invalid mapping, map_data_len was not set to the data len, and then the subflow was not reset as it should have been, leaving the MPTCP connection in a wrong fallback mode. This map_data_len condition has been introduced to handle the reception of the infinite mapping. Instead, a new dedicated mapping error could have been returned and treated as a special case. However, the commit 31bf11de146c ("mptcp: introduce MAPPING_BAD_CSUM") has been introduced by Paolo Abeni soon after, and backported later on to stable. It better handle the csum case, and it means the exception for valid_csum_seen in subflow_can_fallback(), plus this one for the infinite mapping in subflow_check_data_avail(), are no longer needed. In other words, the code can be simplified there: a fallback should only be done if msk->allow_infinite_fallback is set. This boolean is set to false once MPTCP-specific operations acting on the whole MPTCP connection vs the initial path have been done, e.g. a second path has been created, or an MPTCP re-injection -- yes, possible even with a single subflow. The subflow_can_fallback() helper can then be dropped, and replaced by this single condition. This also makes the code clearer: a fallback should only be done if it is possible to do so. While at it, no need to set map_data_len to 0 in get_mapping_status() for the infinite mapping case: it will be set to skb->len just after, at the end of subflow_check_data_avail(), and not read in between. 
Fixes: f8d4bcacff3b ("mptcp: infinite mapping receiving") Cc: stable@vger.kernel.org Reported-by: Chester A. Unal Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/544 Acked-by: Paolo Abeni Signed-off-by: Matthieu Baerts (NGI0) Tested-by: Chester A. Unal Link: https://patch.msgid.link/20250224-net-mptcp-misc-fixes-v1-2-f550f636b435@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index dfcbef9c46246..9f18217dddc86 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1142,7 +1142,6 @@ static enum mapping_status get_mapping_status(struct sock *ssk, if (data_len == 0) { pr_debug("infinite mapping received\n"); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX); - subflow->map_data_len = 0; return MAPPING_INVALID; } @@ -1286,18 +1285,6 @@ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ss mptcp_schedule_work(sk); } -static bool subflow_can_fallback(struct mptcp_subflow_context *subflow) -{ - struct mptcp_sock *msk = mptcp_sk(subflow->conn); - - if (subflow->mp_join) - return false; - else if (READ_ONCE(msk->csum_enabled)) - return !subflow->valid_csum_seen; - else - return READ_ONCE(msk->allow_infinite_fallback); -} - static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); @@ -1393,7 +1380,7 @@ static bool subflow_check_data_avail(struct sock *ssk) return true; } - if (!subflow_can_fallback(subflow) && subflow->map_data_len) { + if (!READ_ONCE(msk->allow_infinite_fallback)) { /* fatal protocol error, close the socket. * subflow_error_report() will introduce the appropriate barriers */ -- GitLab From db75a16813aabae3b78c06b1b99f5e314c1f55d3 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 24 Feb 2025 19:11:52 +0100 Subject: [PATCH 910/989] mptcp: safety check before fallback Recently, some fallback have been initiated, while the connection was not supposed to fallback. Add a safety check with a warning to detect when an wrong attempt to fallback is being done. This should help detecting any future issues quicker. Acked-by: Paolo Abeni Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250224-net-mptcp-misc-fixes-v1-3-f550f636b435@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index f6a207958459d..ad21925af0612 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1199,6 +1199,8 @@ static inline void __mptcp_do_fallback(struct mptcp_sock *msk) pr_debug("TCP fallback already done (msk=%p)\n", msk); return; } + if (WARN_ON_ONCE(!READ_ONCE(msk->allow_infinite_fallback))) + return; set_bit(MPTCP_FALLBACK_DONE, &msk->flags); } -- GitLab From a6aa36e957a1bfb5341986dec32d013d23228fe1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 14 Feb 2025 13:14:34 +0900 Subject: [PATCH 911/989] block: Remove zone write plugs when handling native zone append writes For devices that natively support zone append operations, REQ_OP_ZONE_APPEND BIOs are not processed through zone write plugging and are immediately issued to the zoned device. This means that there is no write pointer offset tracking done for these operations and that a zone write plug is not necessary. 
However, when receiving a zone append BIO, we may already have a zone write plug for the target zone if that zone was previously partially written using regular write operations. In such case, since the write pointer offset of the zone write plug is not incremented by the amount of sectors appended to the zone, 2 issues arise: 1) we risk leaving the plug in the disk hash table if the zone is fully written using zone append or regular write operations, because the write pointer offset will never reach the "zone full" state. 2) Regular write operations that are issued after zone append operations will always be failed by blk_zone_wplug_prepare_bio() as the write pointer alignment check will fail, even if the user correctly accounted for the zone append operations and issued the regular writes with a correct sector. Avoid these issues by immediately removing the zone write plug of zones that are the target of zone append operations when blk_zone_plug_bio() is called. The new function blk_zone_wplug_handle_native_zone_append() implements this for devices that natively support zone append. The removal of the zone write plug using disk_remove_zone_wplug() requires aborting all plugged regular write using disk_zone_wplug_abort() as otherwise the plugged write BIOs would never be executed (with the plug removed, the completion path will never see again the zone write plug as disk_get_zone_wplug() will return NULL). Rate-limited warnings are added to blk_zone_wplug_handle_native_zone_append() and to disk_zone_wplug_abort() to signal this. Since blk_zone_wplug_handle_native_zone_append() is called in the hot path for operations that will not be plugged, disk_get_zone_wplug() is optimized under the assumption that a user issuing zone append operations is not at the same time issuing regular writes and that there are no hashed zone write plugs. The struct gendisk atomic counter nr_zone_wplugs is added to check this, with this counter incremented in disk_insert_zone_wplug() and decremented in disk_remove_zone_wplug(). To be consistent with this fix, we do not need to fill the zone write plug hash table with zone write plugs for zones that are partially written for a device that supports native zone append operations. So modify blk_revalidate_seq_zone() to return early to avoid allocating and inserting a zone write plug for partially written sequential zones if the device natively supports zone append. 
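As an illustration (condensed from the description above, not taken from
the original report), the problematic sequence on a device with native
zone append support was:

        1) REQ_OP_WRITE to zone Z       -> zone write plug allocated,
                                           wp_offset tracked
        2) REQ_OP_ZONE_APPEND to Z      -> issued directly, bypasses plugging,
                                           the plug's wp_offset goes stale
        3) REQ_OP_WRITE to Z at the     -> rejected by the write pointer
           device's real write pointer     alignment check in
                                           blk_zone_wplug_prepare_bio()

With this change, step 2) removes the zone write plug, so step 3) is no
longer checked against a stale write pointer.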
Reported-by: Jorgen Hansen Fixes: 9b1ce7f0c6f8 ("block: Implement zone append emulation") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Tested-by: Jorgen Hansen Link: https://lore.kernel.org/r/20250214041434.82564-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 76 ++++++++++++++++++++++++++++++++++++++---- include/linux/blkdev.h | 7 ++-- 2 files changed, 73 insertions(+), 10 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 761ea662ddc34..0c77244a35c92 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -410,13 +410,14 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, } } hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]); + atomic_inc(&disk->nr_zone_wplugs); spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); return true; } -static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, - sector_t sector) +static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk, + sector_t sector) { unsigned int zno = disk_zone_no(disk, sector); unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits); @@ -437,6 +438,15 @@ static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, return NULL; } +static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, + sector_t sector) +{ + if (!atomic_read(&disk->nr_zone_wplugs)) + return NULL; + + return disk_get_hashed_zone_wplug(disk, sector); +} + static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head) { struct blk_zone_wplug *zwplug = @@ -503,6 +513,7 @@ static void disk_remove_zone_wplug(struct gendisk *disk, zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED; spin_lock_irqsave(&disk->zone_wplugs_lock, flags); hlist_del_init_rcu(&zwplug->node); + atomic_dec(&disk->nr_zone_wplugs); spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); disk_put_zone_wplug(zwplug); } @@ -593,6 +604,11 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) { struct bio *bio; + if (bio_list_empty(&zwplug->bio_list)) + return; + + pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n", + zwplug->disk->disk_name, zwplug->zone_no); while ((bio = bio_list_pop(&zwplug->bio_list))) blk_zone_wplug_bio_io_error(zwplug, bio); } @@ -1040,6 +1056,47 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) return true; } +static void blk_zone_wplug_handle_native_zone_append(struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + struct blk_zone_wplug *zwplug; + unsigned long flags; + + /* + * We have native support for zone append operations, so we are not + * going to handle @bio through plugging. However, we may already have a + * zone write plug for the target zone if that zone was previously + * partially written using regular writes. In such case, we risk leaving + * the plug in the disk hash table if the zone is fully written using + * zone append operations. Avoid this by removing the zone write plug. + */ + zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); + if (likely(!zwplug)) + return; + + spin_lock_irqsave(&zwplug->lock, flags); + + /* + * We are about to remove the zone write plug. But if the user + * (mistakenly) has issued regular writes together with native zone + * append, we must aborts the writes as otherwise the plugged BIOs would + * not be executed by the plug BIO work as disk_get_zone_wplug() will + * return NULL after the plug is removed. 
Aborting the plugged write + * BIOs is consistent with the fact that these writes will most likely + * fail anyway as there is no ordering guarantees between zone append + * operations and regular write operations. + */ + if (!bio_list_empty(&zwplug->bio_list)) { + pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n", + disk->disk_name, zwplug->zone_no); + disk_zone_wplug_abort(zwplug); + } + disk_remove_zone_wplug(disk, zwplug); + spin_unlock_irqrestore(&zwplug->lock, flags); + + disk_put_zone_wplug(zwplug); +} + /** * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging * @bio: The BIO being submitted @@ -1096,8 +1153,10 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) */ switch (bio_op(bio)) { case REQ_OP_ZONE_APPEND: - if (!bdev_emulates_zone_append(bdev)) + if (!bdev_emulates_zone_append(bdev)) { + blk_zone_wplug_handle_native_zone_append(bio); return false; + } fallthrough; case REQ_OP_WRITE: case REQ_OP_WRITE_ZEROES: @@ -1284,6 +1343,7 @@ static int disk_alloc_zone_resources(struct gendisk *disk, { unsigned int i; + atomic_set(&disk->nr_zone_wplugs, 0); disk->zone_wplugs_hash_bits = min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS); @@ -1338,6 +1398,7 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) } } + WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs)); kfree(disk->zone_wplugs_hash); disk->zone_wplugs_hash = NULL; disk->zone_wplugs_hash_bits = 0; @@ -1550,11 +1611,12 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, } /* - * We need to track the write pointer of all zones that are not - * empty nor full. So make sure we have a zone write plug for - * such zone if the device has a zone write plug hash table. + * If the device needs zone append emulation, we need to track the + * write pointer of all zones that are not empty nor full. So make sure + * we have a zone write plug for such zone if the device has a zone + * write plug hash table. */ - if (!disk->zone_wplugs_hash) + if (!queue_emulates_zone_append(disk->queue) || !disk->zone_wplugs_hash) return 0; disk_zone_wplug_sync_wp_offset(disk, zone); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 58ff5aca83b67..d37751789bf58 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -196,10 +196,11 @@ struct gendisk { unsigned int zone_capacity; unsigned int last_zone_capacity; unsigned long __rcu *conv_zones_bitmap; - unsigned int zone_wplugs_hash_bits; - spinlock_t zone_wplugs_lock; + unsigned int zone_wplugs_hash_bits; + atomic_t nr_zone_wplugs; + spinlock_t zone_wplugs_lock; struct mempool_s *zone_wplugs_pool; - struct hlist_head *zone_wplugs_hash; + struct hlist_head *zone_wplugs_hash; struct workqueue_struct *zone_wplugs_wq; #endif /* CONFIG_BLK_DEV_ZONED */ -- GitLab From 79990cf5e7aded76d0c092c9f5ed31eb1c75e02c Mon Sep 17 00:00:00 2001 From: Marcin Szycik Date: Mon, 24 Feb 2025 11:06:41 -0800 Subject: [PATCH 912/989] ice: Fix deinitializing VF in error path If ice_ena_vfs() fails after calling ice_create_vf_entries(), it frees all VFs without removing them from snapshot PF-VF mailbox list, leading to list corruption. Reproducer: devlink dev eswitch set $PF1_PCI mode switchdev ip l s $PF1 up ip l s $PF1 promisc on sleep 1 echo 1 > /sys/class/net/$PF1/device/sriov_numvfs sleep 1 echo 1 > /sys/class/net/$PF1/device/sriov_numvfs Trace (minimized): list_add corruption. next->prev should be prev (ffff8882e241c6f0), but was 0000000000000000. (next=ffff888455da1330). 
kernel BUG at lib/list_debug.c:29! RIP: 0010:__list_add_valid_or_report+0xa6/0x100 ice_mbx_init_vf_info+0xa7/0x180 [ice] ice_initialize_vf_entry+0x1fa/0x250 [ice] ice_sriov_configure+0x8d7/0x1520 [ice] ? __percpu_ref_switch_mode+0x1b1/0x5d0 ? __pfx_ice_sriov_configure+0x10/0x10 [ice] Sometimes a KASAN report can be seen instead with a similar stack trace: BUG: KASAN: use-after-free in __list_add_valid_or_report+0xf1/0x100 VFs are added to this list in ice_mbx_init_vf_info(), but only removed in ice_free_vfs(). Move the removing to ice_free_vf_entries(), which is also being called in other places where VFs are being removed (including ice_free_vfs() itself). Fixes: 8cd8a6b17d27 ("ice: move VF overflow message count into struct ice_mbx_vf_info") Reported-by: Sujai Buvaneswaran Closes: https://lore.kernel.org/intel-wired-lan/PH0PR11MB50138B635F2E5CEB7075325D961F2@PH0PR11MB5013.namprd11.prod.outlook.com Reviewed-by: Martyna Szapar-Mudlaw Signed-off-by: Marcin Szycik Reviewed-by: Simon Horman Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen Link: https://patch.msgid.link/20250224190647.3601930-2-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_sriov.c | 5 +---- drivers/net/ethernet/intel/ice/ice_vf_lib.c | 8 ++++++++ drivers/net/ethernet/intel/ice/ice_vf_lib_private.h | 1 + 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_sriov.c b/drivers/net/ethernet/intel/ice/ice_sriov.c index b83f99c01d91b..8aabf7749aa5e 100644 --- a/drivers/net/ethernet/intel/ice/ice_sriov.c +++ b/drivers/net/ethernet/intel/ice/ice_sriov.c @@ -36,6 +36,7 @@ static void ice_free_vf_entries(struct ice_pf *pf) hash_for_each_safe(vfs->table, bkt, tmp, vf, entry) { hash_del_rcu(&vf->entry); + ice_deinitialize_vf_entry(vf); ice_put_vf(vf); } } @@ -193,10 +194,6 @@ void ice_free_vfs(struct ice_pf *pf) wr32(hw, GLGEN_VFLRSTAT(reg_idx), BIT(bit_idx)); } - /* clear malicious info since the VF is getting released */ - if (!ice_is_feature_supported(pf, ICE_F_MBX_LIMIT)) - list_del(&vf->mbx_info.list_entry); - mutex_unlock(&vf->cfg_lock); } diff --git a/drivers/net/ethernet/intel/ice/ice_vf_lib.c b/drivers/net/ethernet/intel/ice/ice_vf_lib.c index c7c0c2f50c265..815ad0bfe8326 100644 --- a/drivers/net/ethernet/intel/ice/ice_vf_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_vf_lib.c @@ -1036,6 +1036,14 @@ void ice_initialize_vf_entry(struct ice_vf *vf) mutex_init(&vf->cfg_lock); } +void ice_deinitialize_vf_entry(struct ice_vf *vf) +{ + struct ice_pf *pf = vf->pf; + + if (!ice_is_feature_supported(pf, ICE_F_MBX_LIMIT)) + list_del(&vf->mbx_info.list_entry); +} + /** * ice_dis_vf_qs - Disable the VF queues * @vf: pointer to the VF structure diff --git a/drivers/net/ethernet/intel/ice/ice_vf_lib_private.h b/drivers/net/ethernet/intel/ice/ice_vf_lib_private.h index 0c7e77c0a09fa..5392b04049862 100644 --- a/drivers/net/ethernet/intel/ice/ice_vf_lib_private.h +++ b/drivers/net/ethernet/intel/ice/ice_vf_lib_private.h @@ -24,6 +24,7 @@ #endif void ice_initialize_vf_entry(struct ice_vf *vf); +void ice_deinitialize_vf_entry(struct ice_vf *vf); void ice_dis_vf_qs(struct ice_vf *vf); int ice_check_vf_init(struct ice_vf *vf); enum virtchnl_status_code ice_err_to_virt_err(int err); -- GitLab From 5c07be96d8b3f8447e980f29b967bf2e1d7ac732 Mon Sep 17 00:00:00 2001 From: Marcin Szycik Date: Mon, 24 Feb 2025 11:06:42 -0800 Subject: [PATCH 913/989] ice: Avoid setting default Rx VSI twice in switchdev setup As part of switchdev environment setup, uplink VSI 
is configured as default for both Tx and Rx. Default Rx VSI is also used by promiscuous mode. If promisc mode is enabled and an attempt to enter switchdev mode is made, the setup will fail because Rx VSI is already configured as default (rule exists). Reproducer: devlink dev eswitch set $PF1_PCI mode switchdev ip l s $PF1 up ip l s $PF1 promisc on echo 1 > /sys/class/net/$PF1/device/sriov_numvfs In switchdev setup, use ice_set_dflt_vsi() instead of plain ice_cfg_dflt_vsi(), which avoids repeating setting default VSI for Rx if it's already configured. Fixes: 50d62022f455 ("ice: default Tx rule instead of to queue") Reported-by: Sujai Buvaneswaran Closes: https://lore.kernel.org/intel-wired-lan/PH0PR11MB50138B635F2E5CEB7075325D961F2@PH0PR11MB5013.namprd11.prod.outlook.com Reviewed-by: Martyna Szapar-Mudlaw Signed-off-by: Marcin Szycik Reviewed-by: Simon Horman Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen Link: https://patch.msgid.link/20250224190647.3601930-3-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_eswitch.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.c b/drivers/net/ethernet/intel/ice/ice_eswitch.c index fb527434b58b1..d649c197cf673 100644 --- a/drivers/net/ethernet/intel/ice/ice_eswitch.c +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.c @@ -38,8 +38,7 @@ static int ice_eswitch_setup_env(struct ice_pf *pf) if (ice_vsi_add_vlan_zero(uplink_vsi)) goto err_vlan_zero; - if (ice_cfg_dflt_vsi(uplink_vsi->port_info, uplink_vsi->idx, true, - ICE_FLTR_RX)) + if (ice_set_dflt_vsi(uplink_vsi)) goto err_def_rx; if (ice_cfg_dflt_vsi(uplink_vsi->port_info, uplink_vsi->idx, true, -- GitLab From c6124f6fd3ca37d53ec5cbf62f9d9130ef439eca Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 24 Feb 2025 11:06:44 -0800 Subject: [PATCH 914/989] iavf: fix circular lock dependency with netdev_lock We have recently seen reports of lockdep circular lock dependency warnings when loading the iAVF driver: [ 1504.790308] ====================================================== [ 1504.790309] WARNING: possible circular locking dependency detected [ 1504.790310] 6.13.0 #net_next_rt.c2933b2befe2.el9 Not tainted [ 1504.790311] ------------------------------------------------------ [ 1504.790312] kworker/u128:0/13566 is trying to acquire lock: [ 1504.790313] ffff97d0e4738f18 (&dev->lock){+.+.}-{4:4}, at: register_netdevice+0x52c/0x710 [ 1504.790320] [ 1504.790320] but task is already holding lock: [ 1504.790321] ffff97d0e47392e8 (&adapter->crit_lock){+.+.}-{4:4}, at: iavf_finish_config+0x37/0x240 [iavf] [ 1504.790330] [ 1504.790330] which lock already depends on the new lock. 
[ 1504.790330] [ 1504.790330] [ 1504.790330] the existing dependency chain (in reverse order) is: [ 1504.790331] [ 1504.790331] -> #1 (&adapter->crit_lock){+.+.}-{4:4}: [ 1504.790333] __lock_acquire+0x52d/0xbb0 [ 1504.790337] lock_acquire+0xd9/0x330 [ 1504.790338] mutex_lock_nested+0x4b/0xb0 [ 1504.790341] iavf_finish_config+0x37/0x240 [iavf] [ 1504.790347] process_one_work+0x248/0x6d0 [ 1504.790350] worker_thread+0x18d/0x330 [ 1504.790352] kthread+0x10e/0x250 [ 1504.790354] ret_from_fork+0x30/0x50 [ 1504.790357] ret_from_fork_asm+0x1a/0x30 [ 1504.790361] [ 1504.790361] -> #0 (&dev->lock){+.+.}-{4:4}: [ 1504.790364] check_prev_add+0xf1/0xce0 [ 1504.790366] validate_chain+0x46a/0x570 [ 1504.790368] __lock_acquire+0x52d/0xbb0 [ 1504.790370] lock_acquire+0xd9/0x330 [ 1504.790371] mutex_lock_nested+0x4b/0xb0 [ 1504.790372] register_netdevice+0x52c/0x710 [ 1504.790374] iavf_finish_config+0xfa/0x240 [iavf] [ 1504.790379] process_one_work+0x248/0x6d0 [ 1504.790381] worker_thread+0x18d/0x330 [ 1504.790383] kthread+0x10e/0x250 [ 1504.790385] ret_from_fork+0x30/0x50 [ 1504.790387] ret_from_fork_asm+0x1a/0x30 [ 1504.790389] [ 1504.790389] other info that might help us debug this: [ 1504.790389] [ 1504.790389] Possible unsafe locking scenario: [ 1504.790389] [ 1504.790390] CPU0 CPU1 [ 1504.790391] ---- ---- [ 1504.790391] lock(&adapter->crit_lock); [ 1504.790393] lock(&dev->lock); [ 1504.790394] lock(&adapter->crit_lock); [ 1504.790395] lock(&dev->lock); [ 1504.790397] [ 1504.790397] *** DEADLOCK *** This appears to be caused by the change in commit 5fda3f35349b ("net: make netdev_lock() protect netdev->reg_state"), which added a netdev_lock() in register_netdevice. The iAVF driver calls register_netdevice() from iavf_finish_config(), as a final stage of its state machine post-probe. It currently takes the RTNL lock, then the netdev lock, and then the device critical lock. This pattern is used throughout the driver. Thus there is a strong dependency that the crit_lock should not be acquired before the net device lock. The change to register_netdevice creates an ABBA lock order violation because the iAVF driver is holding the crit_lock while calling register_netdevice, which then takes the netdev_lock. It seems likely that future refactors could result in netdev APIs which hold the netdev_lock while calling into the driver. This means that we should not re-order the locks so that netdev_lock is acquired after the device private crit_lock. Instead, notice that we already release the netdev_lock prior to calling the register_netdevice. This flow only happens during the early driver initialization as we transition through the __IAVF_STARTUP, __IAVF_INIT_VERSION_CHECK, __IAVF_INIT_GET_RESOURCES, etc. Analyzing the places where we take crit_lock in the driver there are two sources: a) several of the work queue tasks including adminq_task, watchdog_task, reset_task, and the finish_config task. b) various callbacks which ultimately stem back to .ndo operations or ethtool operations. The latter cannot be triggered until after the netdevice registration is completed successfully. The iAVF driver uses alloc_ordered_workqueue, which is an unbound workqueue that has a max limit of 1, and thus guarantees that only a single work item on the queue is executing at any given time, so none of the other work threads could be executing due to the ordered workqueue guarantees. The iavf_finish_config() function also does not do anything else after register_netdevice, unless it fails. 
It seems unlikely that the driver private crit_lock is protecting anything that register_netdevice() itself touches. Thus, to fix this ABBA lock violation, lets simply release the adapter->crit_lock as well as netdev_lock prior to calling register_netdevice(). We do still keep holding the RTNL lock as required by the function. If we do fail to register the netdevice, then we re-acquire the adapter critical lock to finish the transition back to __IAVF_INIT_CONFIG_ADAPTER. This ensures every call where both netdev_lock and the adapter->crit_lock are acquired under the same ordering. Fixes: afc664987ab3 ("eth: iavf: extend the netdev_lock usage") Signed-off-by: Jacob Keller Tested-by: Przemek Kitszel Reviewed-by: Przemek Kitszel Reviewed-by: Jakub Kicinski Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen Link: https://patch.msgid.link/20250224190647.3601930-5-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/iavf/iavf_main.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 852e5b62f0a5d..6faa62bced3a2 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1983,7 +1983,7 @@ static int iavf_reinit_interrupt_scheme(struct iavf_adapter *adapter, bool runni static void iavf_finish_config(struct work_struct *work) { struct iavf_adapter *adapter; - bool netdev_released = false; + bool locks_released = false; int pairs, err; adapter = container_of(work, struct iavf_adapter, finish_config); @@ -2012,19 +2012,22 @@ static void iavf_finish_config(struct work_struct *work) netif_set_real_num_tx_queues(adapter->netdev, pairs); if (adapter->netdev->reg_state != NETREG_REGISTERED) { + mutex_unlock(&adapter->crit_lock); netdev_unlock(adapter->netdev); - netdev_released = true; + locks_released = true; err = register_netdevice(adapter->netdev); if (err) { dev_err(&adapter->pdev->dev, "Unable to register netdev (%d)\n", err); /* go back and try again.*/ + mutex_lock(&adapter->crit_lock); iavf_free_rss(adapter); iavf_free_misc_irq(adapter); iavf_reset_interrupt_capability(adapter); iavf_change_state(adapter, __IAVF_INIT_CONFIG_ADAPTER); + mutex_unlock(&adapter->crit_lock); goto out; } } @@ -2040,9 +2043,10 @@ static void iavf_finish_config(struct work_struct *work) } out: - mutex_unlock(&adapter->crit_lock); - if (!netdev_released) + if (!locks_released) { + mutex_unlock(&adapter->crit_lock); netdev_unlock(adapter->netdev); + } rtnl_unlock(); } -- GitLab From b1e44b4aecb551727a368df5b85c535f2ce932ea Mon Sep 17 00:00:00 2001 From: Piotr Kwapulinski Date: Mon, 24 Feb 2025 11:06:45 -0800 Subject: [PATCH 915/989] ixgbe: fix media cage present detection for E610 device The commit 23c0e5a16bcc ("ixgbe: Add link management support for E610 device") introduced incorrect checking of media cage presence for E610 device. Fix it. 
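ixgbe_aci_get_netlist_node() follows the usual kernel convention of returning 0 on
success and a negative errno (-ENOENT when the node is absent), so a boolean "cage
present" helper has to negate that status rather than return it as-is. A standalone
sketch of the pattern, with query_node() standing in for the real firmware query:

        #include <stdbool.h>
        #include <errno.h>

        /* stand-in for the firmware query: 0 if the node exists, -ENOENT if not */
        static int query_node(bool node_exists)
        {
                return node_exists ? 0 : -ENOENT;
        }

        static bool is_media_cage_present(bool node_exists)
        {
                /*
                 * Without the '!', "found" (0) would read as false and
                 * "not found" (-ENOENT, non-zero) as true - exactly the
                 * inversion the one-character fix below corrects.
                 */
                return !query_node(node_exists);
        }
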
Fixes: 23c0e5a16bcc ("ixgbe: Add link management support for E610 device") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/e7d73b32-f12a-49d1-8b60-1ef83359ec13@stanley.mountain/ Reviewed-by: Michal Swiatkowski Reviewed-by: Przemek Kitszel Signed-off-by: Piotr Kwapulinski Reviewed-by: Simon Horman Tested-by: Bharath R Signed-off-by: Tony Nguyen Link: https://patch.msgid.link/20250224190647.3601930-6-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c index 683c668672d65..cb07ecd8937d3 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c @@ -1122,7 +1122,7 @@ static bool ixgbe_is_media_cage_present(struct ixgbe_hw *hw) * returns error (ENOENT), then no cage present. If no cage present then * connection type is backplane or BASE-T. */ - return ixgbe_aci_get_netlist_node(hw, cmd, NULL, NULL); + return !ixgbe_aci_get_netlist_node(hw, cmd, NULL, NULL); } /** -- GitLab From 39ab773e4c120f7f98d759415ccc2aca706bbc10 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Mon, 24 Feb 2025 19:12:44 +0800 Subject: [PATCH 916/989] net: enetc: fix the off-by-one issue in enetc_map_tx_buffs() When a DMA mapping error occurs while processing skb frags, it will free one more tx_swbd than expected, so fix this off-by-one issue. Fixes: d4fd0404c1c9 ("enetc: Introduce basic PF and VF ENETC ethernet drivers") Cc: stable@vger.kernel.org Suggested-by: Vladimir Oltean Suggested-by: Michal Swiatkowski Signed-off-by: Wei Fang Reviewed-by: Vladimir Oltean Reviewed-by: Claudiu Manoil Link: https://patch.msgid.link/20250224111251.1061098-2-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc.c | 26 ++++++++++++++------ 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 6a6fc819dfdee..55ad31a5073e7 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -167,6 +167,24 @@ static bool enetc_skb_is_tcp(struct sk_buff *skb) return skb->csum_offset == offsetof(struct tcphdr, check); } +/** + * enetc_unwind_tx_frame() - Unwind the DMA mappings of a multi-buffer Tx frame + * @tx_ring: Pointer to the Tx ring on which the buffer descriptors are located + * @count: Number of Tx buffer descriptors which need to be unmapped + * @i: Index of the last successfully mapped Tx buffer descriptor + */ +static void enetc_unwind_tx_frame(struct enetc_bdr *tx_ring, int count, int i) +{ + while (count--) { + struct enetc_tx_swbd *tx_swbd = &tx_ring->tx_swbd[i]; + + enetc_free_tx_frame(tx_ring, tx_swbd); + if (i == 0) + i = tx_ring->bd_count; + i--; + } +} + static int enetc_map_tx_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb) { bool do_vlan, do_onestep_tstamp = false, do_twostep_tstamp = false; @@ -372,13 +390,7 @@ static int enetc_map_tx_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb) dma_err: dev_err(tx_ring->dev, "DMA map error"); - do { - tx_swbd = &tx_ring->tx_swbd[i]; - enetc_free_tx_frame(tx_ring, tx_swbd); - if (i == 0) - i = tx_ring->bd_count; - i--; - } while (count--); + enetc_unwind_tx_frame(tx_ring, count, i); return 0; } -- GitLab From da291996b16ebd10626d4b20288327b743aff110 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Mon, 24 Feb 2025 
19:12:45 +0800 Subject: [PATCH 917/989] net: enetc: keep track of correct Tx BD count in enetc_map_tx_tso_buffs() When creating a TSO header, if the skb is VLAN tagged, the extended BD will be used and the 'count' should be increased by 2 instead of 1. Otherwise, when an error occurs, less tx_swbd will be freed than the actual number. Fixes: fb8629e2cbfc ("net: enetc: add support for software TSO") Cc: stable@vger.kernel.org Suggested-by: Vladimir Oltean Signed-off-by: Wei Fang Reviewed-by: Vladimir Oltean Reviewed-by: Claudiu Manoil Link: https://patch.msgid.link/20250224111251.1061098-3-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 55ad31a5073e7..174db9e2ce813 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -395,14 +395,15 @@ static int enetc_map_tx_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb) return 0; } -static void enetc_map_tx_tso_hdr(struct enetc_bdr *tx_ring, struct sk_buff *skb, - struct enetc_tx_swbd *tx_swbd, - union enetc_tx_bd *txbd, int *i, int hdr_len, - int data_len) +static int enetc_map_tx_tso_hdr(struct enetc_bdr *tx_ring, struct sk_buff *skb, + struct enetc_tx_swbd *tx_swbd, + union enetc_tx_bd *txbd, int *i, int hdr_len, + int data_len) { union enetc_tx_bd txbd_tmp; u8 flags = 0, e_flags = 0; dma_addr_t addr; + int count = 1; enetc_clear_tx_bd(&txbd_tmp); addr = tx_ring->tso_headers_dma + *i * TSO_HEADER_SIZE; @@ -445,7 +446,10 @@ static void enetc_map_tx_tso_hdr(struct enetc_bdr *tx_ring, struct sk_buff *skb, /* Write the BD */ txbd_tmp.ext.e_flags = e_flags; *txbd = txbd_tmp; + count++; } + + return count; } static int enetc_map_tx_tso_data(struct enetc_bdr *tx_ring, struct sk_buff *skb, @@ -802,9 +806,9 @@ static int enetc_map_tx_tso_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb /* compute the csum over the L4 header */ csum = enetc_tso_hdr_csum(&tso, skb, hdr, hdr_len, &pos); - enetc_map_tx_tso_hdr(tx_ring, skb, tx_swbd, txbd, &i, hdr_len, data_len); + count += enetc_map_tx_tso_hdr(tx_ring, skb, tx_swbd, txbd, + &i, hdr_len, data_len); bd_data_num = 0; - count++; while (data_len > 0) { int size; -- GitLab From 432a2cb3ee97a7c6ea578888fe81baad035b9307 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Mon, 24 Feb 2025 19:12:46 +0800 Subject: [PATCH 918/989] net: enetc: correct the xdp_tx statistics The 'xdp_tx' is used to count the number of XDP_TX frames sent, not the number of Tx BDs. 
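One XDP_TX frame can occupy several Tx BDs, so the two counters advance at different
rates: xdp_tx goes up by one per frame, while the in-flight descriptor count absorbs
xdp_tx_bd_cnt. A minimal sketch of the intended accounting (both counters folded into
one struct purely for illustration):

        struct xdp_tx_stats {
                unsigned long xdp_tx;            /* frames successfully queued    */
                unsigned long xdp_tx_in_flight;  /* descriptors still owned by HW */
        };

        static void account_xdp_tx(struct xdp_tx_stats *s, int xdp_tx_bd_cnt)
        {
                s->xdp_tx++;                           /* exactly one frame ...        */
                s->xdp_tx_in_flight += xdp_tx_bd_cnt;  /* ... but possibly several BDs */
        }
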
Fixes: 7ed2bc80074e ("net: enetc: add support for XDP_TX") Cc: stable@vger.kernel.org Signed-off-by: Wei Fang Reviewed-by: Ioana Ciornei Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20250224111251.1061098-4-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 174db9e2ce813..3cb9ebb13b19b 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -1917,7 +1917,7 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, enetc_xdp_drop(rx_ring, orig_i, i); tx_ring->stats.xdp_tx_drops++; } else { - tx_ring->stats.xdp_tx += xdp_tx_bd_cnt; + tx_ring->stats.xdp_tx++; rx_ring->xdp.xdp_tx_in_flight += xdp_tx_bd_cnt; xdp_tx_frm_cnt++; /* The XDP_TX enqueue was successful, so we -- GitLab From a562d0c4a893eae3ea51d512c4d90ab858a6b7ec Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Mon, 24 Feb 2025 19:12:47 +0800 Subject: [PATCH 919/989] net: enetc: VFs do not support HWTSTAMP_TX_ONESTEP_SYNC Actually ENETC VFs do not support HWTSTAMP_TX_ONESTEP_SYNC because only ENETC PF can access PMa_SINGLE_STEP registers. And there will be a crash if VFs are used to test one-step timestamp, the crash log as follows. [ 129.110909] Unable to handle kernel paging request at virtual address 00000000000080c0 [ 129.287769] Call trace: [ 129.290219] enetc_port_mac_wr+0x30/0xec (P) [ 129.294504] enetc_start_xmit+0xda4/0xe74 [ 129.298525] enetc_xmit+0x70/0xec [ 129.301848] dev_hard_start_xmit+0x98/0x118 Fixes: 41514737ecaa ("enetc: add get_ts_info interface for ethtool") Cc: stable@vger.kernel.org Signed-off-by: Wei Fang Reviewed-by: Vladimir Oltean Tested-by: Vladimir Oltean Link: https://patch.msgid.link/20250224111251.1061098-5-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc.c | 3 +++ drivers/net/ethernet/freescale/enetc/enetc_ethtool.c | 7 +++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 3cb9ebb13b19b..e946d86527904 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -3244,6 +3244,9 @@ static int enetc_hwtstamp_set(struct net_device *ndev, struct ifreq *ifr) new_offloads |= ENETC_F_TX_TSTAMP; break; case HWTSTAMP_TX_ONESTEP_SYNC: + if (!enetc_si_is_pf(priv->si)) + return -EOPNOTSUPP; + new_offloads &= ~ENETC_F_TX_TSTAMP_MASK; new_offloads |= ENETC_F_TX_ONESTEP_SYNC_TSTAMP; break; diff --git a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c index bf34b5bb1e358..ece3ae28ba827 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c @@ -832,6 +832,7 @@ static int enetc_set_coalesce(struct net_device *ndev, static int enetc_get_ts_info(struct net_device *ndev, struct kernel_ethtool_ts_info *info) { + struct enetc_ndev_priv *priv = netdev_priv(ndev); int *phc_idx; phc_idx = symbol_get(enetc_phc_index); @@ -852,8 +853,10 @@ static int enetc_get_ts_info(struct net_device *ndev, SOF_TIMESTAMPING_TX_SOFTWARE; info->tx_types = (1 << HWTSTAMP_TX_OFF) | - (1 << HWTSTAMP_TX_ON) | - (1 << HWTSTAMP_TX_ONESTEP_SYNC); + (1 << HWTSTAMP_TX_ON); + + if (enetc_si_is_pf(priv->si)) + info->tx_types |= (1 << 
HWTSTAMP_TX_ONESTEP_SYNC); info->rx_filters = (1 << HWTSTAMP_FILTER_NONE) | (1 << HWTSTAMP_FILTER_ALL); -- GitLab From bbcbc906ab7b5834c1219cd17a38d78dba904aa0 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Mon, 24 Feb 2025 19:12:48 +0800 Subject: [PATCH 920/989] net: enetc: update UDP checksum when updating originTimestamp field There is an issue with one-step timestamp based on UDP/IP. The peer will discard the sync packet because of the wrong UDP checksum. For ENETC v1, the software needs to update the UDP checksum when updating the originTimestamp field, so that the hardware can correctly update the UDP checksum when updating the correction field. Otherwise, the UDP checksum in the sync packet will be wrong. Fixes: 7294380c5211 ("enetc: support PTP Sync packet one-step timestamping") Cc: stable@vger.kernel.org Signed-off-by: Wei Fang Reviewed-by: Vladimir Oltean Tested-by: Vladimir Oltean Link: https://patch.msgid.link/20250224111251.1061098-6-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc.c | 41 ++++++++++++++++---- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index e946d86527904..9801c51b6a590 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -297,9 +297,11 @@ static int enetc_map_tx_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb) } if (do_onestep_tstamp) { - u32 lo, hi, val; - u64 sec, nsec; + __be32 new_sec_l, new_nsec; + u32 lo, hi, nsec, val; + __be16 new_sec_h; u8 *data; + u64 sec; lo = enetc_rd_hot(hw, ENETC_SICTR0); hi = enetc_rd_hot(hw, ENETC_SICTR1); @@ -313,13 +315,38 @@ static int enetc_map_tx_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb) /* Update originTimestamp field of Sync packet * - 48 bits seconds field * - 32 bits nanseconds field + * + * In addition, the UDP checksum needs to be updated + * by software after updating originTimestamp field, + * otherwise the hardware will calculate the wrong + * checksum when updating the correction field and + * update it to the packet. */ data = skb_mac_header(skb); - *(__be16 *)(data + offset2) = - htons((sec >> 32) & 0xffff); - *(__be32 *)(data + offset2 + 2) = - htonl(sec & 0xffffffff); - *(__be32 *)(data + offset2 + 6) = htonl(nsec); + new_sec_h = htons((sec >> 32) & 0xffff); + new_sec_l = htonl(sec & 0xffffffff); + new_nsec = htonl(nsec); + if (udp) { + struct udphdr *uh = udp_hdr(skb); + __be32 old_sec_l, old_nsec; + __be16 old_sec_h; + + old_sec_h = *(__be16 *)(data + offset2); + inet_proto_csum_replace2(&uh->check, skb, old_sec_h, + new_sec_h, false); + + old_sec_l = *(__be32 *)(data + offset2 + 2); + inet_proto_csum_replace4(&uh->check, skb, old_sec_l, + new_sec_l, false); + + old_nsec = *(__be32 *)(data + offset2 + 6); + inet_proto_csum_replace4(&uh->check, skb, old_nsec, + new_nsec, false); + } + + *(__be16 *)(data + offset2) = new_sec_h; + *(__be32 *)(data + offset2 + 2) = new_sec_l; + *(__be32 *)(data + offset2 + 6) = new_nsec; /* Configure single-step register */ val = ENETC_PM0_SINGLE_STEP_EN; -- GitLab From 8e43decdfbb477dd7800e3902d2d2f105d22ef5f Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Mon, 24 Feb 2025 19:12:49 +0800 Subject: [PATCH 921/989] net: enetc: add missing enetc4_link_deinit() The enetc4_link_init() is called when the PF driver probes to create phylink and MDIO bus, but we forgot to call enetc4_link_deinit() to free the phylink and MDIO bus when the driver was unbound. 
so add missing enetc4_link_deinit() to enetc4_pf_netdev_destroy(). Fixes: 99100d0d9922 ("net: enetc: add preliminary support for i.MX95 ENETC PF") Cc: stable@vger.kernel.org Signed-off-by: Wei Fang Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20250224111251.1061098-7-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc4_pf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/freescale/enetc/enetc4_pf.c b/drivers/net/ethernet/freescale/enetc/enetc4_pf.c index fc41078c4f5da..48861c8b499a0 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc4_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc4_pf.c @@ -684,6 +684,7 @@ static void enetc4_pf_netdev_destroy(struct enetc_si *si) struct net_device *ndev = si->ndev; unregister_netdev(ndev); + enetc4_link_deinit(priv); enetc_free_msix(priv); free_netdev(ndev); } -- GitLab From 119049b66b883c7e7e575a0b69dc6e3d211662cc Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Mon, 24 Feb 2025 19:12:50 +0800 Subject: [PATCH 922/989] net: enetc: remove the mm_lock from the ENETC v4 driver Currently, the ENETC v4 driver has not added the MAC merge layer support in the upstream, so the mm_lock is not initialized and used, so remove the mm_lock from the driver. Fixes: 99100d0d9922 ("net: enetc: add preliminary support for i.MX95 ENETC PF") Cc: stable@vger.kernel.org Signed-off-by: Wei Fang Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20250224111251.1061098-8-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc4_pf.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc4_pf.c b/drivers/net/ethernet/freescale/enetc/enetc4_pf.c index 48861c8b499a0..73ac8c6afb3ad 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc4_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc4_pf.c @@ -672,7 +672,6 @@ static int enetc4_pf_netdev_create(struct enetc_si *si) err_alloc_msix: err_config_si: err_clk_get: - mutex_destroy(&priv->mm_lock); free_netdev(ndev); return err; -- GitLab From 249df695c3ffe8c8d36d46c2580ce72410976f96 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Mon, 24 Feb 2025 19:12:51 +0800 Subject: [PATCH 923/989] net: enetc: fix the off-by-one issue in enetc_map_tx_tso_buffs() There is an off-by-one issue for the err_chained_bd path, it will free one more tx_swbd than expected. But there is no such issue for the err_map_data path. To fix this off-by-one issue and make the two error handling consistent, the increment of 'i' and 'count' remain in sync and enetc_unwind_tx_frame() is called for error handling. 
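The extra free comes from the loop shape: the old unwind loop was a post-tested
do { } while (count--), which runs count + 1 times, whereas enetc_unwind_tx_frame()'s
pre-tested while (count--) runs exactly count times. A tiny standalone program showing
the difference:

        #include <stdio.h>

        int main(void)
        {
                int count = 3, freed = 0;

                do {                    /* old error path: post-tested loop */
                        freed++;
                } while (count--);
                printf("do/while: %d frees\n", freed);  /* 4 - one too many */

                count = 3;
                freed = 0;
                while (count--)         /* enetc_unwind_tx_frame(): pre-tested loop */
                        freed++;
                printf("while:    %d frees\n", freed);  /* 3 */

                return 0;
        }
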
Fixes: fb8629e2cbfc ("net: enetc: add support for software TSO") Cc: stable@vger.kernel.org Suggested-by: Vladimir Oltean Signed-off-by: Wei Fang Reviewed-by: Vladimir Oltean Reviewed-by: Claudiu Manoil Link: https://patch.msgid.link/20250224111251.1061098-9-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 9801c51b6a590..2106861463e40 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -859,8 +859,13 @@ static int enetc_map_tx_tso_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb err = enetc_map_tx_tso_data(tx_ring, skb, tx_swbd, txbd, tso.data, size, size == data_len); - if (err) + if (err) { + if (i == 0) + i = tx_ring->bd_count; + i--; + goto err_map_data; + } data_len -= size; count++; @@ -889,13 +894,7 @@ static int enetc_map_tx_tso_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb dev_err(tx_ring->dev, "DMA map error"); err_chained_bd: - do { - tx_swbd = &tx_ring->tx_swbd[i]; - enetc_free_tx_frame(tx_ring, tx_swbd); - if (i == 0) - i = tx_ring->bd_count; - i--; - } while (count--); + enetc_unwind_tx_frame(tx_ring, count, i); return 0; } -- GitLab From 8d52da23b6c68a0f6bad83959ebb61a2cf623c4e Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Mon, 24 Feb 2025 17:00:47 +0800 Subject: [PATCH 924/989] tcp: Defer ts_recent changes until req is owned Recently a bug was discovered where the server had entered TCP_ESTABLISHED state, but the upper layers were not notified. The same 5-tuple packet may be processed by different CPUSs, so two CPUs may receive different ack packets at the same time when the state is TCP_NEW_SYN_RECV. In that case, req->ts_recent in tcp_check_req may be changed concurrently, which will probably cause the newsk's ts_recent to be incorrectly large. So that tcp_validate_incoming will fail. At this point, newsk will not be able to enter the TCP_ESTABLISHED. cpu1 cpu2 tcp_check_req tcp_check_req req->ts_recent = rcv_tsval = t1 req->ts_recent = rcv_tsval = t2 syn_recv_sock tcp_sk(child)->rx_opt.ts_recent = req->ts_recent = t2 // t1 < t2 tcp_child_process tcp_rcv_state_process tcp_validate_incoming tcp_paws_check if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win) // t2 - t1 > paws_win, failed tcp_v4_do_rcv tcp_rcv_state_process // TCP_ESTABLISHED The cpu2's skb or a newly received skb will call tcp_v4_do_rcv to get the newsk into the TCP_ESTABLISHED state, but at this point it is no longer possible to notify the upper layer application. A notification mechanism could be added here, but the fix is more complex, so the current fix is used. In tcp_check_req, req->ts_recent is used to assign a value to tcp_sk(child)->rx_opt.ts_recent, so removing the change in req->ts_recent and changing tcp_sk(child)->rx_opt.ts_recent directly after owning the req fixes this bug. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Wang Hai Reviewed-by: Jason Xing Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Signed-off-by: David S. 
Miller --- net/ipv4/tcp_minisocks.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b089b08e96178..dfdb7a4608a85 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -815,12 +815,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* In sequence, PAWS is OK. */ - /* TODO: We probably should defer ts_recent change once - * we take ownership of @req. - */ - if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) - WRITE_ONCE(req->ts_recent, tmp_opt.rcv_tsval); - if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { /* Truncate SYN, it is out of window starting at tcp_rsk(req)->rcv_isn + 1. */ @@ -869,6 +863,10 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!child) goto listen_overflow; + if (own_req && tmp_opt.saw_tstamp && + !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) + tcp_sk(child)->rx_opt.ts_recent = tmp_opt.rcv_tsval; + if (own_req && rsk_drop_req(req)) { reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); -- GitLab From 17bcd714426386fda741a4bccd96a2870179344b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 24 Feb 2025 15:55:36 -0800 Subject: [PATCH 925/989] KVM: x86: Free vCPUs before freeing VM state Free vCPUs before freeing any VM state, as both SVM and VMX may access VM state when "freeing" a vCPU that is currently "in" L2, i.e. that needs to be kicked out of nested guest mode. Commit 6fcee03df6a1 ("KVM: x86: avoid loading a vCPU after .vm_destroy was called") partially fixed the issue, but for unknown reasons only moved the MMU unloading before VM destruction. Complete the change, and free all vCPU state prior to destroying VM state, as nVMX accesses even more state than nSVM. In addition to the AVIC, KVM can hit a use-after-free on MSR filters: kvm_msr_allowed+0x4c/0xd0 __kvm_set_msr+0x12d/0x1e0 kvm_set_msr+0x19/0x40 load_vmcs12_host_state+0x2d8/0x6e0 [kvm_intel] nested_vmx_vmexit+0x715/0xbd0 [kvm_intel] nested_vmx_free_vcpu+0x33/0x50 [kvm_intel] vmx_free_vcpu+0x54/0xc0 [kvm_intel] kvm_arch_vcpu_destroy+0x28/0xf0 kvm_vcpu_destroy+0x12/0x50 kvm_arch_destroy_vm+0x12c/0x1c0 kvm_put_kvm+0x263/0x3c0 kvm_vm_release+0x21/0x30 and an upcoming fix to process injectable interrupts on nested VM-Exit will access the PIC: BUG: kernel NULL pointer dereference, address: 0000000000000090 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page CPU: 23 UID: 1000 PID: 2658 Comm: kvm-nx-lpage-re RIP: 0010:kvm_cpu_has_extint+0x2f/0x60 [kvm] Call Trace: kvm_cpu_has_injectable_intr+0xe/0x60 [kvm] nested_vmx_vmexit+0x2d7/0xdf0 [kvm_intel] nested_vmx_free_vcpu+0x40/0x50 [kvm_intel] vmx_vcpu_free+0x2d/0x80 [kvm_intel] kvm_arch_vcpu_destroy+0x2d/0x130 [kvm] kvm_destroy_vcpus+0x8a/0x100 [kvm] kvm_arch_destroy_vm+0xa7/0x1d0 [kvm] kvm_destroy_vm+0x172/0x300 [kvm] kvm_vcpu_release+0x31/0x50 [kvm] Inarguably, both nSVM and nVMX need to be fixed, but punt on those cleanups for the moment. Conceptually, vCPUs should be freed before VM state. Assets like the I/O APIC and PIC _must_ be allocated before vCPUs are created, so it stands to reason that they must be freed _after_ vCPUs are destroyed. 
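The ordering rule here is the usual ownership one: anything a vCPU may still
dereference while being torn down (the I/O APIC, PIC, MSR filter) has to outlive it,
so vCPUs are destroyed first and shared VM state last. A stripped-down, standalone
illustration of that ordering, in plain C rather than KVM code:

        #include <stdlib.h>

        struct irqchip { int pending; };
        struct vcpu    { struct irqchip *pic; };

        static void vcpu_destroy(struct vcpu *v)
        {
                v->pic->pending = 0;    /* teardown may still touch shared state */
                free(v);
        }

        static void vm_destroy(struct vcpu *v, struct irqchip *pic)
        {
                vcpu_destroy(v);        /* children first ...                  */
                free(pic);              /* ... shared state last, never before */
        }

        int main(void)
        {
                struct irqchip *pic = calloc(1, sizeof(*pic));
                struct vcpu *v = calloc(1, sizeof(*v));

                if (!pic || !v)
                        return 1;
                v->pic = pic;
                vm_destroy(v, pic);
                return 0;
        }
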
Reported-by: Aaron Lewis Closes: https://lore.kernel.org/all/20240703175618.2304869-2-aaronlewis@google.com Cc: Jim Mattson Cc: Yan Zhao Cc: Rick P Edgecombe Cc: Kai Huang Cc: Isaku Yamahata Signed-off-by: Sean Christopherson Message-ID: <20250224235542.2562848-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 02159c967d29e..6fc4ddc606bd4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12877,11 +12877,11 @@ void kvm_arch_destroy_vm(struct kvm *kvm) mutex_unlock(&kvm->slots_lock); } kvm_unload_vcpu_mmus(kvm); + kvm_destroy_vcpus(kvm); kvm_x86_call(vm_destroy)(kvm); kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); - kvm_destroy_vcpus(kvm); kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); kvm_mmu_uninit_vm(kvm); -- GitLab From 982caaa1150479f022003390cd72a1941663d211 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 24 Feb 2025 15:55:37 -0800 Subject: [PATCH 926/989] KVM: nVMX: Process events on nested VM-Exit if injectable IRQ or NMI is pending Process pending events on nested VM-Exit if the vCPU has an injectable IRQ or NMI, as the event may have become pending while L2 was active, i.e. may not be tracked in the context of vmcs01. E.g. if L1 has passed its APIC through to L2 and an IRQ arrives while L2 is active, then KVM needs to request an IRQ window prior to running L1, otherwise delivery of the IRQ will be delayed until KVM happens to process events for some other reason. The missed failure is detected by vmx_apic_passthrough_tpr_threshold_test in KVM-Unit-Tests, but has effectively been masked due to a flaw in KVM's PIC emulation that causes KVM to make spurious KVM_REQ_EVENT requests (and apparently no one ever ran the test with split IRQ chips). Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-ID: <20250224235542.2562848-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 8a7af02d466e9..ed8a3cb539612 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5084,6 +5084,17 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, load_vmcs12_host_state(vcpu, vmcs12); + /* + * Process events if an injectable IRQ or NMI is pending, even + * if the event is blocked (RFLAGS.IF is cleared on VM-Exit). + * If an event became pending while L2 was active, KVM needs to + * either inject the event or request an IRQ/NMI window. SMIs + * don't need to be processed as SMM is mutually exclusive with + * non-root mode. INIT/SIPI don't need to be checked as INIT + * is blocked post-VMXON, and SIPIs are ignored. + */ + if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending) + kvm_make_request(KVM_REQ_EVENT, vcpu); return; } -- GitLab From 2e064e3f3282ec016d80cb7b1fadff0d8e2014ca Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 10 Feb 2025 19:23:50 +0900 Subject: [PATCH 927/989] drm/imagination: remove unnecessary header include path drivers/gpu/drm/imagination/ includes local headers with the double-quote form (#include "..."). Hence, the header search path addition is unneeded. 
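Quote-form includes are resolved against the including file's own directory before
any -I path is consulted, which is why the extra flag buys nothing for purely local
headers. For example (the include lines are illustrative; pvr_ccb.c is one of the
objects listed in the Makefile):

        /* drivers/gpu/drm/imagination/pvr_ccb.c */
        #include "pvr_ccb.h"       /* quote form: found next to this file, no -I$(src) needed     */
        #include <linux/types.h>   /* angle form: only the configured include paths are searched  */
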
Signed-off-by: Masahiro Yamada Reviewed-by: Matt Coster Link: https://patchwork.freedesktop.org/patch/msgid/20250210102352.1517115-1-masahiroy@kernel.org Signed-off-by: Matt Coster --- drivers/gpu/drm/imagination/Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/imagination/Makefile b/drivers/gpu/drm/imagination/Makefile index 9bc6a3884c223..3d9d4d40fb806 100644 --- a/drivers/gpu/drm/imagination/Makefile +++ b/drivers/gpu/drm/imagination/Makefile @@ -1,8 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only OR MIT # Copyright (c) 2023 Imagination Technologies Ltd. -subdir-ccflags-y := -I$(src) - powervr-y := \ pvr_ccb.o \ pvr_cccb.o \ -- GitLab From 130ff5c8b78e6fd05270a04985c50bce6a3de6c1 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 25 Feb 2025 15:16:12 +0100 Subject: [PATCH 928/989] ata: ahci: Make ahci_ignore_port() handle empty mask_port_map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 8c87215dd3a2 ("ata: libahci_platform: support non-consecutive port numbers") added a skip to ahci_platform_enable_phys() for ports that are not in mask_port_map. The code in ahci_platform_get_resources(), will currently set mask_port_map for each child "port" node it finds in the device tree. However, device trees that do not have any child "port" nodes will not have mask_port_map set, and for non-device tree platforms mask_port_map will only exist as a quirk for specific PCI device + vendor IDs, or as a kernel module parameter, but will not be set by default. Therefore, the common thing is that mask_port_map is only set if you do not want to use all ports (as defined by Offset 0Ch: PI – Ports Implemented register), but instead only want to use the ports in mask_port_map. If mask_port_map is not set, all ports are available. Thus, ahci_ignore_port() must be able to handle an empty mask_port_map. 
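A standalone sketch of the resulting three-way check, with the cases spelled out (it
mirrors the fixed helper's logic; the ahci_host_priv plumbing is omitted):

        #include <stdbool.h>

        static bool ignore_port(unsigned int nports, unsigned long mask_port_map,
                                unsigned int portid)
        {
                if (portid >= nports)
                        return true;       /* beyond the implemented ports     */
                if (!mask_port_map)
                        return false;      /* empty mask: every port is usable */
                return !(mask_port_map & (1UL << portid));
        }

        /*
         * ignore_port(6, 0x0, 3) -> false  (no mask, port implemented)
         * ignore_port(6, 0x5, 2) -> false  (bit 2 set in the mask)
         * ignore_port(6, 0x5, 1) -> true   (bit 1 clear in the mask)
         * ignore_port(6, 0x0, 7) -> true   (port not implemented at all)
         */
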
Fixes: 8c87215dd3a2 ("ata: libahci_platform: support non-consecutive port numbers") Fixes: 2c202e6c4f4d ("ata: libahci_platform: Do not set mask_port_map when not needed") Fixes: c9b5be909e65 ("ahci: Introduce ahci_ignore_port() helper") Reported-by: Marek Szyprowski Closes: https://lore.kernel.org/linux-ide/10b31dd0-d0bb-4f76-9305-2195c3e17670@samsung.com/ Tested-by: Marek Szyprowski Co-developed-by: Damien Le Moal Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20250225141612.942170-2-cassel@kernel.org Signed-off-by: Niklas Cassel --- drivers/ata/ahci.h | 8 ++++++-- drivers/ata/libahci.c | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h index aea30df50c581..b2e0ef4efbdc3 100644 --- a/drivers/ata/ahci.h +++ b/drivers/ata/ahci.h @@ -386,8 +386,12 @@ struct ahci_host_priv { static inline bool ahci_ignore_port(struct ahci_host_priv *hpriv, unsigned int portid) { - return portid >= hpriv->nports || - !(hpriv->mask_port_map & (1 << portid)); + if (portid >= hpriv->nports) + return true; + /* mask_port_map not set means that all ports are available */ + if (!hpriv->mask_port_map) + return false; + return !(hpriv->mask_port_map & (1 << portid)); } extern int ahci_ignore_sss; diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c index fdfa7b2662180..e7ace4b10f15b 100644 --- a/drivers/ata/libahci.c +++ b/drivers/ata/libahci.c @@ -541,6 +541,7 @@ void ahci_save_initial_config(struct device *dev, struct ahci_host_priv *hpriv) hpriv->saved_port_map = port_map; } + /* mask_port_map not set means that all ports are available */ if (hpriv->mask_port_map) { dev_warn(dev, "masking port_map 0x%lx -> 0x%lx\n", port_map, -- GitLab From f2ba0cf1ca32e075617813de98c826ab55d57f11 Mon Sep 17 00:00:00 2001 From: Mingcong Bai Date: Tue, 25 Feb 2025 15:31:01 +0800 Subject: [PATCH 929/989] drm/xe/regs: remove a duplicate definition for RING_CTL_SIZE(size) Commit b79e8fd954c4 ("drm/xe: Remove dependency on intel_engine_regs.h") introduced an internal set of engine registers, however, as part of this change, it has also introduced two duplicate `define' lines for `RING_CTL_SIZE(size)'. This commit was introduced to the tree in v6.8-rc1. While this is harmless as the definitions did not change, so no compiler warning was observed. Drop this line anyway for the sake of correctness. 
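The reason this never produced a diagnostic: C permits redefining a macro when the
replacement list is token-for-token identical, so the duplicate line is accepted
silently; only a differing body would have warned. A minimal illustration (PAGE_SIZE
stubbed out here):

        #define PAGE_SIZE 4096

        #define RING_CTL_SIZE(size) ((size) - PAGE_SIZE)
        #define RING_CTL_SIZE(size) ((size) - PAGE_SIZE)      /* identical: no warning        */
        /* #define RING_CTL_SIZE(size) ((size) - 2*PAGE_SIZE)    differing: "redefined" warns */
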
Cc: stable@vger.kernel.org # v6.8-rc1+ Fixes: b79e8fd954c4 ("drm/xe: Remove dependency on intel_engine_regs.h") Signed-off-by: Mingcong Bai Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20250225073104.865230-1-jeffbai@aosc.io Signed-off-by: Rodrigo Vivi (cherry picked from commit 6b68c4542ffecc36087a9e14db8fc990c88bb01b) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/regs/xe_engine_regs.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h index d86219dedde2a..b732c89816dff 100644 --- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h @@ -53,7 +53,6 @@ #define RING_CTL(base) XE_REG((base) + 0x3c) #define RING_CTL_SIZE(size) ((size) - PAGE_SIZE) /* in bytes -> pages */ -#define RING_CTL_SIZE(size) ((size) - PAGE_SIZE) /* in bytes -> pages */ #define RING_START_UDW(base) XE_REG((base) + 0x48) -- GitLab From 12c2f962fe71f390951d9242725bc7e608f55927 Mon Sep 17 00:00:00 2001 From: Tejas Upadhyay Date: Tue, 25 Feb 2025 10:27:54 +0530 Subject: [PATCH 930/989] drm/xe: cancel pending job timer before freeing scheduler The async call to __guc_exec_queue_fini_async frees the scheduler while a submission may time out and restart. To prevent this race condition, the pending job timer should be canceled before freeing the scheduler. V3(MattB): - Adjust position of cancel pending job - Remove gitlab issue# from commit message V2(MattB): - Cancel pending jobs before scheduler finish Fixes: a20c75dba192 ("drm/xe: Call __guc_exec_queue_fini_async direct for KERNEL exec_queues") Reviewed-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20250225045754.600905-1-tejas.upadhyay@intel.com Signed-off-by: Tejas Upadhyay (cherry picked from commit 18fbd567e75f9b97b699b2ab4f1fa76b7cf268f6) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_guc_submit.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index 913c74d6e2aeb..b6a2dd742ebdc 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -1248,6 +1248,8 @@ static void __guc_exec_queue_fini_async(struct work_struct *w) if (xe_exec_queue_is_lr(q)) cancel_work_sync(&ge->lr_tdr); + /* Confirm no work left behind accessing device structures */ + cancel_delayed_work_sync(&ge->sched.base.work_tdr); release_guc_id(guc, q); xe_sched_entity_fini(&ge->entity); xe_sched_fini(&ge->sched); -- GitLab From 16fef33fdb1e2269c20697d9b61ae8022bc92665 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Mon, 24 Feb 2025 11:32:42 +0200 Subject: [PATCH 931/989] drm/i915/dp_mst: Fix encoder HW state readout for UHBR MST The encoder HW/SW state verification should use a SW state which stays unchanged while the encoder/output is active. The intel_dp::is_mst flag used during state computation to choose between the DP SST/MST modes can change while the output is active, if the sink gets disconnected or the MST topology is removed for another reason. A subsequent state verification using intel_dp::is_mst leads then to a mismatch if the output is disabled/re-enabled without recomputing its state. Use the encoder's active MST link count instead, which will be always non-zero for an active MST output and will be zero for SST. 
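The underlying rule: state verification should key off something that cannot change
while the output stays active. A condensed sketch of that distinction (struct and
field names are illustrative, not the display code):

        struct encoder_sw_state {
                bool is_mst;            /* recomputed on hotplug: may flip while the output is up */
                int  active_mst_links;  /* non-zero for an active MST output, zero for SST        */
        };

        static bool readout_as_mst(const struct encoder_sw_state *s)
        {
                /* trust the invariant link count, not the mutable flag */
                return s->active_mst_links > 0;
        }
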
Fixes: 35d2e4b75649 ("drm/i915/ddi: start distinguishing 128b/132b SST and MST at state readout") Fixes: 40d489fac0e8 ("drm/i915/ddi: handle 128b/132b SST in intel_ddi_read_func_ctl()") Cc: Jani Nikula Reviewed-by: Jani Nikula Signed-off-by: Imre Deak Link: https://patchwork.freedesktop.org/patch/msgid/20250224093242.1859583-1-imre.deak@intel.com (cherry picked from commit 0159e311772af9d6598aafe072c020687720f1d7) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_ddi.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c index 18c66992aa1d8..ff2cf3daa7a2b 100644 --- a/drivers/gpu/drm/i915/display/intel_ddi.c +++ b/drivers/gpu/drm/i915/display/intel_ddi.c @@ -866,7 +866,7 @@ static void intel_ddi_get_encoder_pipes(struct intel_encoder *encoder, encoder->base.base.id, encoder->base.name); if (!mst_pipe_mask && dp128b132b_pipe_mask) { - struct intel_dp *intel_dp = enc_to_intel_dp(encoder); + struct intel_digital_port *dig_port = enc_to_dig_port(encoder); /* * If we don't have 8b/10b MST, but have more than one @@ -878,7 +878,8 @@ static void intel_ddi_get_encoder_pipes(struct intel_encoder *encoder, * we don't expect MST to have been enabled at that point, and * can assume it's SST. */ - if (hweight8(dp128b132b_pipe_mask) > 1 || intel_dp->is_mst) + if (hweight8(dp128b132b_pipe_mask) > 1 || + intel_dp_mst_encoder_active_links(dig_port)) mst_pipe_mask = dp128b132b_pipe_mask; } @@ -4151,13 +4152,13 @@ static void intel_ddi_read_func_ctl(struct intel_encoder *encoder, } else if (ddi_mode == TRANS_DDI_MODE_SELECT_DP_MST) { intel_ddi_read_func_ctl_dp_mst(encoder, pipe_config, ddi_func_ctl); } else if (ddi_mode == TRANS_DDI_MODE_SELECT_FDI_OR_128B132B && HAS_DP20(display)) { - struct intel_dp *intel_dp = enc_to_intel_dp(encoder); + struct intel_digital_port *dig_port = enc_to_dig_port(encoder); /* * If this is true, we know we're being called from mst stream * encoder's ->get_config(). */ - if (intel_dp->is_mst) + if (intel_dp_mst_encoder_active_links(dig_port)) intel_ddi_read_func_ctl_dp_mst(encoder, pipe_config, ddi_func_ctl); else intel_ddi_read_func_ctl_dp_sst(encoder, pipe_config, ddi_func_ctl); -- GitLab From 01f1d77a2630e774ce33233c4e6723bca3ae9daa Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 14 Jan 2025 10:57:25 +0100 Subject: [PATCH 932/989] drm/nouveau: Do not override forced connector status Keep user-forced connector status even if it cannot be programmed. Same behavior as for the rest of the drivers. 
Signed-off-by: Thomas Zimmermann Signed-off-by: Lyude Paul Link: https://patchwork.freedesktop.org/patch/msgid/20250114100214.195386-1-tzimmermann@suse.de --- drivers/gpu/drm/nouveau/nouveau_connector.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_connector.c b/drivers/gpu/drm/nouveau/nouveau_connector.c index 8d5c9c74cbb90..eac0d1d2dbda2 100644 --- a/drivers/gpu/drm/nouveau/nouveau_connector.c +++ b/drivers/gpu/drm/nouveau/nouveau_connector.c @@ -775,7 +775,6 @@ nouveau_connector_force(struct drm_connector *connector) if (!nv_encoder) { NV_ERROR(drm, "can't find encoder to force %s on!\n", connector->name); - connector->status = connector_status_disconnected; return; } -- GitLab From dd1998e243f5fa25d348a384ba0b6c84d980f2b2 Mon Sep 17 00:00:00 2001 From: Tyrone Ting Date: Thu, 20 Feb 2025 12:00:29 +0800 Subject: [PATCH 933/989] i2c: npcm: disable interrupt enable bit before devm_request_irq The customer reports that there is a soft lockup issue related to the i2c driver. After checking, the i2c module was doing a tx transfer and the bmc machine reboots in the middle of the i2c transaction, the i2c module keeps the status without being reset. Due to such an i2c module status, the i2c irq handler keeps getting triggered since the i2c irq handler is registered in the kernel booting process after the bmc machine is doing a warm rebooting. The continuous triggering is stopped by the soft lockup watchdog timer. Disable the interrupt enable bit in the i2c module before calling devm_request_irq to fix this issue since the i2c relative status bit is read-only. Here is the soft lockup log. [ 28.176395] watchdog: BUG: soft lockup - CPU#0 stuck for 26s! [swapper/0:1] [ 28.183351] Modules linked in: [ 28.186407] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.15.120-yocto-s-dirty-bbebc78 #1 [ 28.201174] pstate: 40000005 (nZcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 28.208128] pc : __do_softirq+0xb0/0x368 [ 28.212055] lr : __do_softirq+0x70/0x368 [ 28.215972] sp : ffffff8035ebca00 [ 28.219278] x29: ffffff8035ebca00 x28: 0000000000000002 x27: ffffff80071a3780 [ 28.226412] x26: ffffffc008bdc000 x25: ffffffc008bcc640 x24: ffffffc008be50c0 [ 28.233546] x23: ffffffc00800200c x22: 0000000000000000 x21: 000000000000001b [ 28.240679] x20: 0000000000000000 x19: ffffff80001c3200 x18: ffffffffffffffff [ 28.247812] x17: ffffffc02d2e0000 x16: ffffff8035eb8b40 x15: 00001e8480000000 [ 28.254945] x14: 02c3647e37dbfcb6 x13: 02c364f2ab14200c x12: 0000000002c364f2 [ 28.262078] x11: 00000000fa83b2da x10: 000000000000b67e x9 : ffffffc008010250 [ 28.269211] x8 : 000000009d983d00 x7 : 7fffffffffffffff x6 : 0000036d74732434 [ 28.276344] x5 : 00ffffffffffffff x4 : 0000000000000015 x3 : 0000000000000198 [ 28.283476] x2 : ffffffc02d2e0000 x1 : 00000000000000e0 x0 : ffffffc008bdcb40 [ 28.290611] Call trace: [ 28.293052] __do_softirq+0xb0/0x368 [ 28.296625] __irq_exit_rcu+0xe0/0x100 [ 28.300374] irq_exit+0x14/0x20 [ 28.303513] handle_domain_irq+0x68/0x90 [ 28.307440] gic_handle_irq+0x78/0xb0 [ 28.311098] call_on_irq_stack+0x20/0x38 [ 28.315019] do_interrupt_handler+0x54/0x5c [ 28.319199] el1_interrupt+0x2c/0x4c [ 28.322777] el1h_64_irq_handler+0x14/0x20 [ 28.326872] el1h_64_irq+0x74/0x78 [ 28.330269] __setup_irq+0x454/0x780 [ 28.333841] request_threaded_irq+0xd0/0x1b4 [ 28.338107] devm_request_threaded_irq+0x84/0x100 [ 28.342809] npcm_i2c_probe_bus+0x188/0x3d0 [ 28.346990] platform_probe+0x6c/0xc4 [ 28.350653] really_probe+0xcc/0x45c [ 28.354227] __driver_probe_device+0x8c/0x160 [ 28.358578] 
driver_probe_device+0x44/0xe0 [ 28.362670] __driver_attach+0x124/0x1d0 [ 28.366589] bus_for_each_dev+0x7c/0xe0 [ 28.370426] driver_attach+0x28/0x30 [ 28.373997] bus_add_driver+0x124/0x240 [ 28.377830] driver_register+0x7c/0x124 [ 28.381662] __platform_driver_register+0x2c/0x34 [ 28.386362] npcm_i2c_init+0x3c/0x5c [ 28.389937] do_one_initcall+0x74/0x230 [ 28.393768] kernel_init_freeable+0x24c/0x2b4 [ 28.398126] kernel_init+0x28/0x130 [ 28.401614] ret_from_fork+0x10/0x20 [ 28.405189] Kernel panic - not syncing: softlockup: hung tasks [ 28.411011] SMP: stopping secondary CPUs [ 28.414933] Kernel Offset: disabled [ 28.418412] CPU features: 0x00000000,00000802 [ 28.427644] Rebooting in 20 seconds.. Fixes: 56a1485b102e ("i2c: npcm7xx: Add Nuvoton NPCM I2C controller driver") Signed-off-by: Tyrone Ting Cc: # v5.8+ Reviewed-by: Tali Perry Signed-off-by: Andi Shyti Link: https://lore.kernel.org/r/20250220040029.27596-2-kfting@nuvoton.com --- drivers/i2c/busses/i2c-npcm7xx.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/i2c/busses/i2c-npcm7xx.c b/drivers/i2c/busses/i2c-npcm7xx.c index 3ca08b8ef8af3..de713b5747fe5 100644 --- a/drivers/i2c/busses/i2c-npcm7xx.c +++ b/drivers/i2c/busses/i2c-npcm7xx.c @@ -2554,6 +2554,13 @@ static int npcm_i2c_probe_bus(struct platform_device *pdev) if (irq < 0) return irq; + /* + * Disable the interrupt to avoid the interrupt handler being triggered + * incorrectly by the asynchronous interrupt status since the machine + * might do a warm reset during the last smbus/i2c transfer session. + */ + npcm_i2c_int_enable(bus, false); + ret = devm_request_irq(bus->dev, irq, npcm_i2c_bus_irq, 0, dev_name(bus->dev), bus); if (ret) -- GitLab From 71c49ee9bb41e1709abac7e2eb05f9193222e580 Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Thu, 20 Feb 2025 20:56:12 +0800 Subject: [PATCH 934/989] i2c: ls2x: Fix frequency division register access According to the chip manual, the I2C register access type of Loongson-2K2000/LS7A is "B", so we can only access registers in byte form (readb()/writeb()). Although Loongson-2K0500/Loongson-2K1000 do not have similar constraints, register accesses in byte form also behave correctly. Also, in hardware, the frequency division registers are defined as two separate registers (high 8-bit and low 8-bit), so we just access them directly as bytes. 
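What the byte-wise access amounts to, as a standalone sketch; the 50 MHz reference
clock below is only an assumed example value (the driver takes it from
LS2X_I2C_PCLK_FREQ), and plain shifts are used where the driver uses
FIELD_GET()/GENMASK():

        #include <stdint.h>
        #include <stdio.h>

        static void write_prescaler(volatile uint8_t *prer_lo, volatile uint8_t *prer_hi,
                                    uint32_t pclk_hz, uint32_t bus_hz)
        {
                uint16_t val = pclk_hz / (5 * bus_hz) - 1;

                *prer_lo = val & 0xff;          /* low byte register  (offset 0x0) */
                *prer_hi = (val >> 8) & 0xff;   /* high byte register (offset 0x1) */
        }

        int main(void)
        {
                uint8_t lo, hi;

                /* assumed 50 MHz PCLK, 400 kHz Fast-mode bus */
                write_prescaler(&lo, &hi, 50000000, 400000);
                printf("PRER_LO=0x%02x PRER_HI=0x%02x\n", lo, hi);  /* 0x18 / 0x00 */
                return 0;
        }
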
Fixes: 015e61f0bffd ("i2c: ls2x: Add driver for Loongson-2K/LS7A I2C controller") Co-developed-by: Hongliang Wang Signed-off-by: Hongliang Wang Signed-off-by: Binbin Zhou Cc: stable@vger.kernel.org # v6.3+ Reviewed-by: Andy Shevchenko Signed-off-by: Andi Shyti Link: https://lore.kernel.org/r/20250220125612.1910990-1-zhoubinbin@loongson.cn --- drivers/i2c/busses/i2c-ls2x.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-ls2x.c b/drivers/i2c/busses/i2c-ls2x.c index 8821cac3897b6..b475dd27b7af9 100644 --- a/drivers/i2c/busses/i2c-ls2x.c +++ b/drivers/i2c/busses/i2c-ls2x.c @@ -10,6 +10,7 @@ * Rewritten for mainline by Binbin Zhou */ +#include #include #include #include @@ -26,7 +27,8 @@ #include /* I2C Registers */ -#define I2C_LS2X_PRER 0x0 /* Freq Division Register(16 bits) */ +#define I2C_LS2X_PRER_LO 0x0 /* Freq Division Low Byte Register */ +#define I2C_LS2X_PRER_HI 0x1 /* Freq Division High Byte Register */ #define I2C_LS2X_CTR 0x2 /* Control Register */ #define I2C_LS2X_TXR 0x3 /* Transport Data Register */ #define I2C_LS2X_RXR 0x3 /* Receive Data Register */ @@ -93,6 +95,7 @@ static irqreturn_t ls2x_i2c_isr(int this_irq, void *dev_id) */ static void ls2x_i2c_adjust_bus_speed(struct ls2x_i2c_priv *priv) { + u16 val; struct i2c_timings *t = &priv->i2c_t; struct device *dev = priv->adapter.dev.parent; u32 acpi_speed = i2c_acpi_find_bus_speed(dev); @@ -104,9 +107,14 @@ static void ls2x_i2c_adjust_bus_speed(struct ls2x_i2c_priv *priv) else t->bus_freq_hz = LS2X_I2C_FREQ_STD; - /* Calculate and set i2c frequency. */ - writew(LS2X_I2C_PCLK_FREQ / (5 * t->bus_freq_hz) - 1, - priv->base + I2C_LS2X_PRER); + /* + * According to the chip manual, we can only access the registers as bytes, + * otherwise the high bits will be truncated. + * So set the I2C frequency with a sequential writeb() instead of writew(). + */ + val = LS2X_I2C_PCLK_FREQ / (5 * t->bus_freq_hz) - 1; + writeb(FIELD_GET(GENMASK(7, 0), val), priv->base + I2C_LS2X_PRER_LO); + writeb(FIELD_GET(GENMASK(15, 8), val), priv->base + I2C_LS2X_PRER_HI); } static void ls2x_i2c_init(struct ls2x_i2c_priv *priv) -- GitLab From 9f3c507cb44498067c980674139bcad56e582ee6 Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Wed, 19 Feb 2025 19:27:47 +0530 Subject: [PATCH 935/989] i2c: amd-asf: Fix EOI register write to enable successive interrupts The commit b1f8921dfbaa ("i2c: amd-asf: Clear remote IRR bit to get successive interrupt") introduced a method to enable successive interrupts but inadvertently omitted the necessary write to the EOI register, resulting in a failure to receive successive interrupts. Fix this by adding the required write to the EOI register. 
Fixes: b1f8921dfbaa ("i2c: amd-asf: Clear remote IRR bit to get successive interrupt") Cc: stable@vger.kernel.org # v6.13+ Co-developed-by: Sanket Goswami Signed-off-by: Sanket Goswami Signed-off-by: Shyam Sundar S K Fixes: 9b25419ad397 ("i2c: amd-asf: Add routine to handle the ASF slave process") Signed-off-by: Andi Shyti Link: https://lore.kernel.org/r/20250219135747.3251182-1-Shyam-sundar.S-k@amd.com --- drivers/i2c/busses/i2c-amd-asf-plat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/busses/i2c-amd-asf-plat.c b/drivers/i2c/busses/i2c-amd-asf-plat.c index 7512614bf4b73..93ebec162c6dd 100644 --- a/drivers/i2c/busses/i2c-amd-asf-plat.c +++ b/drivers/i2c/busses/i2c-amd-asf-plat.c @@ -293,6 +293,7 @@ static irqreturn_t amd_asf_irq_handler(int irq, void *ptr) amd_asf_update_ioport_target(piix4_smba, ASF_SLV_INTR, SMBHSTSTS, true); } + iowrite32(irq, dev->eoi_base); return IRQ_HANDLED; } -- GitLab From 2b1283e1ea9b5e0b06f075f79391a51d9f70749b Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 25 Feb 2025 11:46:36 +0000 Subject: [PATCH 936/989] arm64/mm: Fix Boot panic on Ampere Altra When the range of present physical memory is sufficiently small enough and the reserved address space for the linear map is sufficiently large enough, The linear map base address is randomized in arm64_memblock_init(). Prior to commit 62cffa496aac ("arm64/mm: Override PARange for !LPA2 and use it consistently"), we decided if the sizes were suitable with the help of the raw mmfr0.parange. But the commit changed this to use the sanitized version instead. But the function runs before the register has been sanitized so this returns 0, interpreted as a parange of 32 bits. Some fun wrapping occurs and the logic concludes that there is enough room to randomize the linear map base address, when really there isn't. So the top of the linear map ends up outside the reserved address space. Since the PA range cannot be overridden in the first place, restore the mmfr0 reading logic to its state prior to 62cffa496aac, where the raw register value is used. Reported-by: Luiz Capitulino Suggested-by: Ard Biesheuvel Closes: https://lore.kernel.org/all/a3d9acbe-07c2-43b6-9ba9-a7585f770e83@redhat.com/ Fixes: 62cffa496aac ("arm64/mm: Override PARange for !LPA2 and use it consistently") Signed-off-by: Ryan Roberts Link: https://lore.kernel.org/r/20250225114638.2038006-1-ryan.roberts@arm.com Cc: stable@vger.kernel.org Signed-off-by: Will Deacon --- arch/arm64/mm/init.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 9c0b8d9558fc4..ccdef53872a0b 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -279,12 +279,7 @@ void __init arm64_memblock_init(void) if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { extern u16 memstart_offset_seed; - - /* - * Use the sanitised version of id_aa64mmfr0_el1 so that linear - * map randomization can be enabled by shrinking the IPA space. - */ - u64 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + u64 mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); int parange = cpuid_feature_extract_unsigned_field( mmfr0, ID_AA64MMFR0_EL1_PARANGE_SHIFT); s64 range = linear_region_size - -- GitLab From 4804f3ac2649475509b1836a4d252c04de143249 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 26 Feb 2025 19:30:38 -0500 Subject: [PATCH 937/989] bcachefs: Revert directory i_size This turned out to have several bugs, which were missed because the fsck code wasn't properly reporting errors - whoops. 
Kicking it out for now, hopefully it can make 6.15. Cc: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.h | 5 ----- fs/bcachefs/fs-common.c | 11 ----------- fs/bcachefs/fsck.c | 21 --------------------- fs/bcachefs/sb-downgrade.c | 5 +---- 4 files changed, 1 insertion(+), 41 deletions(-) diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index a633f83c1ac78..362b3b2f2f2e3 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -31,11 +31,6 @@ static inline unsigned dirent_val_u64s(unsigned len) sizeof(u64)); } -static inline unsigned int dirent_occupied_size(const struct qstr *name) -{ - return (BKEY_U64s + dirent_val_u64s(name->len)) * sizeof(u64); -} - int bch2_dirent_read_target(struct btree_trans *, subvol_inum, struct bkey_s_c_dirent, subvol_inum *); diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index d70d9f634cea9..2c3d46ac70c61 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -152,7 +152,6 @@ int bch2_create_trans(struct btree_trans *trans, if (is_subdir_for_nlink(new_inode)) dir_u->bi_nlink++; dir_u->bi_mtime = dir_u->bi_ctime = now; - dir_u->bi_size += dirent_occupied_size(name); ret = bch2_inode_write(trans, &dir_iter, dir_u); if (ret) @@ -221,7 +220,6 @@ int bch2_link_trans(struct btree_trans *trans, } dir_u->bi_mtime = dir_u->bi_ctime = now; - dir_u->bi_size += dirent_occupied_size(name); dir_hash = bch2_hash_info_init(c, dir_u); @@ -324,7 +322,6 @@ int bch2_unlink_trans(struct btree_trans *trans, dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; dir_u->bi_nlink -= is_subdir_for_nlink(inode_u); - dir_u->bi_size -= dirent_occupied_size(name); ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, &dir_hash, &dirent_iter, @@ -463,14 +460,6 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } - if (mode == BCH_RENAME) { - src_dir_u->bi_size -= dirent_occupied_size(src_name); - dst_dir_u->bi_size += dirent_occupied_size(dst_name); - } - - if (mode == BCH_RENAME_OVERWRITE) - src_dir_u->bi_size -= dirent_occupied_size(src_name); - if (src_inode_u->bi_parent_subvol) src_inode_u->bi_parent_subvol = dst_dir.subvol; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9bf316e7b845d..0e85131d0af88 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1978,31 +1978,10 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ return ret; } -static int check_dir_i_size_notnested(struct btree_trans *trans, struct inode_walker *w) -{ - struct bch_fs *c = trans->c; - int ret = 0; - - darray_for_each(w->inodes, i) - if (fsck_err_on(i->inode.bi_size != i->i_size, - trans, inode_dir_wrong_nlink, - "directory %llu:%u with wrong i_size: got %llu, should be %llu", - w->last_pos.inode, i->snapshot, i->inode.bi_size, i->i_size)) { - i->inode.bi_size = i->i_size; - ret = bch2_fsck_write_inode(trans, &i->inode); - if (ret) - break; - } -fsck_err: - bch_err_fn(c, ret); - return ret; -} - static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w) { u32 restart_count = trans->restart_count; return check_subdir_count_notnested(trans, w) ?: - check_dir_i_size_notnested(trans, w) ?: trans_was_restarted(trans, restart_count); } diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 35e07bc8fbd34..051214fdc7352 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -90,10 +90,7 @@ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ BCH_FSCK_ERR_accounting_mismatch, \ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ - 
BCH_FSCK_ERR_accounting_key_junk_at_end) \ - x(directory_size, \ - BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \ - BCH_FSCK_ERR_directory_size_mismatch) \ + BCH_FSCK_ERR_accounting_key_junk_at_end) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ -- GitLab From 7909d1fb90e290ffd7b8570f4e2f97fe2fb381d0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 25 Feb 2025 22:35:28 -0500 Subject: [PATCH 938/989] bcachefs: Check for -BCH_ERR_open_buckets_empty in journal resize This fixes occasional failures from journal resize. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 24c294d4634e0..5dabbf3c0965c 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1194,7 +1194,9 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, closure_sync(&cl); - if (ret && ret != -BCH_ERR_bucket_alloc_blocked) + if (ret && + ret != -BCH_ERR_bucket_alloc_blocked && + ret != -BCH_ERR_open_buckets_empty) break; } -- GitLab From 677bdb7346b6fd806ea45b11cbfe36de0b0cd644 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Wed, 26 Feb 2025 17:33:22 +0800 Subject: [PATCH 939/989] bcachefs: Fix deadlock This fixes two deadlocks: 1.pcpu_alloc_mutex involved one as pointed by syzbot[1] 2.recursion deadlock. The root cause is that we hold the bc lock during alloc_percpu, fix it by following the pattern used by __btree_node_mem_alloc(). [1] https://lore.kernel.org/all/66f97d9a.050a0220.6bad9.001d.GAE@google.com/T/ Reported-by: syzbot+fe63f377148a6371a9db@syzkaller.appspotmail.com Tested-by: syzbot+fe63f377148a6371a9db@syzkaller.appspotmail.com Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 9 +++++---- fs/bcachefs/btree_key_cache.c | 2 +- fs/bcachefs/btree_locking.c | 5 +++-- fs/bcachefs/btree_locking.h | 2 +- fs/bcachefs/six.c | 5 +++-- fs/bcachefs/six.h | 7 ++++--- 6 files changed, 17 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index ca755e8d1a372..1ec1f90e0eb38 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -203,7 +203,7 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) return NULL; } - bch2_btree_lock_init(&b->c, 0); + bch2_btree_lock_init(&b->c, 0, GFP_KERNEL); __bch2_btree_node_to_freelist(bc, b); return b; @@ -795,17 +795,18 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea } b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN); - if (!b) { + if (b) { + bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT); + } else { mutex_unlock(&bc->lock); bch2_trans_unlock(trans); b = __btree_node_mem_alloc(c, GFP_KERNEL); if (!b) goto err; + bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL); mutex_lock(&bc->lock); } - bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0); - BUG_ON(!six_trylock_intent(&b->c.lock)); BUG_ON(!six_trylock_write(&b->c.lock)); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 1821f40c161a1..edce594333756 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -156,7 +156,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k } if (ck) { - bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0); + bch2_btree_lock_init(&ck->c, pcpu_readers ? 
SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL); ck->c.cached = true; goto lock; } diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 10b805a60f526..caef65adeae49 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -7,9 +7,10 @@ static struct lock_class_key bch2_btree_node_lock_key; void bch2_btree_lock_init(struct btree_bkey_cached_common *b, - enum six_lock_init_flags flags) + enum six_lock_init_flags flags, + gfp_t gfp) { - __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags); + __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags, gfp); lockdep_set_notrack_class(&b->lock); } diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index b54ef48eb8cc2..b33ab7af84402 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -13,7 +13,7 @@ #include "btree_iter.h" #include "six.h" -void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags); +void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp); void bch2_trans_unlock_noassert(struct btree_trans *); void bch2_trans_unlock_write(struct btree_trans *); diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 7e7c66a1e1a6b..7c403427fbdb8 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -850,7 +850,8 @@ void six_lock_exit(struct six_lock *lock) EXPORT_SYMBOL_GPL(six_lock_exit); void __six_lock_init(struct six_lock *lock, const char *name, - struct lock_class_key *key, enum six_lock_init_flags flags) + struct lock_class_key *key, enum six_lock_init_flags flags, + gfp_t gfp) { atomic_set(&lock->state, 0); raw_spin_lock_init(&lock->wait_lock); @@ -873,7 +874,7 @@ void __six_lock_init(struct six_lock *lock, const char *name, * failure if they wish by checking lock->readers, but generally * will not want to treat it as an error. */ - lock->readers = alloc_percpu(unsigned); + lock->readers = alloc_percpu_gfp(unsigned, gfp); } #endif } diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h index c142e06b7a3a7..59b851cf8bacc 100644 --- a/fs/bcachefs/six.h +++ b/fs/bcachefs/six.h @@ -164,18 +164,19 @@ enum six_lock_init_flags { }; void __six_lock_init(struct six_lock *lock, const char *name, - struct lock_class_key *key, enum six_lock_init_flags flags); + struct lock_class_key *key, enum six_lock_init_flags flags, + gfp_t gfp); /** * six_lock_init - initialize a six lock * @lock: lock to initialize * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU */ -#define six_lock_init(lock, flags) \ +#define six_lock_init(lock, flags, gfp) \ do { \ static struct lock_class_key __key; \ \ - __six_lock_init((lock), #lock, &__key, flags); \ + __six_lock_init((lock), #lock, &__key, flags, gfp); \ } while (0) /** -- GitLab From eb54d2695b57426638fed0ec066ae17a18c4426c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 26 Feb 2025 10:57:26 -0500 Subject: [PATCH 940/989] bcachefs: Fix truncate sometimes failing and returning 1 __bch_truncate_folio() may return 1 to indicate dirtyness of the folio being truncated, needed for fpunch to get the i_size writes correct. But truncate was forgetting to clear ret, and sometimes returning it as an error. 
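In rough form (an illustrative sketch only, not bcachefs code; truncate_folio_helper() is a made-up name), the pattern being fixed is a syscall path forwarding a positive informational return instead of squashing it to 0:

	int ret = truncate_folio_helper(inode, new_size); /* <0 on error, 1 if the folio was dirty */

	if (ret < 0)
		goto err;
	ret = 0;	/* the "1" is status, not an error; don't let it reach user space */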
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 94bf34b9b65f0..717e7b94c66f8 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -466,6 +466,7 @@ int bchfs_truncate(struct mnt_idmap *idmap, ret = bch2_truncate_folio(inode, iattr->ia_size); if (unlikely(ret < 0)) goto err; + ret = 0; truncate_setsize(&inode->v, iattr->ia_size); -- GitLab From 01c9c123db76357d4373b2e97b760a856d6fe822 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 25 Feb 2025 13:10:23 -0800 Subject: [PATCH 941/989] net: Use rtnl_net_dev_lock() in register_netdevice_notifier_dev_net(). Breno Leitao reported the splat below. [0] Commit 65161fb544aa ("net: Fix dev_net(dev) race in unregister_netdevice_notifier_dev_net().") added the DEBUG_NET_WARN_ON_ONCE(), assuming that the netdev is not registered before register_netdevice_notifier_dev_net(). But the assumption was simply wrong. Let's use rtnl_net_dev_lock() in register_netdevice_notifier_dev_net(). [0]: WARNING: CPU: 25 PID: 849 at net/core/dev.c:2150 register_netdevice_notifier_dev_net (net/core/dev.c:2150) ? __warn (kernel/panic.c:242 kernel/panic.c:748) ? register_netdevice_notifier_dev_net (net/core/dev.c:2150) ? register_netdevice_notifier_dev_net (net/core/dev.c:2150) ? report_bug (lib/bug.c:? lib/bug.c:219) ? handle_bug (arch/x86/kernel/traps.c:285) ? exc_invalid_op (arch/x86/kernel/traps.c:309) ? asm_exc_invalid_op (./arch/x86/include/asm/idtentry.h:621) ? register_netdevice_notifier_dev_net (net/core/dev.c:2150) ? register_netdevice_notifier_dev_net (./include/net/net_namespace.h:406 ./include/linux/netdevice.h:2663 net/core/dev.c:2144) mlx5e_mdev_notifier_event+0x9f/0xf0 mlx5_ib notifier_call_chain.llvm.12241336988804114627 (kernel/notifier.c:85) blocking_notifier_call_chain (kernel/notifier.c:380) mlx5_core_uplink_netdev_event_replay (drivers/net/ethernet/mellanox/mlx5/core/main.c:352) mlx5_ib_roce_init.llvm.12447516292400117075+0x1c6/0x550 mlx5_ib mlx5r_probe+0x375/0x6a0 mlx5_ib ? kernfs_put (./include/linux/instrumented.h:96 ./include/linux/atomic/atomic-arch-fallback.h:2278 ./include/linux/atomic/atomic-instrumented.h:1384 fs/kernfs/dir.c:557) ? auxiliary_match_id (drivers/base/auxiliary.c:174) ? mlx5r_mp_remove+0x160/0x160 mlx5_ib really_probe (drivers/base/dd.c:? drivers/base/dd.c:658) driver_probe_device (drivers/base/dd.c:830) __driver_attach (drivers/base/dd.c:1217) bus_for_each_dev (drivers/base/bus.c:369) ? driver_attach (drivers/base/dd.c:1157) bus_add_driver (drivers/base/bus.c:679) driver_register (drivers/base/driver.c:249) Fixes: 7fb1073300a2 ("net: Hold rtnl_net_lock() in (un)?register_netdevice_notifier_dev_net().") Reported-by: Breno Leitao Closes: https://lore.kernel.org/netdev/20250224-noisy-cordial-roadrunner-fad40c@leitao/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Tested-by: Breno Leitao Link: https://patch.msgid.link/20250225211023.96448-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 1b252e9459fdb..70c01bd1799e5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2141,21 +2141,15 @@ int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) { - struct net *net = dev_net(dev); int err; - /* rtnl_net_lock() assumes dev is not yet published by - * register_netdevice(). 
- */ - DEBUG_NET_WARN_ON_ONCE(!list_empty(&dev->dev_list)); - - rtnl_net_lock(net); - err = __register_netdevice_notifier_net(net, nb, false); + rtnl_net_dev_lock(dev); + err = __register_netdevice_notifier_net(dev_net(dev), nb, false); if (!err) { nn->nb = nb; list_add(&nn->list, &dev->net_notifier_list); } - rtnl_net_unlock(net); + rtnl_net_dev_unlock(dev); return err; } -- GitLab From de70981f295e7eab86325db3bf349fa676f16c42 Mon Sep 17 00:00:00 2001 From: Harshitha Ramamurthy Date: Wed, 26 Feb 2025 00:35:26 +0000 Subject: [PATCH 942/989] gve: unlink old napi when stopping a queue using queue API When a queue is stopped using the ndo queue API, before destroying its page pool, the associated NAPI instance needs to be unlinked to avoid warnings. Handle this by calling page_pool_disable_direct_recycling() when stopping a queue. Cc: stable@vger.kernel.org Fixes: ebdfae0d377b ("gve: adopt page pool for DQ RDA mode") Reviewed-by: Praveen Kaligineedi Signed-off-by: Harshitha Ramamurthy Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250226003526.1546854-1-hramamurthy@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_rx_dqo.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/google/gve/gve_rx_dqo.c b/drivers/net/ethernet/google/gve/gve_rx_dqo.c index 8ac0047f1ada1..f0674a4435670 100644 --- a/drivers/net/ethernet/google/gve/gve_rx_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_rx_dqo.c @@ -109,10 +109,12 @@ static void gve_rx_reset_ring_dqo(struct gve_priv *priv, int idx) void gve_rx_stop_ring_dqo(struct gve_priv *priv, int idx) { int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx); + struct gve_rx_ring *rx = &priv->rx[idx]; if (!gve_rx_was_added_to_block(priv, idx)) return; + page_pool_disable_direct_recycling(rx->dqo.page_pool); gve_remove_napi(priv, ntfy_idx); gve_rx_remove_from_block(priv, idx); gve_rx_reset_ring_dqo(priv, idx); -- GitLab From 49806fe6e61b045b5be8610e08b5a3083c109aa0 Mon Sep 17 00:00:00 2001 From: Mohammad Heib Date: Tue, 25 Feb 2025 13:28:52 +0200 Subject: [PATCH 943/989] net: Clear old fragment checksum value in napi_reuse_skb In certain cases, napi_get_frags() returns an skb that points to an old received fragment, This skb may have its skb->ip_summed, csum, and other fields set from previous fragment handling. Some network drivers set skb->ip_summed to either CHECKSUM_COMPLETE or CHECKSUM_UNNECESSARY when getting skb from napi_get_frags(), while others only set skb->ip_summed when RX checksum offload is enabled on the device, and do not set any value for skb->ip_summed when hardware checksum offload is disabled, assuming that the skb->ip_summed initiated to zero by napi_reuse_skb, ionic driver for example will ignore/unset any value for the ip_summed filed if HW checksum offload is disabled, and if we have a situation where the user disables the checksum offload during a traffic that could lead to the following errors shown in the kernel logs: dump_stack_lvl+0x34/0x48 __skb_gro_checksum_complete+0x7e/0x90 tcp6_gro_receive+0xc6/0x190 ipv6_gro_receive+0x1ec/0x430 dev_gro_receive+0x188/0x360 ? ionic_rx_clean+0x25a/0x460 [ionic] napi_gro_frags+0x13c/0x300 ? 
__pfx_ionic_rx_service+0x10/0x10 [ionic] ionic_rx_service+0x67/0x80 [ionic] ionic_cq_service+0x58/0x90 [ionic] ionic_txrx_napi+0x64/0x1b0 [ionic] __napi_poll+0x27/0x170 net_rx_action+0x29c/0x370 handle_softirqs+0xce/0x270 __irq_exit_rcu+0xa3/0xc0 common_interrupt+0x80/0xa0 This inconsistency sometimes leads to checksum validation issues in the upper layers of the network stack. To resolve this, this patch clears the skb->ip_summed value for each reused skb in by napi_reuse_skb(), ensuring that the caller is responsible for setting the correct checksum status. This eliminates potential checksum validation issues caused by improper handling of skb->ip_summed. Fixes: 76620aafd66f ("gro: New frags interface to avoid copying shinfo") Signed-off-by: Mohammad Heib Reviewed-by: Shannon Nelson Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250225112852.2507709-1-mheib@redhat.com Signed-off-by: Jakub Kicinski --- net/core/gro.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/gro.c b/net/core/gro.c index 78b320b631744..0ad549b07e039 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -653,6 +653,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->pkt_type = PACKET_HOST; skb->encapsulation = 0; + skb->ip_summed = CHECKSUM_NONE; skb_shinfo(skb)->gso_type = 0; skb_shinfo(skb)->gso_size = 0; if (unlikely(skb->slow_gro)) { -- GitLab From 77e45145e3039a0fb212556ab3f8c87f54771757 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 23 Feb 2025 23:17:08 +0100 Subject: [PATCH 944/989] net: Handle napi_schedule() calls from non-interrupt napi_schedule() is expected to be called either: * From an interrupt, where raised softirqs are handled on IRQ exit * From a softirq disabled section, where raised softirqs are handled on the next call to local_bh_enable(). * From a softirq handler, where raised softirqs are handled on the next round in do_softirq(), or further deferred to a dedicated kthread. Other bare tasks context may end up ignoring the raised NET_RX vector until the next random softirq handling opportunity, which may not happen before a while if the CPU goes idle afterwards with the tick stopped. Such "misuses" have been detected on several places thanks to messages of the kind: "NOHZ tick-stop error: local softirq work is pending, handler #08!!!" 
For example: __raise_softirq_irqoff __napi_schedule rtl8152_runtime_resume.isra.0 rtl8152_resume usb_resume_interface.isra.0 usb_resume_both __rpm_callback rpm_callback rpm_resume __pm_runtime_resume usb_autoresume_device usb_remote_wakeup hub_event process_one_work worker_thread kthread ret_from_fork ret_from_fork_asm And also: * drivers/net/usb/r8152.c::rtl_work_func_t * drivers/net/netdevsim/netdev.c::nsim_start_xmit There is a long history of issues of this kind: 019edd01d174 ("ath10k: sdio: Add missing BH locking around napi_schdule()") 330068589389 ("idpf: disable local BH when scheduling napi for marker packets") e3d5d70cb483 ("net: lan78xx: fix "softirq work is pending" error") e55c27ed9ccf ("mt76: mt7615: add missing bh-disable around rx napi schedule") c0182aa98570 ("mt76: mt7915: add missing bh-disable around tx napi enable/schedule") 970be1dff26d ("mt76: disable BH around napi_schedule() calls") 019edd01d174 ("ath10k: sdio: Add missing BH locking around napi_schdule()") 30bfec4fec59 ("can: rx-offload: can_rx_offload_threaded_irq_finish(): add new function to be called from threaded interrupt") e63052a5dd3c ("mlx5e: add add missing BH locking around napi_schdule()") 83a0c6e58901 ("i40e: Invoke softirqs after napi_reschedule") bd4ce941c8d5 ("mlx4: Invoke softirqs after napi_reschedule") 8cf699ec849f ("mlx4: do not call napi_schedule() without care") ec13ee80145c ("virtio_net: invoke softirqs after __napi_schedule") This shows that relying on the caller to arrange a proper context for the softirqs to be handled while calling napi_schedule() is very fragile and error prone. Also fixing them can also prove challenging if the caller may be called from different kinds of contexts. Therefore fix this from napi_schedule() itself with waking up ksoftirqd when softirqs are raised from task contexts. Reported-by: Paul Menzel Reported-by: Jakub Kicinski Reported-by: Francois Romieu Closes: https://lore.kernel.org/lkml/354a2690-9bbf-4ccb-8769-fa94707a9340@molgen.mpg.de/ Cc: Breno Leitao Signed-off-by: Frederic Weisbecker Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250223221708.27130-1-frederic@kernel.org Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 70c01bd1799e5..30da277c5a6f8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4757,7 +4757,7 @@ static inline void ____napi_schedule(struct softnet_data *sd, * we have to raise NET_RX_SOFTIRQ. */ if (!sd->in_net_rx_action) - __raise_softirq_irqoff(NET_RX_SOFTIRQ); + raise_softirq_irqoff(NET_RX_SOFTIRQ); } #ifdef CONFIG_RPS -- GitLab From bc23d4e30866011700787bab8563de45d5bf8431 Mon Sep 17 00:00:00 2001 From: Adrian Huang Date: Tue, 25 Feb 2025 10:14:57 +0800 Subject: [PATCH 945/989] af_unix: Fix memory leak in unix_dgram_sendmsg() After running the 'sendmsg02' program of Linux Test Project (LTP), kmemleak reports the following memory leak: # cat /sys/kernel/debug/kmemleak unreferenced object 0xffff888243866800 (size 2048): comm "sendmsg02", pid 67, jiffies 4294903166 hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 5e 00 00 00 00 00 00 00 ........^....... 01 00 07 40 00 00 00 00 00 00 00 00 00 00 00 00 ...@............ 
backtrace (crc 7e96a3f2): kmemleak_alloc+0x56/0x90 kmem_cache_alloc_noprof+0x209/0x450 sk_prot_alloc.constprop.0+0x60/0x160 sk_alloc+0x32/0xc0 unix_create1+0x67/0x2b0 unix_create+0x47/0xa0 __sock_create+0x12e/0x200 __sys_socket+0x6d/0x100 __x64_sys_socket+0x1b/0x30 x64_sys_call+0x7e1/0x2140 do_syscall_64+0x54/0x110 entry_SYSCALL_64_after_hwframe+0x76/0x7e Commit 689c398885cc ("af_unix: Defer sock_put() to clean up path in unix_dgram_sendmsg().") defers sock_put() in the error handling path. However, it fails to account for the condition 'msg->msg_namelen != 0', resulting in a memory leak when the code jumps to the 'lookup' label. Fix issue by calling sock_put() if 'msg->msg_namelen != 0' is met. Fixes: 689c398885cc ("af_unix: Defer sock_put() to clean up path in unix_dgram_sendmsg().") Signed-off-by: Adrian Huang Acked-by: Joe Damato Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250225021457.1824-1-ahuang12@lenovo.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 34945de1fb1fa..f0e613d976640 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2102,6 +2102,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, goto out_sock_put; } + sock_put(other); goto lookup; } -- GitLab From 2d253726ff7106b39a44483b6864398bba8a2f74 Mon Sep 17 00:00:00 2001 From: Harshal Chaudhari Date: Mon, 24 Feb 2025 20:20:58 -0800 Subject: [PATCH 946/989] net: mvpp2: cls: Fixed Non IP flow, with vlan tag flow defination. Non IP flow, with vlan tag not working as expected while running below command for vlan-priority. fixed that. ethtool -N eth1 flow-type ether vlan 0x8000 vlan-mask 0x1fff action 0 loc 0 Fixes: 1274daede3ef ("net: mvpp2: cls: Add steering based on vlan Id and priority.") Signed-off-by: Harshal Chaudhari Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/20250225042058.2643838-1-hchaudhari@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c index 1641791a2d5b4..8ed83fb988624 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c @@ -324,7 +324,7 @@ static const struct mvpp2_cls_flow cls_flows[MVPP2_N_PRS_FLOWS] = { MVPP2_PRS_RI_VLAN_MASK), /* Non IP flow, with vlan tag */ MVPP2_DEF_FLOW(MVPP22_FLOW_ETHERNET, MVPP2_FL_NON_IP_TAG, - MVPP22_CLS_HEK_OPT_VLAN, + MVPP22_CLS_HEK_TAGGED, 0, 0), }; -- GitLab From 7f3528f7d2f98b70e19a6bb7b130fc82c079ac54 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Tue, 25 Feb 2025 09:26:06 +0200 Subject: [PATCH 947/989] net/mlx5: Fix vport QoS cleanup on error When enabling vport QoS fails, the scheduling node was never freed, causing a leak. Add the missing free and reset the vport scheduling node pointer to NULL. 
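In generic form (an illustrative sketch, not the driver code), the error path now unwinds this function's own allocation and clears the cached pointer so nothing else sees a stale, half-initialized node:

	node = alloc_sched_node();		/* made-up helpers for illustration */
	if (!node)
		return -ENOMEM;
	vport->qos_node = node;

	err = enable_sched_node(node);
	if (err) {
		free_sched_node(node);
		vport->qos_node = NULL;	/* avoid a dangling pointer on the error path */
	}
	return err;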
Fixes: be034baba83e ("net/mlx5: Make vport QoS enablement more flexible for future extensions") Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250225072608.526866-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 8b7c843446e11..07a28073a49ea 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -591,8 +591,11 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t sched_node->vport = vport; vport->qos.sched_node = sched_node; err = esw_qos_vport_enable(vport, parent, extack); - if (err) + if (err) { + __esw_qos_free_node(sched_node); esw_qos_put(esw); + vport->qos.sched_node = NULL; + } return err; } -- GitLab From 47bcd9bf3d231bfd4698d7d3013597490fd5e2d6 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Tue, 25 Feb 2025 09:26:07 +0200 Subject: [PATCH 948/989] net/mlx5: Restore missing trace event when enabling vport QoS Restore the `trace_mlx5_esw_vport_qos_create` event when creating the vport scheduling element. This trace event was lost during refactoring. Fixes: be034baba83e ("net/mlx5: Make vport QoS enablement more flexible for future extensions") Signed-off-by: Carolina Jubran Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250225072608.526866-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 07a28073a49ea..823c1ba456cd1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -564,6 +564,9 @@ static int esw_qos_vport_enable(struct mlx5_vport *vport, struct mlx5_esw_sched_ return err; esw_qos_normalize_min_rate(parent->esw, parent, extack); + trace_mlx5_esw_vport_qos_create(vport->dev, vport, + vport->qos.sched_node->max_rate, + vport->qos.sched_node->bw_share); return 0; } -- GitLab From 2f5a6014eb168a97b24153adccfa663d3b282767 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 25 Feb 2025 09:26:08 +0200 Subject: [PATCH 949/989] net/mlx5: IRQ, Fix null string in debug print irq_pool_alloc() debug print can print a null string. Fix it by providing a default string to print. 
Fixes: 71e084e26414 ("net/mlx5: Allocating a pool of MSI-X vectors for SFs") Signed-off-by: Shay Drory Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501141055.SwfIphN0-lkp@intel.com/ Reviewed-by: Moshe Shemesh Signed-off-by: Tariq Toukan Reviewed-by: Kalesh AP Link: https://patch.msgid.link/20250225072608.526866-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index 7db9cab9bedf6..d9362eabc6a1c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -572,7 +572,7 @@ irq_pool_alloc(struct mlx5_core_dev *dev, int start, int size, char *name, pool->min_threshold = min_threshold * MLX5_EQ_REFS_PER_IRQ; pool->max_threshold = max_threshold * MLX5_EQ_REFS_PER_IRQ; mlx5_core_dbg(dev, "pool->name = %s, pool->size = %d, pool->start = %d", - name, size, start); + name ? name : "mlx5_pcif_pool", size, start); return pool; } -- GitLab From e521f516716de7895acd1b5b7fac788214a390b9 Mon Sep 17 00:00:00 2001 From: Caleb Connolly Date: Sat, 8 Feb 2025 22:30:54 +0000 Subject: [PATCH 950/989] dmaengine: Revert "dmaengine: qcom: bam_dma: Avoid writing unavailable register" This commit causes a hard crash on sdm845 and likely other platforms. Revert it until a proper fix is found. This reverts commit 57a7138d0627: ("dmaengine: qcom: bam_dma: Avoid writing unavailable register") Signed-off-by: Caleb Connolly Fixes: 57a7138d0627 ("dmaengine: qcom: bam_dma: Avoid writing unavailable register") Tested-by: Neil Armstrong # on sdm845-DB845c Tested-by: David Heidelberg Link: https://lore.kernel.org/r/20250208223112.142567-1-caleb.connolly@linaro.org Signed-off-by: Vinod Koul --- drivers/dma/qcom/bam_dma.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c index c14557efd5770..bbc3276992bb0 100644 --- a/drivers/dma/qcom/bam_dma.c +++ b/drivers/dma/qcom/bam_dma.c @@ -59,9 +59,6 @@ struct bam_desc_hw { #define DESC_FLAG_NWD BIT(12) #define DESC_FLAG_CMD BIT(11) -#define BAM_NDP_REVISION_START 0x20 -#define BAM_NDP_REVISION_END 0x27 - struct bam_async_desc { struct virt_dma_desc vd; @@ -401,7 +398,6 @@ struct bam_device { /* dma start transaction tasklet */ struct tasklet_struct task; - u32 bam_revision; }; /** @@ -445,10 +441,8 @@ static void bam_reset(struct bam_device *bdev) writel_relaxed(val, bam_addr(bdev, 0, BAM_CTRL)); /* set descriptor threshold, start with 4 bytes */ - if (in_range(bdev->bam_revision, BAM_NDP_REVISION_START, - BAM_NDP_REVISION_END)) - writel_relaxed(DEFAULT_CNT_THRSHLD, - bam_addr(bdev, 0, BAM_DESC_CNT_TRSHLD)); + writel_relaxed(DEFAULT_CNT_THRSHLD, + bam_addr(bdev, 0, BAM_DESC_CNT_TRSHLD)); /* Enable default set of h/w workarounds, ie all except BAM_FULL_PIPE */ writel_relaxed(BAM_CNFG_BITS_DEFAULT, bam_addr(bdev, 0, BAM_CNFG_BITS)); @@ -1006,10 +1000,9 @@ static void bam_apply_new_config(struct bam_chan *bchan, maxburst = bchan->slave.src_maxburst; else maxburst = bchan->slave.dst_maxburst; - if (in_range(bdev->bam_revision, BAM_NDP_REVISION_START, - BAM_NDP_REVISION_END)) - writel_relaxed(maxburst, - bam_addr(bdev, 0, BAM_DESC_CNT_TRSHLD)); + + writel_relaxed(maxburst, + bam_addr(bdev, 0, BAM_DESC_CNT_TRSHLD)); } bchan->reconfigure = 0; @@ -1199,11 +1192,10 @@ static int 
bam_init(struct bam_device *bdev) u32 val; /* read revision and configuration information */ - val = readl_relaxed(bam_addr(bdev, 0, BAM_REVISION)); - if (!bdev->num_ees) + if (!bdev->num_ees) { + val = readl_relaxed(bam_addr(bdev, 0, BAM_REVISION)); bdev->num_ees = (val >> NUM_EES_SHIFT) & NUM_EES_MASK; - - bdev->bam_revision = val & REVISION_MASK; + } /* check that configured EE is within range */ if (bdev->ee >= bdev->num_ees) -- GitLab From 3603996432997f7c88da37a97062a46cda01ac9d Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Wed, 11 Dec 2024 10:06:28 +0100 Subject: [PATCH 951/989] drm/fbdev-dma: Add shadow buffering for deferred I/O MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DMA areas are not necessarily backed by struct page, so we cannot rely on it for deferred I/O. Allocate a shadow buffer for drivers that require deferred I/O and use it as framebuffer memory. Fixes driver errors about being "Unable to handle kernel NULL pointer dereference at virtual address" or "Unable to handle kernel paging request at virtual address". The patch splits drm_fbdev_dma_driver_fbdev_probe() in an initial allocation, which creates the DMA-backed buffer object, and a tail that sets up the fbdev data structures. There is a tail function for direct memory mappings and a tail function for deferred I/O with the shadow buffer. It is no longer possible to use deferred I/O without shadow buffer. It can be re-added if there exists a reliably test for usable struct page in the allocated DMA-backed buffer object. Signed-off-by: Thomas Zimmermann Reported-by: Nuno Gonçalves CLoses: https://lore.kernel.org/dri-devel/CAEXMXLR55DziAMbv_+2hmLeH-jP96pmit6nhs6siB22cpQFr9w@mail.gmail.com/ Tested-by: Nuno Gonçalves Fixes: 5ab91447aa13 ("drm/tiny/ili9225: Use fbdev-dma") Cc: Thomas Zimmermann Cc: # v6.11+ Reviewed-by: Simona Vetter Reviewed-by: Javier Martinez Canillas Link: https://patchwork.freedesktop.org/patch/msgid/20241211090643.74250-1-tzimmermann@suse.de --- drivers/gpu/drm/drm_fbdev_dma.c | 217 +++++++++++++++++++++++--------- 1 file changed, 155 insertions(+), 62 deletions(-) diff --git a/drivers/gpu/drm/drm_fbdev_dma.c b/drivers/gpu/drm/drm_fbdev_dma.c index b14b581c059d3..02a516e771927 100644 --- a/drivers/gpu/drm/drm_fbdev_dma.c +++ b/drivers/gpu/drm/drm_fbdev_dma.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: MIT #include +#include #include #include @@ -70,37 +71,102 @@ static const struct fb_ops drm_fbdev_dma_fb_ops = { .fb_destroy = drm_fbdev_dma_fb_destroy, }; -FB_GEN_DEFAULT_DEFERRED_DMAMEM_OPS(drm_fbdev_dma, +FB_GEN_DEFAULT_DEFERRED_DMAMEM_OPS(drm_fbdev_dma_shadowed, drm_fb_helper_damage_range, drm_fb_helper_damage_area); -static int drm_fbdev_dma_deferred_fb_mmap(struct fb_info *info, struct vm_area_struct *vma) +static void drm_fbdev_dma_shadowed_fb_destroy(struct fb_info *info) { struct drm_fb_helper *fb_helper = info->par; - struct drm_framebuffer *fb = fb_helper->fb; - struct drm_gem_dma_object *dma = drm_fb_dma_get_gem_obj(fb, 0); + void *shadow = info->screen_buffer; + + if (!fb_helper->dev) + return; - if (!dma->map_noncoherent) - vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + if (info->fbdefio) + fb_deferred_io_cleanup(info); + drm_fb_helper_fini(fb_helper); + vfree(shadow); - return fb_deferred_io_mmap(info, vma); + drm_client_buffer_vunmap(fb_helper->buffer); + drm_client_framebuffer_delete(fb_helper->buffer); + drm_client_release(&fb_helper->client); + drm_fb_helper_unprepare(fb_helper); + kfree(fb_helper); } -static 
const struct fb_ops drm_fbdev_dma_deferred_fb_ops = { +static const struct fb_ops drm_fbdev_dma_shadowed_fb_ops = { .owner = THIS_MODULE, .fb_open = drm_fbdev_dma_fb_open, .fb_release = drm_fbdev_dma_fb_release, - __FB_DEFAULT_DEFERRED_OPS_RDWR(drm_fbdev_dma), + FB_DEFAULT_DEFERRED_OPS(drm_fbdev_dma_shadowed), DRM_FB_HELPER_DEFAULT_OPS, - __FB_DEFAULT_DEFERRED_OPS_DRAW(drm_fbdev_dma), - .fb_mmap = drm_fbdev_dma_deferred_fb_mmap, - .fb_destroy = drm_fbdev_dma_fb_destroy, + .fb_destroy = drm_fbdev_dma_shadowed_fb_destroy, }; /* * struct drm_fb_helper */ +static void drm_fbdev_dma_damage_blit_real(struct drm_fb_helper *fb_helper, + struct drm_clip_rect *clip, + struct iosys_map *dst) +{ + struct drm_framebuffer *fb = fb_helper->fb; + size_t offset = clip->y1 * fb->pitches[0]; + size_t len = clip->x2 - clip->x1; + unsigned int y; + void *src; + + switch (drm_format_info_bpp(fb->format, 0)) { + case 1: + offset += clip->x1 / 8; + len = DIV_ROUND_UP(len + clip->x1 % 8, 8); + break; + case 2: + offset += clip->x1 / 4; + len = DIV_ROUND_UP(len + clip->x1 % 4, 4); + break; + case 4: + offset += clip->x1 / 2; + len = DIV_ROUND_UP(len + clip->x1 % 2, 2); + break; + default: + offset += clip->x1 * fb->format->cpp[0]; + len *= fb->format->cpp[0]; + break; + } + + src = fb_helper->info->screen_buffer + offset; + iosys_map_incr(dst, offset); /* go to first pixel within clip rect */ + + for (y = clip->y1; y < clip->y2; y++) { + iosys_map_memcpy_to(dst, 0, src, len); + iosys_map_incr(dst, fb->pitches[0]); + src += fb->pitches[0]; + } +} + +static int drm_fbdev_dma_damage_blit(struct drm_fb_helper *fb_helper, + struct drm_clip_rect *clip) +{ + struct drm_client_buffer *buffer = fb_helper->buffer; + struct iosys_map dst; + + /* + * For fbdev emulation, we only have to protect against fbdev modeset + * operations. Nothing else will involve the client buffer's BO. So it + * is sufficient to acquire struct drm_fb_helper.lock here. 
+ */ + mutex_lock(&fb_helper->lock); + + dst = buffer->map; + drm_fbdev_dma_damage_blit_real(fb_helper, clip, &dst); + + mutex_unlock(&fb_helper->lock); + + return 0; +} static int drm_fbdev_dma_helper_fb_dirty(struct drm_fb_helper *helper, struct drm_clip_rect *clip) { @@ -112,6 +178,10 @@ static int drm_fbdev_dma_helper_fb_dirty(struct drm_fb_helper *helper, return 0; if (helper->fb->funcs->dirty) { + ret = drm_fbdev_dma_damage_blit(helper, clip); + if (drm_WARN_ONCE(dev, ret, "Damage blitter failed: ret=%d\n", ret)) + return ret; + ret = helper->fb->funcs->dirty(helper->fb, NULL, 0, 0, clip, 1); if (drm_WARN_ONCE(dev, ret, "Dirty helper failed: ret=%d\n", ret)) return ret; @@ -128,14 +198,80 @@ static const struct drm_fb_helper_funcs drm_fbdev_dma_helper_funcs = { * struct drm_fb_helper */ +static int drm_fbdev_dma_driver_fbdev_probe_tail(struct drm_fb_helper *fb_helper, + struct drm_fb_helper_surface_size *sizes) +{ + struct drm_device *dev = fb_helper->dev; + struct drm_client_buffer *buffer = fb_helper->buffer; + struct drm_gem_dma_object *dma_obj = to_drm_gem_dma_obj(buffer->gem); + struct drm_framebuffer *fb = fb_helper->fb; + struct fb_info *info = fb_helper->info; + struct iosys_map map = buffer->map; + + info->fbops = &drm_fbdev_dma_fb_ops; + + /* screen */ + info->flags |= FBINFO_VIRTFB; /* system memory */ + if (dma_obj->map_noncoherent) + info->flags |= FBINFO_READS_FAST; /* signal caching */ + info->screen_size = sizes->surface_height * fb->pitches[0]; + info->screen_buffer = map.vaddr; + if (!(info->flags & FBINFO_HIDE_SMEM_START)) { + if (!drm_WARN_ON(dev, is_vmalloc_addr(info->screen_buffer))) + info->fix.smem_start = page_to_phys(virt_to_page(info->screen_buffer)); + } + info->fix.smem_len = info->screen_size; + + return 0; +} + +static int drm_fbdev_dma_driver_fbdev_probe_tail_shadowed(struct drm_fb_helper *fb_helper, + struct drm_fb_helper_surface_size *sizes) +{ + struct drm_client_buffer *buffer = fb_helper->buffer; + struct fb_info *info = fb_helper->info; + size_t screen_size = buffer->gem->size; + void *screen_buffer; + int ret; + + /* + * Deferred I/O requires struct page for framebuffer memory, + * which is not guaranteed for all DMA ranges. We thus create + * a shadow buffer in system memory. 
+ */ + screen_buffer = vzalloc(screen_size); + if (!screen_buffer) + return -ENOMEM; + + info->fbops = &drm_fbdev_dma_shadowed_fb_ops; + + /* screen */ + info->flags |= FBINFO_VIRTFB; /* system memory */ + info->flags |= FBINFO_READS_FAST; /* signal caching */ + info->screen_buffer = screen_buffer; + info->fix.smem_len = screen_size; + + fb_helper->fbdefio.delay = HZ / 20; + fb_helper->fbdefio.deferred_io = drm_fb_helper_deferred_io; + + info->fbdefio = &fb_helper->fbdefio; + ret = fb_deferred_io_init(info); + if (ret) + goto err_vfree; + + return 0; + +err_vfree: + vfree(screen_buffer); + return ret; +} + int drm_fbdev_dma_driver_fbdev_probe(struct drm_fb_helper *fb_helper, struct drm_fb_helper_surface_size *sizes) { struct drm_client_dev *client = &fb_helper->client; struct drm_device *dev = fb_helper->dev; - bool use_deferred_io = false; struct drm_client_buffer *buffer; - struct drm_gem_dma_object *dma_obj; struct drm_framebuffer *fb; struct fb_info *info; u32 format; @@ -152,19 +288,9 @@ int drm_fbdev_dma_driver_fbdev_probe(struct drm_fb_helper *fb_helper, sizes->surface_height, format); if (IS_ERR(buffer)) return PTR_ERR(buffer); - dma_obj = to_drm_gem_dma_obj(buffer->gem); fb = buffer->fb; - /* - * Deferred I/O requires struct page for framebuffer memory, - * which is not guaranteed for all DMA ranges. We thus only - * install deferred I/O if we have a framebuffer that requires - * it. - */ - if (fb->funcs->dirty) - use_deferred_io = true; - ret = drm_client_buffer_vmap(buffer, &map); if (ret) { goto err_drm_client_buffer_delete; @@ -185,45 +311,12 @@ int drm_fbdev_dma_driver_fbdev_probe(struct drm_fb_helper *fb_helper, drm_fb_helper_fill_info(info, fb_helper, sizes); - if (use_deferred_io) - info->fbops = &drm_fbdev_dma_deferred_fb_ops; + if (fb->funcs->dirty) + ret = drm_fbdev_dma_driver_fbdev_probe_tail_shadowed(fb_helper, sizes); else - info->fbops = &drm_fbdev_dma_fb_ops; - - /* screen */ - info->flags |= FBINFO_VIRTFB; /* system memory */ - if (dma_obj->map_noncoherent) - info->flags |= FBINFO_READS_FAST; /* signal caching */ - info->screen_size = sizes->surface_height * fb->pitches[0]; - info->screen_buffer = map.vaddr; - if (!(info->flags & FBINFO_HIDE_SMEM_START)) { - if (!drm_WARN_ON(dev, is_vmalloc_addr(info->screen_buffer))) - info->fix.smem_start = page_to_phys(virt_to_page(info->screen_buffer)); - } - info->fix.smem_len = info->screen_size; - - /* - * Only set up deferred I/O if the screen buffer supports - * it. If this disagrees with the previous test for ->dirty, - * mmap on the /dev/fb file might not work correctly. 
- */ - if (!is_vmalloc_addr(info->screen_buffer) && info->fix.smem_start) { - unsigned long pfn = info->fix.smem_start >> PAGE_SHIFT; - - if (drm_WARN_ON(dev, !pfn_to_page(pfn))) - use_deferred_io = false; - } - - /* deferred I/O */ - if (use_deferred_io) { - fb_helper->fbdefio.delay = HZ / 20; - fb_helper->fbdefio.deferred_io = drm_fb_helper_deferred_io; - - info->fbdefio = &fb_helper->fbdefio; - ret = fb_deferred_io_init(info); - if (ret) - goto err_drm_fb_helper_release_info; - } + ret = drm_fbdev_dma_driver_fbdev_probe_tail(fb_helper, sizes); + if (ret) + goto err_drm_fb_helper_release_info; return 0; -- GitLab From 6d48ad04075729519f6baaa1dc9e5a3a39d05f53 Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Wed, 26 Feb 2025 21:28:41 +0800 Subject: [PATCH 952/989] MIPS: Ignore relocs against __ex_table for relocatable kernel Since commit 6f2c2f93a190 ("scripts/sorttable: Remove unneeded Elf_Rel"), sorttable no longer clears relocs against __ex_table, claiming "it was never used." But in fact MIPS relocatable kernel had been implicitly depending on this behavior, so after this commit the MIPS relocatable kernel has started to spit oops like: CPU 1 Unable to handle kernel paging request at virtual address 000000fffbbdbff8, epc == ffffffff818f9a6c, ra == ffffffff813ad7d0 ... ... Call Trace: [] __raw_copy_from_user+0x48/0x2fc [] cp_statx+0x1a0/0x1e0 [] do_statx_fd+0xa8/0x118 [] sys_statx+0xd8/0xf8 [] syscall_common+0x34/0x58 So ignore those relocs on our own to fix the issue. Fixes: 6f2c2f93a190 ("scripts/sorttable: Remove unneeded Elf_Rel") Signed-off-by: Xi Ruoyao Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/tools/relocs.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/mips/boot/tools/relocs.c b/arch/mips/boot/tools/relocs.c index a88d66c46d7f7..9863e1d5c62e3 100644 --- a/arch/mips/boot/tools/relocs.c +++ b/arch/mips/boot/tools/relocs.c @@ -468,6 +468,8 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, const char *symname)) { int i; + struct section *extab_sec = sec_lookup("__ex_table"); + int extab_index = extab_sec ? extab_sec - secs : -1; /* Walk through the relocations */ for (i = 0; i < ehdr.e_shnum; i++) { @@ -480,6 +482,9 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel, if (sec->shdr.sh_type != SHT_REL_TYPE) continue; + if (sec->shdr.sh_info == extab_index) + continue; + sec_symtab = sec->link; sec_applies = &secs[sec->shdr.sh_info]; if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) -- GitLab From fc20737d8b85691ecabab3739ed7d06c9b7bc00f Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Wed, 26 Feb 2025 16:48:26 -0500 Subject: [PATCH 953/989] efivarfs: allow creation of zero length files Temporarily allow the creation of zero length files in efivarfs so the 'fwupd' user space firmware update tool can continue to operate. This hack should be reverted as soon as the fwupd mechanisms for updating firmware have been fixed. fwupd has been coded to open a firmware file, close it, remove the immutable bit and write to it. Since commit 908af31f4896 ("efivarfs: fix error on write to new variable leaving remnants") this behaviour results in the first close removing the file which causes the second write to fail. To allow fwupd to keep working code up an indicator of size 1 if a write fails and only remove the file on that condition (so create at zero size is allowed). 
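The user-space sequence in question looks roughly like this (hedged illustration only, not fwupd's actual code; var_path and chattr_clear_immutable() are made up for the example):

	fd = open(var_path, O_WRONLY | O_CREAT, 0600);
	close(fd);			/* old behaviour: size 0 here meant the variable was removed */

	chattr_clear_immutable(var_path);	/* stand-in for the FS_IOC_SETFLAGS ioctl dance */

	fd = open(var_path, O_WRONLY);	/* with the old behaviour this open/write fails, */
	write(fd, buf, len);		/* because the zero-length file is already gone  */

With the size-1 workaround the created-but-unwritten file survives the first close, so the later write can proceed.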
Tested-by: Richard Hughes Signed-off-by: James Bottomley [ardb: replace LVFS with fwupd, as suggested by Richard] Signed-off-by: Ard Biesheuvel --- fs/efivarfs/file.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index cb1b6d0c34545..c294a8fc566da 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -57,10 +57,11 @@ static ssize_t efivarfs_file_write(struct file *file, if (bytes == -ENOENT) { /* - * zero size signals to release that the write deleted - * the variable + * FIXME: temporary workaround for fwupdate, signal + * failed write with a 1 to keep created but not + * written files */ - i_size_write(inode, 0); + i_size_write(inode, 1); } else { i_size_write(inode, datasize + sizeof(attributes)); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); @@ -124,7 +125,8 @@ static int efivarfs_file_release(struct inode *inode, struct file *file) struct efivar_entry *var = inode->i_private; inode_lock(inode); - var->removed = (--var->open_count == 0 && i_size_read(inode) == 0); + /* FIXME: temporary work around for fwupdate */ + var->removed = (--var->open_count == 0 && i_size_read(inode) == 1); inode_unlock(inode); if (var->removed) -- GitLab From 1cf9631d836b289bd5490776551961c883ae8a4f Mon Sep 17 00:00:00 2001 From: Nikita Zhandarovich Date: Mon, 24 Feb 2025 20:29:17 +0300 Subject: [PATCH 954/989] usbnet: gl620a: fix endpoint checking in genelink_bind() Syzbot reports [1] a warning in usb_submit_urb() triggered by inconsistencies between expected and actually present endpoints in gl620a driver. Since genelink_bind() does not properly verify whether specified eps are in fact provided by the device, in this case, an artificially manufactured one, one may get a mismatch. Fix the issue by resorting to a usbnet utility function usbnet_get_endpoints(), usually reserved for this very problem. Check for endpoints and return early before proceeding further if any are missing. [1] Syzbot report: usb 5-1: Manufacturer: syz usb 5-1: SerialNumber: syz usb 5-1: config 0 descriptor?? gl620a 5-1:0.23 usb0: register 'gl620a' at usb-dummy_hcd.0-1, ... ------------[ cut here ]------------ usb 5-1: BOGUS urb xfer, pipe 3 != type 1 WARNING: CPU: 2 PID: 1841 at drivers/usb/core/urb.c:503 usb_submit_urb+0xe4b/0x1730 drivers/usb/core/urb.c:503 Modules linked in: CPU: 2 UID: 0 PID: 1841 Comm: kworker/2:2 Not tainted 6.12.0-syzkaller-07834-g06afb0f36106 #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 Workqueue: mld mld_ifc_work RIP: 0010:usb_submit_urb+0xe4b/0x1730 drivers/usb/core/urb.c:503 ... 
Call Trace: usbnet_start_xmit+0x6be/0x2780 drivers/net/usb/usbnet.c:1467 __netdev_start_xmit include/linux/netdevice.h:5002 [inline] netdev_start_xmit include/linux/netdevice.h:5011 [inline] xmit_one net/core/dev.c:3590 [inline] dev_hard_start_xmit+0x9a/0x7b0 net/core/dev.c:3606 sch_direct_xmit+0x1ae/0xc30 net/sched/sch_generic.c:343 __dev_xmit_skb net/core/dev.c:3827 [inline] __dev_queue_xmit+0x13d4/0x43e0 net/core/dev.c:4400 dev_queue_xmit include/linux/netdevice.h:3168 [inline] neigh_resolve_output net/core/neighbour.c:1514 [inline] neigh_resolve_output+0x5bc/0x950 net/core/neighbour.c:1494 neigh_output include/net/neighbour.h:539 [inline] ip6_finish_output2+0xb1b/0x2070 net/ipv6/ip6_output.c:141 __ip6_finish_output net/ipv6/ip6_output.c:215 [inline] ip6_finish_output+0x3f9/0x1360 net/ipv6/ip6_output.c:226 NF_HOOK_COND include/linux/netfilter.h:303 [inline] ip6_output+0x1f8/0x540 net/ipv6/ip6_output.c:247 dst_output include/net/dst.h:450 [inline] NF_HOOK include/linux/netfilter.h:314 [inline] NF_HOOK include/linux/netfilter.h:308 [inline] mld_sendpack+0x9f0/0x11d0 net/ipv6/mcast.c:1819 mld_send_cr net/ipv6/mcast.c:2120 [inline] mld_ifc_work+0x740/0xca0 net/ipv6/mcast.c:2651 process_one_work+0x9c5/0x1ba0 kernel/workqueue.c:3229 process_scheduled_works kernel/workqueue.c:3310 [inline] worker_thread+0x6c8/0xf00 kernel/workqueue.c:3391 kthread+0x2c1/0x3a0 kernel/kthread.c:389 ret_from_fork+0x45/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Reported-by: syzbot+d693c07c6f647e0388d3@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=d693c07c6f647e0388d3 Fixes: 47ee3051c856 ("[PATCH] USB: usbnet (5/9) module for genesys gl620a cables") Cc: stable@vger.kernel.org Signed-off-by: Nikita Zhandarovich Link: https://patch.msgid.link/20250224172919.1220522-1-n.zhandarovich@fintech.ru Signed-off-by: Paolo Abeni --- drivers/net/usb/gl620a.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/usb/gl620a.c b/drivers/net/usb/gl620a.c index 46af78caf457a..0bfa37c140591 100644 --- a/drivers/net/usb/gl620a.c +++ b/drivers/net/usb/gl620a.c @@ -179,9 +179,7 @@ static int genelink_bind(struct usbnet *dev, struct usb_interface *intf) { dev->hard_mtu = GL_RCV_BUF_SIZE; dev->net->hard_header_len += 4; - dev->in = usb_rcvbulkpipe(dev->udev, dev->driver_info->in); - dev->out = usb_sndbulkpipe(dev->udev, dev->driver_info->out); - return 0; + return usbnet_get_endpoints(dev, intf); } static const struct driver_info genelink_info = { -- GitLab From c64a0727f9b1cbc63a5538c8c0014e9a175ad864 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Tue, 25 Feb 2025 18:51:38 +0100 Subject: [PATCH 955/989] net: ipv6: fix dst ref loop on input in seg6 lwt Prevent a dst ref loop on input in seg6_iptunnel. 
Fixes: af4a2209b134 ("ipv6: sr: use dst_cache in seg6_input") Cc: David Lebrun Cc: Ido Schimmel Reviewed-by: Ido Schimmel Signed-off-by: Justin Iurman Signed-off-by: Paolo Abeni --- net/ipv6/seg6_iptunnel.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 33833b2064c07..51583461ae29b 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -472,10 +472,18 @@ static int seg6_input_core(struct net *net, struct sock *sk, { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; + struct lwtunnel_state *lwtst; struct seg6_lwt *slwt; int err; - slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); + /* We cannot dereference "orig_dst" once ip6_route_input() or + * skb_dst_drop() is called. However, in order to detect a dst loop, we + * need the address of its lwtstate. So, save the address of lwtstate + * now and use it later as a comparison. + */ + lwtst = orig_dst->lwtstate; + + slwt = seg6_lwt_lwtunnel(lwtst); local_bh_disable(); dst = dst_cache_get(&slwt->cache); @@ -490,7 +498,9 @@ static int seg6_input_core(struct net *net, struct sock *sk, if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); - if (!dst->error) { + + /* cache only if we don't create a dst reference loop */ + if (!dst->error && lwtst != dst->lwtstate) { local_bh_disable(); dst_cache_set_ip6(&slwt->cache, dst, &ipv6_hdr(skb)->saddr); -- GitLab From 13e55fbaec176119cff68a7e1693b251c8883c5f Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Tue, 25 Feb 2025 18:51:39 +0100 Subject: [PATCH 956/989] net: ipv6: fix dst ref loop on input in rpl lwt Prevent a dst ref loop on input in rpl_iptunnel. Fixes: a7a29f9c361f ("net: ipv6: add rpl sr tunnel") Cc: Alexander Aring Cc: Ido Schimmel Reviewed-by: Ido Schimmel Signed-off-by: Justin Iurman Signed-off-by: Paolo Abeni --- net/ipv6/rpl_iptunnel.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index 0ac4283acdf20..7c05ac846646f 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -262,10 +262,18 @@ static int rpl_input(struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; + struct lwtunnel_state *lwtst; struct rpl_lwt *rlwt; int err; - rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); + /* We cannot dereference "orig_dst" once ip6_route_input() or + * skb_dst_drop() is called. However, in order to detect a dst loop, we + * need the address of its lwtstate. So, save the address of lwtstate + * now and use it later as a comparison. 
+ */ + lwtst = orig_dst->lwtstate; + + rlwt = rpl_lwt_lwtunnel(lwtst); local_bh_disable(); dst = dst_cache_get(&rlwt->cache); @@ -280,7 +288,9 @@ static int rpl_input(struct sk_buff *skb) if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); - if (!dst->error) { + + /* cache only if we don't create a dst reference loop */ + if (!dst->error && lwtst != dst->lwtstate) { local_bh_disable(); dst_cache_set_ip6(&rlwt->cache, dst, &ipv6_hdr(skb)->saddr); -- GitLab From 1cbddbddee68d17feb6467fc556c144777af91ef Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 26 Feb 2025 18:19:57 +0000 Subject: [PATCH 957/989] selftests: drv-net: Check if combined-count exists Some drivers, like tg3, do not set combined-count: $ ethtool -l enp4s0f1 Channel parameters for enp4s0f1: Pre-set maximums: RX: 4 TX: 4 Other: n/a Combined: n/a Current hardware settings: RX: 4 TX: 1 Other: n/a Combined: n/a In the case where combined-count is not set, the ethtool netlink code in the kernel elides the value and the code in the test: netnl.channels_get(...) With a tg3 device, the returned dictionary looks like: {'header': {'dev-index': 3, 'dev-name': 'enp4s0f1'}, 'rx-max': 4, 'rx-count': 4, 'tx-max': 4, 'tx-count': 1} Note that the key 'combined-count' is missing. As a result of this missing key the test raises an exception: # Exception| if channels['combined-count'] == 0: # Exception| ~~~~~~~~^^^^^^^^^^^^^^^^^^ # Exception| KeyError: 'combined-count' Change the test to check if 'combined-count' is a key in the dictionary first and if not assume that this means the driver has separate RX and TX queues. With this change, the test now passes successfully on tg3 and mlx5 (which does have a 'combined-count'). Fixes: 1cf270424218 ("net: selftest: add test for netdev netlink queue-get API") Signed-off-by: Joe Damato Reviewed-by: David Wei Link: https://patch.msgid.link/20250226181957.212189-1-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/queues.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/drivers/net/queues.py b/tools/testing/selftests/drivers/net/queues.py index 38303da957ee5..8a518905a9f9c 100755 --- a/tools/testing/selftests/drivers/net/queues.py +++ b/tools/testing/selftests/drivers/net/queues.py @@ -45,10 +45,9 @@ def addremove_queues(cfg, nl) -> None: netnl = EthtoolFamily() channels = netnl.channels_get({'header': {'dev-index': cfg.ifindex}}) - if channels['combined-count'] == 0: - rx_type = 'rx' - else: - rx_type = 'combined' + rx_type = 'rx' + if channels.get('combined-count', 0) > 0: + rx_type = 'combined' expected = curr_queues - 1 cmd(f"ethtool -L {cfg.dev['ifname']} {rx_type} {expected}", timeout=10) -- GitLab From 674fcb4f4a7e3e277417a01788cc6daae47c3804 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 26 Feb 2025 22:12:52 +0000 Subject: [PATCH 958/989] idpf: fix checksums set in idpf_rx_rsc() idpf_rx_rsc() uses skb_transport_offset(skb) while the transport header is not set yet. This triggers the following warning for CONFIG_DEBUG_NET=y builds. 
DEBUG_NET_WARN_ON_ONCE(!skb_transport_header_was_set(skb)) [ 69.261620] WARNING: CPU: 7 PID: 0 at ./include/linux/skbuff.h:3020 idpf_vport_splitq_napi_poll (include/linux/skbuff.h:3020) idpf [ 69.261629] Modules linked in: vfat fat dummy bridge intel_uncore_frequency_tpmi intel_uncore_frequency_common intel_vsec_tpmi idpf intel_vsec cdc_ncm cdc_eem cdc_ether usbnet mii xhci_pci xhci_hcd ehci_pci ehci_hcd libeth [ 69.261644] CPU: 7 UID: 0 PID: 0 Comm: swapper/7 Tainted: G S W 6.14.0-smp-DEV #1697 [ 69.261648] Tainted: [S]=CPU_OUT_OF_SPEC, [W]=WARN [ 69.261650] RIP: 0010:idpf_vport_splitq_napi_poll (include/linux/skbuff.h:3020) idpf [ 69.261677] ? __warn (kernel/panic.c:242 kernel/panic.c:748) [ 69.261682] ? idpf_vport_splitq_napi_poll (include/linux/skbuff.h:3020) idpf [ 69.261687] ? report_bug (lib/bug.c:?) [ 69.261690] ? handle_bug (arch/x86/kernel/traps.c:285) [ 69.261694] ? exc_invalid_op (arch/x86/kernel/traps.c:309) [ 69.261697] ? asm_exc_invalid_op (arch/x86/include/asm/idtentry.h:621) [ 69.261700] ? __pfx_idpf_vport_splitq_napi_poll (drivers/net/ethernet/intel/idpf/idpf_txrx.c:4011) idpf [ 69.261704] ? idpf_vport_splitq_napi_poll (include/linux/skbuff.h:3020) idpf [ 69.261708] ? idpf_vport_splitq_napi_poll (drivers/net/ethernet/intel/idpf/idpf_txrx.c:3072) idpf [ 69.261712] __napi_poll (net/core/dev.c:7194) [ 69.261716] net_rx_action (net/core/dev.c:7265) [ 69.261718] ? __qdisc_run (net/sched/sch_generic.c:293) [ 69.261721] ? sched_clock (arch/x86/include/asm/preempt.h:84 arch/x86/kernel/tsc.c:288) [ 69.261726] handle_softirqs (kernel/softirq.c:561) Fixes: 3a8845af66edb ("idpf: add RX splitq napi poll support") Signed-off-by: Eric Dumazet Cc: Alan Brady Cc: Joshua Hay Cc: Willem de Bruijn Acked-by: Przemek Kitszel Link: https://patch.msgid.link/20250226221253.1927782-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 9be6a6b59c4e1..977741c414980 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -3013,7 +3013,6 @@ static int idpf_rx_rsc(struct idpf_rx_queue *rxq, struct sk_buff *skb, skb_shinfo(skb)->gso_size = rsc_seg_len; skb_reset_network_header(skb); - len = skb->len - skb_transport_offset(skb); if (ipv4) { struct iphdr *ipv4h = ip_hdr(skb); @@ -3022,6 +3021,7 @@ static int idpf_rx_rsc(struct idpf_rx_queue *rxq, struct sk_buff *skb, /* Reset and set transport header offset in skb */ skb_set_transport_header(skb, sizeof(struct iphdr)); + len = skb->len - skb_transport_offset(skb); /* Compute the TCP pseudo header checksum*/ tcp_hdr(skb)->check = @@ -3031,6 +3031,7 @@ static int idpf_rx_rsc(struct idpf_rx_queue *rxq, struct sk_buff *skb, skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + len = skb->len - skb_transport_offset(skb); tcp_hdr(skb)->check = ~tcp_v6_check(len, &ipv6h->saddr, &ipv6h->daddr, 0); } -- GitLab From 54e1b4becf5e220be03db4e1be773c1310e8cbbd Mon Sep 17 00:00:00 2001 From: Meghana Malladi Date: Thu, 27 Feb 2025 14:54:41 +0530 Subject: [PATCH 959/989] net: ti: icss-iep: Reject perout generation request IEP driver supports both perout and pps signal generation but perout feature is faulty with half-cooked support due to some missing configuration. Remove perout support from the driver and reject perout requests with "not supported" error code. 
Fixes: c1e0230eeaab2 ("net: ti: icss-iep: Add IEP driver") Signed-off-by: Meghana Malladi Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20250227092441.1848419-1-m-malladi@ti.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/icssg/icss_iep.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/drivers/net/ethernet/ti/icssg/icss_iep.c b/drivers/net/ethernet/ti/icssg/icss_iep.c index 768578c0d9587..d59c1744840af 100644 --- a/drivers/net/ethernet/ti/icssg/icss_iep.c +++ b/drivers/net/ethernet/ti/icssg/icss_iep.c @@ -474,26 +474,7 @@ static int icss_iep_perout_enable_hw(struct icss_iep *iep, static int icss_iep_perout_enable(struct icss_iep *iep, struct ptp_perout_request *req, int on) { - int ret = 0; - - mutex_lock(&iep->ptp_clk_mutex); - - if (iep->pps_enabled) { - ret = -EBUSY; - goto exit; - } - - if (iep->perout_enabled == !!on) - goto exit; - - ret = icss_iep_perout_enable_hw(iep, req, on); - if (!ret) - iep->perout_enabled = !!on; - -exit: - mutex_unlock(&iep->ptp_clk_mutex); - - return ret; + return -EOPNOTSUPP; } static void icss_iep_cap_cmp_work(struct work_struct *work) -- GitLab From 2b90e7ace79774a3540ce569e000388f8d22c9e0 Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Wed, 26 Feb 2025 15:18:39 -0500 Subject: [PATCH 960/989] efi: Don't map the entire mokvar table to determine its size Currently, when validating the mokvar table, we (re)map the entire table on each iteration of the loop, adding space as we discover new entries. If the table grows over a certain size, this fails due to limitations of early_memmap(), and we get a failure and traceback: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 0 at mm/early_ioremap.c:139 __early_ioremap+0xef/0x220 ... Call Trace: ? __early_ioremap+0xef/0x220 ? __warn.cold+0x93/0xfa ? __early_ioremap+0xef/0x220 ? report_bug+0xff/0x140 ? early_fixup_exception+0x5d/0xb0 ? early_idt_handler_common+0x2f/0x3a ? __early_ioremap+0xef/0x220 ? efi_mokvar_table_init+0xce/0x1d0 ? setup_arch+0x864/0xc10 ? start_kernel+0x6b/0xa10 ? x86_64_start_reservations+0x24/0x30 ? x86_64_start_kernel+0xed/0xf0 ? common_startup_64+0x13e/0x141 ---[ end trace 0000000000000000 ]--- mokvar: Failed to map EFI MOKvar config table pa=0x7c4c3000, size=265187. Mapping the entire structure isn't actually necessary, as we don't ever need more than one entry header mapped at once. Changes efi_mokvar_table_init() to only map each entry header, not the entire table, when determining the table size. Since we're not mapping any data past the variable name, it also changes the code to enforce that each variable name is NUL terminated, rather than attempting to verify it in place. 
Cc: Signed-off-by: Peter Jones Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/mokvar-table.c | 41 +++++++++-------------------- 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/drivers/firmware/efi/mokvar-table.c b/drivers/firmware/efi/mokvar-table.c index 5ed0602c2f75f..d865cb1dbaad1 100644 --- a/drivers/firmware/efi/mokvar-table.c +++ b/drivers/firmware/efi/mokvar-table.c @@ -103,7 +103,6 @@ void __init efi_mokvar_table_init(void) void *va = NULL; unsigned long cur_offset = 0; unsigned long offset_limit; - unsigned long map_size = 0; unsigned long map_size_needed = 0; unsigned long size; struct efi_mokvar_table_entry *mokvar_entry; @@ -134,48 +133,34 @@ void __init efi_mokvar_table_init(void) */ err = -EINVAL; while (cur_offset + sizeof(*mokvar_entry) <= offset_limit) { - mokvar_entry = va + cur_offset; - map_size_needed = cur_offset + sizeof(*mokvar_entry); - if (map_size_needed > map_size) { - if (va) - early_memunmap(va, map_size); - /* - * Map a little more than the fixed size entry - * header, anticipating some data. It's safe to - * do so as long as we stay within current memory - * descriptor. - */ - map_size = min(map_size_needed + 2*EFI_PAGE_SIZE, - offset_limit); - va = early_memremap(efi.mokvar_table, map_size); - if (!va) { - pr_err("Failed to map EFI MOKvar config table pa=0x%lx, size=%lu.\n", - efi.mokvar_table, map_size); - return; - } - mokvar_entry = va + cur_offset; + if (va) + early_memunmap(va, sizeof(*mokvar_entry)); + va = early_memremap(efi.mokvar_table + cur_offset, sizeof(*mokvar_entry)); + if (!va) { + pr_err("Failed to map EFI MOKvar config table pa=0x%lx, size=%zu.\n", + efi.mokvar_table + cur_offset, sizeof(*mokvar_entry)); + return; } + mokvar_entry = va; /* Check for last sentinel entry */ if (mokvar_entry->name[0] == '\0') { if (mokvar_entry->data_size != 0) break; err = 0; + map_size_needed = cur_offset + sizeof(*mokvar_entry); break; } - /* Sanity check that the name is null terminated */ - size = strnlen(mokvar_entry->name, - sizeof(mokvar_entry->name)); - if (size >= sizeof(mokvar_entry->name)) - break; + /* Enforce that the name is NUL terminated */ + mokvar_entry->name[sizeof(mokvar_entry->name) - 1] = '\0'; /* Advance to the next entry */ - cur_offset = map_size_needed + mokvar_entry->data_size; + cur_offset += sizeof(*mokvar_entry) + mokvar_entry->data_size; } if (va) - early_memunmap(va, map_size); + early_memunmap(va, sizeof(*mokvar_entry)); if (err) { pr_err("EFI MOKvar config table is not valid\n"); return; -- GitLab From e3cf2d91d0583cae70aeb512da87e3ade25ea912 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 27 Feb 2025 14:30:22 +0100 Subject: [PATCH 961/989] efi/mokvar-table: Avoid repeated map/unmap of the same page Tweak the logic that traverses the MOKVAR UEFI configuration table to only unmap the entry header and map the next one if they don't live in the same physical page. 
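The "same physical page" test can be read as follows (minimal sketch; same_page() is a made-up helper for illustration): two addresses lie in one page exactly when their XOR has no bits set above the page offset, i.e. their page-frame bits agree:

	static inline bool same_page(unsigned long a, unsigned long b)
	{
		return ((a ^ b) & PAGE_MASK) == 0;	/* PAGE_MASK keeps the page-frame bits */
	}

The patch applies this to the last byte of the current entry header and the last byte of the next one, skipping the early_memunmap()/early_memremap() pair whenever both end in the same page.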
Link: https://lore.kernel.org/all/8f085931-3e9d-4386-9209-1d6c95616327@uncooperative.org/ Tested-By: Peter Jones Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/mokvar-table.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/firmware/efi/mokvar-table.c b/drivers/firmware/efi/mokvar-table.c index d865cb1dbaad1..208db29613c63 100644 --- a/drivers/firmware/efi/mokvar-table.c +++ b/drivers/firmware/efi/mokvar-table.c @@ -99,13 +99,13 @@ static struct kobject *mokvar_kobj; */ void __init efi_mokvar_table_init(void) { + struct efi_mokvar_table_entry __aligned(1) *mokvar_entry, *next_entry; efi_memory_desc_t md; void *va = NULL; unsigned long cur_offset = 0; unsigned long offset_limit; unsigned long map_size_needed = 0; unsigned long size; - struct efi_mokvar_table_entry *mokvar_entry; int err; if (!efi_enabled(EFI_MEMMAP)) @@ -142,7 +142,7 @@ void __init efi_mokvar_table_init(void) return; } mokvar_entry = va; - +next: /* Check for last sentinel entry */ if (mokvar_entry->name[0] == '\0') { if (mokvar_entry->data_size != 0) @@ -156,7 +156,19 @@ void __init efi_mokvar_table_init(void) mokvar_entry->name[sizeof(mokvar_entry->name) - 1] = '\0'; /* Advance to the next entry */ - cur_offset += sizeof(*mokvar_entry) + mokvar_entry->data_size; + size = sizeof(*mokvar_entry) + mokvar_entry->data_size; + cur_offset += size; + + /* + * Don't bother remapping if the current entry header and the + * next one end on the same page. + */ + next_entry = (void *)((unsigned long)mokvar_entry + size); + if (((((unsigned long)(mokvar_entry + 1) - 1) ^ + ((unsigned long)(next_entry + 1) - 1)) & PAGE_MASK) == 0) { + mokvar_entry = next_entry; + goto next; + } } if (va) -- GitLab From 02410ac72ac3707936c07ede66e94360d0d65319 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 26 Feb 2025 12:06:51 +0000 Subject: [PATCH 962/989] mm: hugetlb: Add huge page size param to huge_ptep_get_and_clear() In order to fix a bug, arm64 needs to be told the size of the huge page for which the huge_pte is being cleared in huge_ptep_get_and_clear(). Provide for this by adding an `unsigned long sz` parameter to the function. This follows the same pattern as huge_pte_clear() and set_huge_pte_at(). This commit makes the required interface modifications to the core mm as well as all arches that implement this function (arm64, loongarch, mips, parisc, powerpc, riscv, s390, sparc). The actual arm64 bug will be fixed in a separate commit. 
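The need for the extra argument is easiest to see on arm64, where a single huge page can be backed by several contiguous entries, so the callee must turn the size into an (entry size, entry count) pair. A simplified userspace sketch of that mapping; the geometry constants are illustrative stand-ins, not the kernel's:

#include <stddef.h>
#include <stdio.h>

/* Illustrative 4K-granule-style geometry, not the real arm64 constants. */
#define PAGE_SZ   4096UL
#define CONT_PTES 16UL
#define PMD_SZ    (512UL * PAGE_SZ)
#define CONT_PMDS 16UL

/* How many entries of which size cover one huge page of 'size' bytes? */
static unsigned long num_contig_entries(unsigned long size, unsigned long *entry_size)
{
	*entry_size = size;            /* PMD/PUD sized: one entry of that size */

	if (size == CONT_PTES * PAGE_SZ) {
		*entry_size = PAGE_SZ;
		return CONT_PTES;
	}
	if (size == CONT_PMDS * PMD_SZ) {
		*entry_size = PMD_SZ;
		return CONT_PMDS;
	}
	return 1;
}

int main(void)
{
	unsigned long esz, n = num_contig_entries(CONT_PTES * PAGE_SZ, &esz);

	printf("%lu entries of %lu bytes\n", n, esz);
	return 0;
}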
Cc: stable@vger.kernel.org Fixes: 66b3923a1a0f ("arm64: hugetlb: add support for PTE contiguous bit") Acked-by: David Hildenbrand Reviewed-by: Alexandre Ghiti # riscv Reviewed-by: Christophe Leroy Reviewed-by: Catalin Marinas Reviewed-by: Anshuman Khandual Signed-off-by: Ryan Roberts Acked-by: Alexander Gordeev # s390 Link: https://lore.kernel.org/r/20250226120656.2400136-2-ryan.roberts@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/hugetlb.h | 4 ++-- arch/arm64/mm/hugetlbpage.c | 8 +++++--- arch/loongarch/include/asm/hugetlb.h | 6 ++++-- arch/mips/include/asm/hugetlb.h | 6 ++++-- arch/parisc/include/asm/hugetlb.h | 2 +- arch/parisc/mm/hugetlbpage.c | 2 +- arch/powerpc/include/asm/hugetlb.h | 6 ++++-- arch/riscv/include/asm/hugetlb.h | 3 ++- arch/riscv/mm/hugetlbpage.c | 2 +- arch/s390/include/asm/hugetlb.h | 16 ++++++++++++---- arch/s390/mm/hugetlbpage.c | 4 ++-- arch/sparc/include/asm/hugetlb.h | 2 +- arch/sparc/mm/hugetlbpage.c | 2 +- include/asm-generic/hugetlb.h | 2 +- include/linux/hugetlb.h | 4 +++- mm/hugetlb.c | 4 ++-- 16 files changed, 46 insertions(+), 27 deletions(-) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index c6dff3e69539b..03db9cb21ace8 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -42,8 +42,8 @@ extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty); #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR -extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep); +extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT extern void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep); diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 98a2a0e64e255..06db4649af916 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -396,8 +396,8 @@ void huge_pte_clear(struct mm_struct *mm, unsigned long addr, __pte_clear(mm, addr, ptep); } -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long sz) { int ncontig; size_t pgsize; @@ -549,6 +549,8 @@ bool __init arch_hugetlb_valid_size(unsigned long size) pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { + unsigned long psize = huge_page_size(hstate_vma(vma)); + if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) { /* * Break-before-make (BBM) is required for all user space mappings @@ -558,7 +560,7 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr if (pte_user_exec(__ptep_get(ptep))) return huge_ptep_clear_flush(vma, addr, ptep); } - return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, psize); } void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, diff --git a/arch/loongarch/include/asm/hugetlb.h b/arch/loongarch/include/asm/hugetlb.h index c8e4057734d0d..4dc4b3e04225f 100644 --- a/arch/loongarch/include/asm/hugetlb.h +++ b/arch/loongarch/include/asm/hugetlb.h @@ -36,7 +36,8 @@ static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - 
unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, + unsigned long sz) { pte_t clear; pte_t pte = ptep_get(ptep); @@ -51,8 +52,9 @@ static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte; + unsigned long sz = huge_page_size(hstate_vma(vma)); - pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, sz); flush_tlb_page(vma, addr); return pte; } diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index d0a86ce83de91..fbc71ddcf0f68 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -27,7 +27,8 @@ static inline int prepare_hugepage_range(struct file *file, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, + unsigned long sz) { pte_t clear; pte_t pte = *ptep; @@ -42,13 +43,14 @@ static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte; + unsigned long sz = huge_page_size(hstate_vma(vma)); /* * clear the huge pte entry firstly, so that the other smp threads will * not get old pte entry after finishing flush_tlb_page and before * setting new huge pte entry */ - pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, sz); flush_tlb_page(vma, addr); return pte; } diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index 5b3a5429f71b3..21e9ace177395 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -10,7 +10,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); + pte_t *ptep, unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c index e9d18cf25b792..a94fe546d434f 100644 --- a/arch/parisc/mm/hugetlbpage.c +++ b/arch/parisc/mm/hugetlbpage.c @@ -126,7 +126,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) + pte_t *ptep, unsigned long sz) { pte_t entry; diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index dad2e7980f245..86326587e58de 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -45,7 +45,8 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, + unsigned long sz) { return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1)); } @@ -55,8 +56,9 @@ static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte; + unsigned long sz = huge_page_size(hstate_vma(vma)); - pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, sz); flush_hugetlb_page(vma, addr); return pte; } diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h index faf3624d80577..4461264977684 100644 --- a/arch/riscv/include/asm/hugetlb.h +++ 
b/arch/riscv/include/asm/hugetlb.h @@ -28,7 +28,8 @@ void set_huge_pte_at(struct mm_struct *mm, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep); + unsigned long addr, pte_t *ptep, + unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c index 42314f0939220..b4a78a4b35cff 100644 --- a/arch/riscv/mm/hugetlbpage.c +++ b/arch/riscv/mm/hugetlbpage.c @@ -293,7 +293,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) + pte_t *ptep, unsigned long sz) { pte_t orig_pte = ptep_get(ptep); int pte_num; diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 7c52acaf9f828..663e87220e89f 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -25,8 +25,16 @@ void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, #define __HAVE_ARCH_HUGE_PTEP_GET pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +pte_t __huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned long sz) +{ + return __huge_ptep_get_and_clear(mm, addr, ptep); +} static inline void arch_clear_hugetlb_flags(struct folio *folio) { @@ -48,7 +56,7 @@ static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - return huge_ptep_get_and_clear(vma->vm_mm, address, ptep); + return __huge_ptep_get_and_clear(vma->vm_mm, address, ptep); } #define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS @@ -59,7 +67,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, int changed = !pte_same(huge_ptep_get(vma->vm_mm, addr, ptep), pte); if (changed) { - huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + __huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); __set_huge_pte_at(vma->vm_mm, addr, ptep, pte); } return changed; @@ -69,7 +77,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - pte_t pte = huge_ptep_get_and_clear(mm, addr, ptep); + pte_t pte = __huge_ptep_get_and_clear(mm, addr, ptep); __set_huge_pte_at(mm, addr, ptep, pte_wrprotect(pte)); } diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index d9ce199953de9..2e568f175cd41 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -188,8 +188,8 @@ pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) return __rste_to_pte(pte_val(*ptep)); } -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) +pte_t __huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) { pte_t pte = huge_ptep_get(mm, addr, ptep); pmd_t *pmdp = (pmd_t *) ptep; diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index c714ca6a05aa0..e7a9cdd498dca 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -20,7 +20,7 @@ void 
__set_huge_pte_at(struct mm_struct *mm, unsigned long addr, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); + pte_t *ptep, unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index eee601a0d2cfb..80504148d8a5b 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -260,7 +260,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, } pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) + pte_t *ptep, unsigned long sz) { unsigned int i, nptes, orig_shift, shift; unsigned long size; diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index f42133dae68e5..2afc95bf1655f 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -90,7 +90,7 @@ static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, #ifndef __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, unsigned long sz) { return ptep_get_and_clear(mm, addr, ptep); } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ec8c0ccc8f959..bf5f7256bd281 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1004,7 +1004,9 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm) static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { - return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + unsigned long psize = huge_page_size(hstate_vma(vma)); + + return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, psize); } #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 65068671e460a..de9d49e521c13 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5447,7 +5447,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, if (src_ptl != dst_ptl) spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); - pte = huge_ptep_get_and_clear(mm, old_addr, src_pte); + pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz); if (need_clear_uffd_wp && pte_marker_uffd_wp(pte)) huge_pte_clear(mm, new_addr, dst_pte, sz); @@ -5622,7 +5622,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); } - pte = huge_ptep_get_and_clear(mm, address, ptep); + pte = huge_ptep_get_and_clear(mm, address, ptep, sz); tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) set_page_dirty(page); -- GitLab From 49c87f7677746f3c5bd16c81b23700bb6b88bfd4 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 26 Feb 2025 12:06:52 +0000 Subject: [PATCH 963/989] arm64: hugetlb: Fix huge_ptep_get_and_clear() for non-present ptes arm64 supports multiple huge_pte sizes. Some of the sizes are covered by a single pte entry at a particular level (PMD_SIZE, PUD_SIZE), and some are covered by multiple ptes at a particular level (CONT_PTE_SIZE, CONT_PMD_SIZE). So the function has to figure out the size from the huge_pte pointer. This was previously done by walking the pgtable to determine the level and by using the PTE_CONT bit to determine the number of ptes at the level. But the PTE_CONT bit is only valid when the pte is present. For non-present pte values (e.g. 
markers, migration entries), the previous implementation was therefore erroneously determining the size. There is at least one known caller in core-mm, move_huge_pte(), which may call huge_ptep_get_and_clear() for a non-present pte. So we must be robust to this case. Additionally the "regular" ptep_get_and_clear() is robust to being called for non-present ptes so it makes sense to follow the behavior. Fix this by using the new sz parameter which is now provided to the function. Additionally when clearing each pte in a contig range, don't gather the access and dirty bits if the pte is not present. An alternative approach that would not require API changes would be to store the PTE_CONT bit in a spare bit in the swap entry pte for the non-present case. But it felt cleaner to follow other APIs' lead and just pass in the size. As an aside, PTE_CONT is bit 52, which corresponds to bit 40 in the swap entry offset field (layout of non-present pte). Since hugetlb is never swapped to disk, this field will only be populated for markers, which always set this bit to 0 and hwpoison swap entries, which set the offset field to a PFN; So it would only ever be 1 for a 52-bit PVA system where memory in that high half was poisoned (I think!). So in practice, this bit would almost always be zero for non-present ptes and we would only clear the first entry if it was actually a contiguous block. That's probably a less severe symptom than if it was always interpreted as 1 and cleared out potentially-present neighboring PTEs. Cc: stable@vger.kernel.org Fixes: 66b3923a1a0f ("arm64: hugetlb: add support for PTE contiguous bit") Reviewed-by: Catalin Marinas Signed-off-by: Ryan Roberts Link: https://lore.kernel.org/r/20250226120656.2400136-3-ryan.roberts@arm.com Signed-off-by: Will Deacon --- arch/arm64/mm/hugetlbpage.c | 53 ++++++++++++++----------------------- 1 file changed, 20 insertions(+), 33 deletions(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 06db4649af916..b3a7fafe8892d 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -100,20 +100,11 @@ static int find_num_contig(struct mm_struct *mm, unsigned long addr, static inline int num_contig_ptes(unsigned long size, size_t *pgsize) { - int contig_ptes = 0; + int contig_ptes = 1; *pgsize = size; switch (size) { -#ifndef __PAGETABLE_PMD_FOLDED - case PUD_SIZE: - if (pud_sect_supported()) - contig_ptes = 1; - break; -#endif - case PMD_SIZE: - contig_ptes = 1; - break; case CONT_PMD_SIZE: *pgsize = PMD_SIZE; contig_ptes = CONT_PMDS; @@ -122,6 +113,8 @@ static inline int num_contig_ptes(unsigned long size, size_t *pgsize) *pgsize = PAGE_SIZE; contig_ptes = CONT_PTES; break; + default: + WARN_ON(!__hugetlb_valid_size(size)); } return contig_ptes; @@ -163,24 +156,23 @@ static pte_t get_clear_contig(struct mm_struct *mm, unsigned long pgsize, unsigned long ncontig) { - pte_t orig_pte = __ptep_get(ptep); - unsigned long i; - - for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) { - pte_t pte = __ptep_get_and_clear(mm, addr, ptep); - - /* - * If HW_AFDBM is enabled, then the HW could turn on - * the dirty or accessed bit for any page in the set, - * so check them all. 
- */ - if (pte_dirty(pte)) - orig_pte = pte_mkdirty(orig_pte); - - if (pte_young(pte)) - orig_pte = pte_mkyoung(orig_pte); + pte_t pte, tmp_pte; + bool present; + + pte = __ptep_get_and_clear(mm, addr, ptep); + present = pte_present(pte); + while (--ncontig) { + ptep++; + addr += pgsize; + tmp_pte = __ptep_get_and_clear(mm, addr, ptep); + if (present) { + if (pte_dirty(tmp_pte)) + pte = pte_mkdirty(pte); + if (pte_young(tmp_pte)) + pte = pte_mkyoung(pte); + } } - return orig_pte; + return pte; } static pte_t get_clear_contig_flush(struct mm_struct *mm, @@ -401,13 +393,8 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, { int ncontig; size_t pgsize; - pte_t orig_pte = __ptep_get(ptep); - - if (!pte_cont(orig_pte)) - return __ptep_get_and_clear(mm, addr, ptep); - - ncontig = find_num_contig(mm, addr, ptep, &pgsize); + ncontig = num_contig_ptes(sz, &pgsize); return get_clear_contig(mm, addr, ptep, pgsize, ncontig); } -- GitLab From eed6bfa8b28230382b797a88569f2c7569a1a419 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 26 Feb 2025 12:06:53 +0000 Subject: [PATCH 964/989] arm64: hugetlb: Fix flush_hugetlb_tlb_range() invalidation level commit c910f2b65518 ("arm64/mm: Update tlb invalidation routines for FEAT_LPA2") changed the "invalidation level unknown" hint from 0 to TLBI_TTL_UNKNOWN (INT_MAX). But the fallback "unknown level" path in flush_hugetlb_tlb_range() was not updated. So as it stands, when trying to invalidate CONT_PMD_SIZE or CONT_PTE_SIZE hugetlb mappings, we will spuriously try to invalidate at level 0 on LPA2-enabled systems. Fix this so that the fallback passes TLBI_TTL_UNKNOWN, and while we are at it, explicitly use the correct stride and level for CONT_PMD_SIZE and CONT_PTE_SIZE, which should provide a minor optimization. 
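In short, the stride and level hint should be derived from the huge page size, falling back to the explicit "unknown" value rather than level 0. A standalone sketch of that selection; the sizes and TTL_UNKNOWN below are stand-ins, not the arm64 definitions:

#include <limits.h>
#include <stdio.h>

#define PAGE_SZ     4096UL
#define PMD_SZ      (512UL * PAGE_SZ)
#define PUD_SZ      (512UL * PMD_SZ)
#define TTL_UNKNOWN INT_MAX	/* "invalidation level unknown" hint */

/* Pick the stride and translation-table level hint for a huge page size. */
static int pick_level(unsigned long size, unsigned long *stride)
{
	if (size == PUD_SZ) {
		*stride = PUD_SZ;
		return 1;
	}
	if (size == PMD_SZ || size == 16 * PMD_SZ) {	/* incl. contiguous PMDs */
		*stride = PMD_SZ;
		return 2;
	}
	if (size == 16 * PAGE_SZ) {			/* contiguous PTEs */
		*stride = PAGE_SZ;
		return 3;
	}
	*stride = PAGE_SZ;
	return TTL_UNKNOWN;	/* fall back to "unknown", never level 0 */
}

int main(void)
{
	unsigned long stride;

	printf("level %d\n", pick_level(16 * PMD_SZ, &stride));
	return 0;
}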
Cc: stable@vger.kernel.org Fixes: c910f2b65518 ("arm64/mm: Update tlb invalidation routines for FEAT_LPA2") Reviewed-by: Anshuman Khandual Reviewed-by: Catalin Marinas Signed-off-by: Ryan Roberts Link: https://lore.kernel.org/r/20250226120656.2400136-4-ryan.roberts@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/hugetlb.h | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 03db9cb21ace8..07fbf5bf85a7e 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -76,12 +76,22 @@ static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma, { unsigned long stride = huge_page_size(hstate_vma(vma)); - if (stride == PMD_SIZE) - __flush_tlb_range(vma, start, end, stride, false, 2); - else if (stride == PUD_SIZE) - __flush_tlb_range(vma, start, end, stride, false, 1); - else - __flush_tlb_range(vma, start, end, PAGE_SIZE, false, 0); + switch (stride) { +#ifndef __PAGETABLE_PMD_FOLDED + case PUD_SIZE: + __flush_tlb_range(vma, start, end, PUD_SIZE, false, 1); + break; +#endif + case CONT_PMD_SIZE: + case PMD_SIZE: + __flush_tlb_range(vma, start, end, PMD_SIZE, false, 2); + break; + case CONT_PTE_SIZE: + __flush_tlb_range(vma, start, end, PAGE_SIZE, false, 3); + break; + default: + __flush_tlb_range(vma, start, end, PAGE_SIZE, false, TLBI_TTL_UNKNOWN); + } } #endif /* __ASM_HUGETLB_H */ -- GitLab From 82c387ef7568c0d96a918a5a78d9cad6256cfa15 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Dec 2024 14:20:56 +0100 Subject: [PATCH 965/989] sched/core: Prevent rescheduling when interrupts are disabled David reported a warning observed while loop testing kexec jump: Interrupts enabled after irqrouter_resume+0x0/0x50 WARNING: CPU: 0 PID: 560 at drivers/base/syscore.c:103 syscore_resume+0x18a/0x220 kernel_kexec+0xf6/0x180 __do_sys_reboot+0x206/0x250 do_syscall_64+0x95/0x180 The corresponding interrupt flag trace: hardirqs last enabled at (15573): [] __up_console_sem+0x7e/0x90 hardirqs last disabled at (15580): [] __up_console_sem+0x63/0x90 That means __up_console_sem() was invoked with interrupts enabled. Further instrumentation revealed that in the interrupt disabled section of kexec jump one of the syscore_suspend() callbacks woke up a task, which set the NEED_RESCHED flag. A later callback in the resume path invoked cond_resched() which in turn led to the invocation of the scheduler: __cond_resched+0x21/0x60 down_timeout+0x18/0x60 acpi_os_wait_semaphore+0x4c/0x80 acpi_ut_acquire_mutex+0x3d/0x100 acpi_ns_get_node+0x27/0x60 acpi_ns_evaluate+0x1cb/0x2d0 acpi_rs_set_srs_method_data+0x156/0x190 acpi_pci_link_set+0x11c/0x290 irqrouter_resume+0x54/0x60 syscore_resume+0x6a/0x200 kernel_kexec+0x145/0x1c0 __do_sys_reboot+0xeb/0x240 do_syscall_64+0x95/0x180 This is a long standing problem, which probably got more visible with the recent printk changes. Something does a task wakeup and the scheduler sets the NEED_RESCHED flag. cond_resched() sees it set and invokes schedule() from a completely bogus context. The scheduler enables interrupts after context switching, which causes the above warning at the end. Quite some of the code paths in syscore_suspend()/resume() can result in triggering a wakeup with the exactly same consequences. They might not have done so yet, but as they share a lot of code with normal operations it's just a question of time. The problem only affects the PREEMPT_NONE and PREEMPT_VOLUNTARY scheduling models. 
Full preemption is not affected as cond_resched() is disabled and the preemption check preemptible() takes the interrupt disabled flag into account. Cure the problem by adding a corresponding check into cond_resched(). Reported-by: David Woodhouse Suggested-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Tested-by: David Woodhouse Cc: Linus Torvalds Cc: stable@vger.kernel.org Closes: https://lore.kernel.org/all/7717fe2ac0ce5f0a2c43fdab8b11f4483d54a2a4.camel@infradead.org --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9aecd914ac691..67189907214d3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7285,7 +7285,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) int __sched __cond_resched(void) { - if (should_resched(0)) { + if (should_resched(0) && !irqs_disabled()) { preempt_schedule_common(); return 1; } -- GitLab From 6f86bdeab633a56d5c6dccf1a2c5989b6a5e323e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Feb 2025 16:39:44 -0500 Subject: [PATCH 966/989] tracing: Fix bad hist from corrupting named_triggers list The following commands causes a crash: ~# cd /sys/kernel/tracing/events/rcu/rcu_callback ~# echo 'hist:name=bad:keys=common_pid:onmax(bogus).save(common_pid)' > trigger bash: echo: write error: Invalid argument ~# echo 'hist:name=bad:keys=common_pid' > trigger Because the following occurs: event_trigger_write() { trigger_process_regex() { event_hist_trigger_parse() { data = event_trigger_alloc(..); event_trigger_register(.., data) { cmd_ops->reg(.., data, ..) [hist_register_trigger()] { data->ops->init() [event_hist_trigger_init()] { save_named_trigger(name, data) { list_add(&data->named_list, &named_triggers); } } } } ret = create_actions(); (return -EINVAL) if (ret) goto out_unreg; [..] ret = hist_trigger_enable(data, ...) { list_add_tail_rcu(&data->list, &file->triggers); <<<---- SKIPPED!!! (this is important!) [..] out_unreg: event_hist_unregister(.., data) { cmd_ops->unreg(.., data, ..) [hist_unregister_trigger()] { list_for_each_entry(iter, &file->triggers, list) { if (!hist_trigger_match(data, iter, named_data, false)) <- never matches continue; [..] test = iter; } if (test && test->ops->free) <<<-- test is NULL test->ops->free(test) [event_hist_trigger_free()] { [..] if (data->name) del_named_trigger(data) { list_del(&data->named_list); <<<<-- NEVER gets removed! } } } } [..] kfree(data); <<<-- frees item but it is still on list The next time a hist with name is registered, it causes an u-a-f bug and the kernel can crash. Move the code around such that if event_trigger_register() succeeds, the next thing called is hist_trigger_enable() which adds it to the list. A bunch of actions is called if get_named_trigger_data() returns false. But that doesn't need to be called after event_trigger_register(), so it can be moved up, allowing event_trigger_register() to be called just before hist_trigger_enable() keeping them together and allowing the file->triggers to be properly populated. 
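The shape of the fix is the familiar "finish every fallible setup step before publishing the object" pattern. A minimal standalone illustration (the names here are generic, not the tracing code's):

#include <stdio.h>

struct trigger { int registered; };

static int create_actions(struct trigger *t)   { (void)t; return 0; }
static int register_trigger(struct trigger *t) { t->registered = 1; return 0; }

/*
 * Do everything that can fail first; only then publish/register, so an
 * error path never has to unwind a half-visible object.
 */
static int setup_trigger(struct trigger *t)
{
	int ret;

	ret = create_actions(t);	/* may fail: nothing published yet */
	if (ret)
		return ret;

	return register_trigger(t);	/* publish last */
}

int main(void)
{
	struct trigger t = { 0 };

	printf("%d %d\n", setup_trigger(&t), t.registered);
	return 0;
}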
Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250227163944.1c37f85f@gandalf.local.home Fixes: 067fe038e70f6 ("tracing: Add variable reference handling to hist triggers") Reported-by: Tomas Glozar Tested-by: Tomas Glozar Reviewed-by: Tom Zanussi Closes: https://lore.kernel.org/all/CAP4=nvTsxjckSBTz=Oe_UYh8keD9_sZC4i++4h72mJLic4_W4A@mail.gmail.com/ Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_hist.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 261163b00137a..ad7419e240556 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6724,27 +6724,27 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, if (existing_hist_update_only(glob, trigger_data, file)) goto out_free; - ret = event_trigger_register(cmd_ops, file, glob, trigger_data); - if (ret < 0) - goto out_free; + if (!get_named_trigger_data(trigger_data)) { - if (get_named_trigger_data(trigger_data)) - goto enable; + ret = create_actions(hist_data); + if (ret) + goto out_free; - ret = create_actions(hist_data); - if (ret) - goto out_unreg; + if (has_hist_vars(hist_data) || hist_data->n_var_refs) { + ret = save_hist_vars(hist_data); + if (ret) + goto out_free; + } - if (has_hist_vars(hist_data) || hist_data->n_var_refs) { - ret = save_hist_vars(hist_data); + ret = tracing_map_init(hist_data->map); if (ret) - goto out_unreg; + goto out_free; } - ret = tracing_map_init(hist_data->map); - if (ret) - goto out_unreg; -enable: + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); + if (ret < 0) + goto out_free; + ret = hist_trigger_enable(trigger_data, file); if (ret) goto out_unreg; -- GitLab From 3908b6baf2ac20138915b5ca98338b4f063954d8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 26 Feb 2025 15:27:03 +0100 Subject: [PATCH 967/989] selftests/ftrace: Let fprobe test consider already enabled functions The fprobe test fails on Fedora 41 since the fprobe test assumption that the number of enabled_functions is zero before the test starts is not necessarily true. Some user space tools, like systemd, add BPF programs that attach to functions. Those will show up in the enabled_functions table and must be taken into account by the fprobe test. Therefore count the number of lines of enabled_functions before tests start, and use that as base when comparing expected results. 
Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Alexander Gordeev Link: https://lore.kernel.org/20250226142703.910860-1-hca@linux.ibm.com Fixes: e85c5e9792b9 ("selftests/ftrace: Update fprobe test to check enabled_functions file") Signed-off-by: Heiko Carstens Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- .../test.d/dynevent/add_remove_fprobe.tc | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc index 449f9d8be7462..73f6c6fcecabe 100644 --- a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc +++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe.tc @@ -10,12 +10,16 @@ PLACE=$FUNCTION_FORK PLACE2="kmem_cache_free" PLACE3="schedule_timeout" +# Some functions may have BPF programs attached, therefore +# count already enabled_functions before tests start +ocnt=`cat enabled_functions | wc -l` + echo "f:myevent1 $PLACE" >> dynamic_events # Make sure the event is attached and is the only one grep -q $PLACE enabled_functions cnt=`cat enabled_functions | wc -l` -if [ $cnt -ne 1 ]; then +if [ $cnt -ne $((ocnt + 1)) ]; then exit_fail fi @@ -23,7 +27,7 @@ echo "f:myevent2 $PLACE%return" >> dynamic_events # It should till be the only attached function cnt=`cat enabled_functions | wc -l` -if [ $cnt -ne 1 ]; then +if [ $cnt -ne $((ocnt + 1)) ]; then exit_fail fi @@ -32,7 +36,7 @@ echo "f:myevent3 $PLACE2" >> dynamic_events grep -q $PLACE2 enabled_functions cnt=`cat enabled_functions | wc -l` -if [ $cnt -ne 2 ]; then +if [ $cnt -ne $((ocnt + 2)) ]; then exit_fail fi @@ -49,7 +53,7 @@ grep -q myevent1 dynamic_events # should still have 2 left cnt=`cat enabled_functions | wc -l` -if [ $cnt -ne 2 ]; then +if [ $cnt -ne $((ocnt + 2)) ]; then exit_fail fi @@ -57,7 +61,7 @@ echo > dynamic_events # Should have none left cnt=`cat enabled_functions | wc -l` -if [ $cnt -ne 0 ]; then +if [ $cnt -ne $ocnt ]; then exit_fail fi @@ -65,7 +69,7 @@ echo "f:myevent4 $PLACE" >> dynamic_events # Should only have one enabled cnt=`cat enabled_functions | wc -l` -if [ $cnt -ne 1 ]; then +if [ $cnt -ne $((ocnt + 1)) ]; then exit_fail fi @@ -73,7 +77,7 @@ echo > dynamic_events # Should have none left cnt=`cat enabled_functions | wc -l` -if [ $cnt -ne 0 ]; then +if [ $cnt -ne $ocnt ]; then exit_fail fi -- GitLab From a1a7eb89ca0b89dc1c326eeee2596f263291aca3 Mon Sep 17 00:00:00 2001 From: Nikolay Kuratov Date: Thu, 6 Feb 2025 12:01:56 +0300 Subject: [PATCH 968/989] ftrace: Avoid potential division by zero in function_stat_show() Check whether denominator expression x * (x - 1) * 1000 mod {2^32, 2^64} produce zero and skip stddev computation in that case. For now don't care about rec->counter * rec->counter overflow because rec->time * rec->time overflow will likely happen earlier. 
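The guarded form amounts to computing the denominator first and dividing only when it is non-zero. A standalone sketch of the same arithmetic with u64 wrap-around semantics (plain C, not the kernel helpers):

#include <stdint.h>
#include <stdio.h>

/* s^2 = (n * sum_sq - sum^2) / (n * (n - 1)), divided by a further 1000 as
 * in the profiler; returns 0 when the denominator is 0 (n <= 1, or a
 * modular wrap to zero). */
static uint64_t stddev_scaled(uint64_t n, uint64_t sum, uint64_t sum_sq)
{
	uint64_t denom = n * (n - 1) * 1000;
	uint64_t stddev = 0;

	if (denom) {
		stddev = n * sum_sq - sum * sum;
		stddev /= denom;
	}
	return stddev;
}

int main(void)
{
	/* n = 0 and n = 1 never reach the division */
	printf("%llu\n", (unsigned long long)stddev_scaled(0, 0, 0));
	printf("%llu\n", (unsigned long long)stddev_scaled(1, 5, 25));
	return 0;
}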
Cc: stable@vger.kernel.org Cc: Wen Yang Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250206090156.1561783-1-kniv@yandex-team.ru Fixes: e31f7939c1c27 ("ftrace: Avoid potential division by zero in function profiler") Signed-off-by: Nikolay Kuratov Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6b0c25761ccb1..fc88e0688daf0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -540,6 +540,7 @@ static int function_stat_show(struct seq_file *m, void *v) static struct trace_seq s; unsigned long long avg; unsigned long long stddev; + unsigned long long stddev_denom; #endif guard(mutex)(&ftrace_profile_lock); @@ -559,23 +560,19 @@ static int function_stat_show(struct seq_file *m, void *v) #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_puts(m, " "); - /* Sample standard deviation (s^2) */ - if (rec->counter <= 1) - stddev = 0; - else { - /* - * Apply Welford's method: - * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) - */ + /* + * Variance formula: + * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) + * Maybe Welford's method is better here? + * Divide only by 1000 for ns^2 -> us^2 conversion. + * trace_print_graph_duration will divide by 1000 again. + */ + stddev = 0; + stddev_denom = rec->counter * (rec->counter - 1) * 1000; + if (stddev_denom) { stddev = rec->counter * rec->time_squared - rec->time * rec->time; - - /* - * Divide only 1000 for ns^2 -> us^2 conversion. - * trace_print_graph_duration will divide 1000 again. - */ - stddev = div64_ul(stddev, - rec->counter * (rec->counter - 1) * 1000); + stddev = div64_ul(stddev, stddev_denom); } trace_seq_init(&s); -- GitLab From c5b0320bbf79548fbf058a3925a07c8f281beeab Mon Sep 17 00:00:00 2001 From: Alejandro Jimenez Date: Mon, 6 Jan 2025 19:14:13 +0000 Subject: [PATCH 969/989] iommu/amd: Preserve default DTE fields when updating Host Page Table Root When updating the page table root field on the DTE, avoid overwriting any bits that are already set. The earlier call to make_clear_dte() writes default values that all DTEs must have set (currently DTE[V]), and those must be preserved. Currently this doesn't cause problems since the page table root update is the first field that is set after make_clear_dte() is called, and DTE_FLAG_V is set again later along with the permission bits (IR/IW). Remove this redundant assignment too. 
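The difference between '=' and '|=' matters because make_clear_dte() has already written mandatory default bits. A trivial standalone illustration (the flag value is a stand-in, not the real DTE layout):

#include <stdint.h>
#include <stdio.h>

#define DTE_FLAG_V 0x1ULL	/* stand-in for the "entry valid" default bit */

int main(void)
{
	uint64_t dte  = DTE_FLAG_V;		/* defaults already set         */
	uint64_t root = 0x123456789000ULL;	/* hypothetical page-table root */

	dte |= root;				/* OR keeps DTE_FLAG_V ...      */
	printf("valid bit kept: %d\n", !!(dte & DTE_FLAG_V));

	dte = root;				/* ... plain assignment drops it */
	printf("valid bit kept: %d\n", !!(dte & DTE_FLAG_V));
	return 0;
}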
Fixes: fd5dff9de4be ("iommu/amd: Modify set_dte_entry() to use 256-bit DTE helpers") Signed-off-by: Alejandro Jimenez Reviewed-by: Dheeraj Kumar Srivastava Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/20250106191413.3107140-1-alejandro.j.jimenez@oracle.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index b48a72bd7b23d..cd5116d8c3b28 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2043,12 +2043,12 @@ static void set_dte_entry(struct amd_iommu *iommu, make_clear_dte(dev_data, dte, &new); if (domain->iop.mode != PAGE_MODE_NONE) - new.data[0] = iommu_virt_to_phys(domain->iop.root); + new.data[0] |= iommu_virt_to_phys(domain->iop.root); new.data[0] |= (domain->iop.mode & DEV_ENTRY_MODE_MASK) << DEV_ENTRY_MODE_SHIFT; - new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V; + new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW; /* * When SNP is enabled, we can only support TV=1 with non-zero domain ID. -- GitLab From 64f792981e35e191eb619f6f2fefab76cc7d6112 Mon Sep 17 00:00:00 2001 From: Jerry Snitselaar Date: Fri, 28 Feb 2025 18:27:25 +0800 Subject: [PATCH 970/989] iommu/vt-d: Remove device comparison in context_setup_pass_through_cb Remove the device comparison check in context_setup_pass_through_cb. pci_for_each_dma_alias already makes a decision on whether the callback function should be called for a device. With the check in place it will fail to create context entries for aliases as it walks up to the root bus. Fixes: 2031c469f816 ("iommu/vt-d: Add support for static identity domain") Closes: https://lore.kernel.org/linux-iommu/82499eb6-00b7-4f83-879a-e97b4144f576@linux.intel.com/ Cc: stable@vger.kernel.org Signed-off-by: Jerry Snitselaar Link: https://lore.kernel.org/r/20250224180316.140123-1-jsnitsel@redhat.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index cc46098f875b1..4d8d4593c9c81 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4378,9 +4378,6 @@ static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void * { struct device *dev = data; - if (dev != &pdev->dev) - return 0; - return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff); } -- GitLab From b150654f74bf0df8e6a7936d5ec51400d9ec06d8 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Fri, 28 Feb 2025 18:27:26 +0800 Subject: [PATCH 971/989] iommu/vt-d: Fix suspicious RCU usage Commit ("iommu/vt-d: Allocate DMAR fault interrupts locally") moved the call to enable_drhd_fault_handling() to a code path that does not hold any lock while traversing the drhd list. Fix it by ensuring the dmar_global_lock lock is held when traversing the drhd list. Without this fix, the following warning is triggered: ============================= WARNING: suspicious RCU usage 6.14.0-rc3 #55 Not tainted ----------------------------- drivers/iommu/intel/dmar.c:2046 RCU-list traversed in non-reader section!! 
other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 1 2 locks held by cpuhp/1/23: #0: ffffffff84a67c50 (cpu_hotplug_lock){++++}-{0:0}, at: cpuhp_thread_fun+0x87/0x2c0 #1: ffffffff84a6a380 (cpuhp_state-up){+.+.}-{0:0}, at: cpuhp_thread_fun+0x87/0x2c0 stack backtrace: CPU: 1 UID: 0 PID: 23 Comm: cpuhp/1 Not tainted 6.14.0-rc3 #55 Call Trace: dump_stack_lvl+0xb7/0xd0 lockdep_rcu_suspicious+0x159/0x1f0 ? __pfx_enable_drhd_fault_handling+0x10/0x10 enable_drhd_fault_handling+0x151/0x180 cpuhp_invoke_callback+0x1df/0x990 cpuhp_thread_fun+0x1ea/0x2c0 smpboot_thread_fn+0x1f5/0x2e0 ? __pfx_smpboot_thread_fn+0x10/0x10 kthread+0x12a/0x2d0 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x4a/0x60 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Holding the lock in enable_drhd_fault_handling() triggers a lockdep splat about a possible deadlock between dmar_global_lock and cpu_hotplug_lock. This is avoided by not holding dmar_global_lock when calling iommu_device_register(), which initiates the device probe process. Fixes: d74169ceb0d2 ("iommu/vt-d: Allocate DMAR fault interrupts locally") Reported-and-tested-by: Ido Schimmel Closes: https://lore.kernel.org/linux-iommu/Zx9OwdLIc_VoQ0-a@shredder.mtl.com/ Tested-by: Breno Leitao Cc: stable@vger.kernel.org Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20250218022422.2315082-1-baolu.lu@linux.intel.com Tested-by: Ido Schimmel Signed-off-by: Joerg Roedel --- drivers/iommu/intel/dmar.c | 1 + drivers/iommu/intel/iommu.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 9f424acf474e9..e540092d664d2 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -2043,6 +2043,7 @@ int enable_drhd_fault_handling(unsigned int cpu) /* * Enable fault control interrupt. */ + guard(rwsem_read)(&dmar_global_lock); for_each_iommu(iommu, drhd) { u32 fault_status; int ret; diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 4d8d4593c9c81..bf1f0c8143483 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3146,7 +3146,14 @@ int __init intel_iommu_init(void) iommu_device_sysfs_add(&iommu->iommu, NULL, intel_iommu_groups, "%s", iommu->name); + /* + * The iommu device probe is protected by the iommu_probe_device_lock. + * Release the dmar_global_lock before entering the device probe path + * to avoid unnecessary lock order splat. + */ + up_read(&dmar_global_lock); iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); + down_read(&dmar_global_lock); iommu_pmu_register(iommu); } -- GitLab From b654f7a51ffb386131de42aa98ed831f8c126546 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 28 Feb 2025 21:26:56 +0800 Subject: [PATCH 972/989] block: fix 'kmem_cache of name 'bio-108' already exists' Device mapper bioset often has big bio_slab size, which can be more than 1000, then 8byte can't hold the slab name any more, cause the kmem_cache allocation warning of 'kmem_cache of name 'bio-108' already exists'. 
Fix the warning by extending bio_slab->name to 12 bytes, but fix output of /proc/slabinfo Reported-by: Guangwu Zhang Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250228132656.2838008-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index f0c416e5931d9..6ac5983ba51e6 100644 --- a/block/bio.c +++ b/block/bio.c @@ -77,7 +77,7 @@ struct bio_slab { struct kmem_cache *slab; unsigned int slab_ref; unsigned int slab_size; - char name[8]; + char name[12]; }; static DEFINE_MUTEX(bio_slab_lock); static DEFINE_XARRAY(bio_slabs); -- GitLab From 64407f4b5807dc9dec8135e1bfd45d2cb11b4ea0 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 28 Feb 2025 16:03:47 +0300 Subject: [PATCH 973/989] gpiolib: Fix Oops in gpiod_direction_input_nonotify() The gpiod_direction_input_nonotify() function is supposed to return zero if the direction for the pin is input. But instead it accidentally returns GPIO_LINE_DIRECTION_IN (1) which will be cast into an ERR_PTR() in gpiochip_request_own_desc(). The callers dereference it and it leads to a crash. I changed gpiod_direction_output_raw_commit() just for consistency but returning GPIO_LINE_DIRECTION_OUT (0) is fine. Cc: stable@vger.kernel.org Fixes: 9d846b1aebbe ("gpiolib: check the return value of gpio_chip::get_direction()") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/254f3925-3015-4c9d-aac5-bb9b4b2cd2c5@stanley.mountain [Bartosz: moved the variable declarations to the top of the functions] Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index fc19df5a64c2b..8741600af7efb 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -2712,7 +2712,7 @@ EXPORT_SYMBOL_GPL(gpiod_direction_input); int gpiod_direction_input_nonotify(struct gpio_desc *desc) { - int ret = 0; + int ret = 0, dir; CLASS(gpio_chip_guard, guard)(desc); if (!guard.gc) @@ -2740,12 +2740,12 @@ int gpiod_direction_input_nonotify(struct gpio_desc *desc) ret = guard.gc->direction_input(guard.gc, gpio_chip_hwgpio(desc)); } else if (guard.gc->get_direction) { - ret = guard.gc->get_direction(guard.gc, + dir = guard.gc->get_direction(guard.gc, gpio_chip_hwgpio(desc)); - if (ret < 0) - return ret; + if (dir < 0) + return dir; - if (ret != GPIO_LINE_DIRECTION_IN) { + if (dir != GPIO_LINE_DIRECTION_IN) { gpiod_warn(desc, "%s: missing direction_input() operation and line is output\n", __func__); @@ -2764,7 +2764,7 @@ int gpiod_direction_input_nonotify(struct gpio_desc *desc) static int gpiod_direction_output_raw_commit(struct gpio_desc *desc, int value) { - int val = !!value, ret = 0; + int val = !!value, ret = 0, dir; CLASS(gpio_chip_guard, guard)(desc); if (!guard.gc) @@ -2788,12 +2788,12 @@ static int gpiod_direction_output_raw_commit(struct gpio_desc *desc, int value) } else { /* Check that we are in output mode if we can */ if (guard.gc->get_direction) { - ret = guard.gc->get_direction(guard.gc, + dir = guard.gc->get_direction(guard.gc, gpio_chip_hwgpio(desc)); - if (ret < 0) - return ret; + if (dir < 0) + return dir; - if (ret != GPIO_LINE_DIRECTION_OUT) { + if (dir != GPIO_LINE_DIRECTION_OUT) { gpiod_warn(desc, "%s: missing direction_output() operation\n", __func__); -- GitLab From c157d351460bcf202970e97e611cb6b54a3dd4a4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2025 23:37:08 +0100 Subject: 
[PATCH 974/989] intel_idle: Handle older CPUs, which stop the TSC in deeper C states, correctly The Intel idle driver is preferred over the ACPI processor idle driver, but fails to implement the workaround for Core2 generation CPUs, where the TSC stops in C2 and deeper C-states. This causes stalls and boot delays when the clocksource watchdog does not catch the unstable TSC before the CPU goes deep idle for the first time. The ACPI driver marks the TSC unstable when it detects that the CPU supports C2 or deeper and the CPU does not have a non-stop TSC. Add the equivalent workaround to the Intel idle driver to cure that. Fixes: 18734958e9bf ("intel_idle: Use ACPI _CST for processor models without C-state tables") Reported-by: Fab Stz Signed-off-by: Thomas Gleixner Tested-by: Fab Stz Cc: All applicable Closes: https://lore.kernel.org/all/10cf96aa-1276-4bd4-8966-c890377030c3@yahoo.fr Link: https://patch.msgid.link/87bjupfy7f.ffs@tglx Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 118fe1d37c226..0fdb1d1316c44 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #define INTEL_IDLE_VERSION "0.5.1" @@ -1799,6 +1800,9 @@ static void __init intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) if (intel_idle_state_needs_timer_stop(state)) state->flags |= CPUIDLE_FLAG_TIMER_STOP; + if (cx->type > ACPI_STATE_C1 && !boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) + mark_tsc_unstable("TSC halts in idle"); + state->enter = intel_idle; state->enter_s2idle = intel_idle_s2idle; } -- GitLab From cb380909ae3b1ebf14d6a455a4f92d7916d790cb Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 27 Feb 2025 15:06:30 -0800 Subject: [PATCH 975/989] vhost: return task creation error instead of NULL Let callers distinguish why the vhost task creation failed. No one currently cares why it failed, so no real runtime change from this patch, but that will not be the case for long.
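The NULL-to-error change follows the usual ERR_PTR() convention, so callers test with IS_ERR() and recover the errno with PTR_ERR(). A self-contained userspace approximation of that convention (these macros mimic, but are not, the kernel's):

#include <errno.h>
#include <stdio.h>

#define ERR_PTR(err) ((void *)(long)(err))
#define PTR_ERR(ptr) ((long)(ptr))
#define IS_ERR(ptr)  ((unsigned long)(ptr) >= (unsigned long)-4095)

static int dummy_task;

/* Hypothetical creator: returns a valid pointer or an encoded errno. */
static void *task_create(int fail_with)
{
	if (fail_with)
		return ERR_PTR((long)-fail_with);
	return &dummy_task;
}

int main(void)
{
	void *t = task_create(ENOMEM);

	if (IS_ERR(t))	/* the caller can now tell *why* creation failed */
		printf("create failed: %ld\n", PTR_ERR(t));
	return 0;
}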
Signed-off-by: Keith Busch Message-ID: <20250227230631.303431-2-kbusch@meta.com> Reviewed-by: Mike Christie Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- drivers/vhost/vhost.c | 2 +- kernel/vhost_task.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d4ac4a1f8b81b..18ca1ea6dc240 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -7471,7 +7471,7 @@ static void kvm_mmu_start_lpage_recovery(struct once *once) kvm_nx_huge_page_recovery_worker_kill, kvm, "kvm-nx-lpage-recovery"); - if (!nx_thread) + if (IS_ERR(nx_thread)) return; vhost_task_start(nx_thread); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 9ac25d08f473e..63612faeab727 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -666,7 +666,7 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev) vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed, worker, name); - if (!vtsk) + if (IS_ERR(vtsk)) goto free_worker; mutex_init(&worker->mutex); diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c index 8800f5acc0071..2ef2e1b800916 100644 --- a/kernel/vhost_task.c +++ b/kernel/vhost_task.c @@ -133,7 +133,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *), vtsk = kzalloc(sizeof(*vtsk), GFP_KERNEL); if (!vtsk) - return NULL; + return ERR_PTR(-ENOMEM); init_completion(&vtsk->exited); mutex_init(&vtsk->exit_mutex); vtsk->data = arg; @@ -145,7 +145,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *), tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args); if (IS_ERR(tsk)) { kfree(vtsk); - return NULL; + return ERR_PTR(PTR_ERR(tsk)); } vtsk->task = tsk; -- GitLab From 916b7f42b3b3b539a71c204a9b49fdc4ca92cd82 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 27 Feb 2025 15:06:31 -0800 Subject: [PATCH 976/989] kvm: retry nx_huge_page_recovery_thread creation A VMM may send a non-fatal signal to its threads, including vCPU tasks, at any time, and thus may signal vCPU tasks during KVM_RUN. If a vCPU task receives the signal while its trying to spawn the huge page recovery vhost task, then KVM_RUN will fail due to copy_process() returning -ERESTARTNOINTR. Rework call_once() to mark the call complete if and only if the called function succeeds, and plumb the function's true error code back to the call_once() invoker. This provides userspace with the correct, non-fatal error code so that the VMM doesn't terminate the VM on -ENOMEM, and allows subsequent KVM_RUN a succeed by virtue of retrying creation of the NX huge page task. 
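Stripped of the lock-free fast path, the reworked contract can be sketched as: record completion only when the callback succeeds, so later callers retry after a failure. A userspace sketch with a pthread mutex standing in for the kernel mutex (illustrative only):

#include <pthread.h>
#include <stdio.h>

struct once {
	pthread_mutex_t lock;
	int completed;
};

/* Run cb at most once successfully; a failing cb leaves the state retryable. */
static int call_once(struct once *o, int (*cb)(void))
{
	int r = 0;

	pthread_mutex_lock(&o->lock);
	if (!o->completed) {
		r = cb();
		if (r >= 0)
			o->completed = 1;	/* only mark done on success */
	}
	pthread_mutex_unlock(&o->lock);
	return r;
}

static int start_recovery(void) { return 0; }	/* pretend setup succeeds */

int main(void)
{
	static struct once recovery_once = { PTHREAD_MUTEX_INITIALIZER, 0 };

	printf("%d\n", call_once(&recovery_once, start_recovery));
	return 0;
}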
Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson [implemented the kvm user side] Signed-off-by: Keith Busch Message-ID: <20250227230631.303431-3-kbusch@meta.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 10 ++++----- include/linux/call_once.h | 47 ++++++++++++++++++++++++++++----------- 2 files changed, 38 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 18ca1ea6dc240..8160870398b90 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -7460,7 +7460,7 @@ static bool kvm_nx_huge_page_recovery_worker(void *data) return true; } -static void kvm_mmu_start_lpage_recovery(struct once *once) +static int kvm_mmu_start_lpage_recovery(struct once *once) { struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once); struct kvm *kvm = container_of(ka, struct kvm, arch); @@ -7472,12 +7472,13 @@ static void kvm_mmu_start_lpage_recovery(struct once *once) kvm, "kvm-nx-lpage-recovery"); if (IS_ERR(nx_thread)) - return; + return PTR_ERR(nx_thread); vhost_task_start(nx_thread); /* Make the task visible only once it is fully started. */ WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread); + return 0; } int kvm_mmu_post_init_vm(struct kvm *kvm) @@ -7485,10 +7486,7 @@ int kvm_mmu_post_init_vm(struct kvm *kvm) if (nx_hugepage_mitigation_hard_disabled) return 0; - call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery); - if (!kvm->arch.nx_huge_page_recovery_thread) - return -ENOMEM; - return 0; + return call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery); } void kvm_mmu_pre_destroy_vm(struct kvm *kvm) diff --git a/include/linux/call_once.h b/include/linux/call_once.h index 6261aa0b3fb00..13cd6469e7e56 100644 --- a/include/linux/call_once.h +++ b/include/linux/call_once.h @@ -26,20 +26,41 @@ do { \ __once_init((once), #once, &__key); \ } while (0) -static inline void call_once(struct once *once, void (*cb)(struct once *)) +/* + * call_once - Ensure a function has been called exactly once + * + * @once: Tracking struct + * @cb: Function to be called + * + * If @once has never completed successfully before, call @cb and, if + * it returns a zero or positive value, mark @once as completed. Return + * the value returned by @cb + * + * If @once has completed succesfully before, return 0. + * + * The call to @cb is implicitly surrounded by a mutex, though for + * efficiency the * function avoids taking it after the first call. + */ +static inline int call_once(struct once *once, int (*cb)(struct once *)) { - /* Pairs with atomic_set_release() below. */ - if (atomic_read_acquire(&once->state) == ONCE_COMPLETED) - return; - - guard(mutex)(&once->lock); - WARN_ON(atomic_read(&once->state) == ONCE_RUNNING); - if (atomic_read(&once->state) != ONCE_NOT_STARTED) - return; - - atomic_set(&once->state, ONCE_RUNNING); - cb(once); - atomic_set_release(&once->state, ONCE_COMPLETED); + int r, state; + + /* Pairs with atomic_set_release() below. */ + if (atomic_read_acquire(&once->state) == ONCE_COMPLETED) + return 0; + + guard(mutex)(&once->lock); + state = atomic_read(&once->state); + if (unlikely(state != ONCE_NOT_STARTED)) + return WARN_ON_ONCE(state != ONCE_COMPLETED) ? 
-EINVAL : 0; + + atomic_set(&once->state, ONCE_RUNNING); + r = cb(once); + if (r < 0) + atomic_set(&once->state, ONCE_NOT_STARTED); + else + atomic_set_release(&once->state, ONCE_COMPLETED); + return r; } #endif /* _LINUX_CALL_ONCE_H */ -- GitLab From a2f925a2f62254119cdaa360cfc9c0424bccd531 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 28 Feb 2025 13:26:04 +0100 Subject: [PATCH 977/989] Revert "ata: libata-core: Add ATA_QUIRK_NOLPM for Samsung SSD 870 QVO drives" This reverts commit cc77e2ce187d26cc66af3577bf896d7410eb25ab. It was reported that adding ATA_QUIRK_NOLPM for Samsung SSD 870 QVO drives breaks entering lower package states for certain systems. It turns out that Samsung SSD 870 QVO actually has working LPM when using a recent SSD firmware version. The author of commit cc77e2ce187d ("ata: libata-core: Add ATA_QUIRK_NOLPM for Samsung SSD 870 QVO drives") reported himself that only older SSD firmware versions have broken LPM: https://lore.kernel.org/stable/93c10d38-718c-459d-84a5-4d87680b4da7@debian.org/ Unfortunately, he did not specify which older firmware version he was using which had broken LPM. Let's revert this quirk, which has FW version field specified as NULL (which means that it applies for all Samsung SSD 870 QVO firmware versions) for now. Once the author reports which older firmware version(s) that are broken, we can create a more fine grained quirk, which populates the FW version field accordingly. Fixes: cc77e2ce187d ("ata: libata-core: Add ATA_QUIRK_NOLPM for Samsung SSD 870 QVO drives") Reported-by: Dieter Mummenschanz Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219747 Link: https://lore.kernel.org/r/20250228122603.91814-2-cassel@kernel.org Signed-off-by: Niklas Cassel --- drivers/ata/libata-core.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 63ec2f2184319..c085dd81ebe7f 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4143,10 +4143,6 @@ static const struct ata_dev_quirks_entry __ata_dev_quirks[] = { { "Samsung SSD 860*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM | ATA_QUIRK_NO_NCQ_ON_ATI }, - { "Samsung SSD 870 QVO*", NULL, ATA_QUIRK_NO_NCQ_TRIM | - ATA_QUIRK_ZERO_AFTER_TRIM | - ATA_QUIRK_NO_NCQ_ON_ATI | - ATA_QUIRK_NOLPM }, { "Samsung SSD 870*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM | ATA_QUIRK_NO_NCQ_ON_ATI }, -- GitLab From 7eb172143d5508b4da468ed59ee857c6e5e01da6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 2 Mar 2025 11:48:20 -0800 Subject: [PATCH 978/989] Linux 6.14-rc5 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 30dab4c8b0120..70bdbf2218fc5 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 14 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME = Baby Opossum Posse # *DOCUMENTATION* -- GitLab From 4363f02a39e25e80e68039b4323c570b0848ec66 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Mon, 3 Mar 2025 14:55:52 +0800 Subject: [PATCH 979/989] ASoC: Intel: sof_sdw: Fix unlikely uninitialized variable use in create_sdw_dailinks() Initialize current_be_id to 0 to handle the unlikely case when there are no devices connected to a DAI. In this case create_sdw_dailink() would return without touching the passed pointer to current_be_id. 
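The hazard is the classic uninitialised out-parameter: the callee may return before writing through the pointer, so the caller must provide a defined starting value. A tiny standalone illustration (the function and names are made up for the example):

#include <stdio.h>

/* If nothing is connected, return early without touching *be_id. */
static int create_link(int num_devs, int *be_id)
{
	if (!num_devs)
		return 0;
	*be_id = 42;
	return 0;
}

int main(void)
{
	int be_id = 0;	/* the fix: defined value even if the callee bails out */

	create_link(0, &be_id);
	printf("be_id = %d\n", be_id);
	return 0;
}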
Found by gcc -fanalyzer Fixes: 59bf457d8055 ("ASoC: intel: sof_sdw: Factor out SoundWire DAI creation") Signed-off-by: Peter Ujfalusi Cc: stable@vger.kernel.org Reviewed-by: Ranjani Sridharan Signed-off-by: Bard Liao Link: https://patch.msgid.link/20250303065552.78328-1-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/boards/sof_sdw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c index c13064c777261..90dafa810b2ec 100644 --- a/sound/soc/intel/boards/sof_sdw.c +++ b/sound/soc/intel/boards/sof_sdw.c @@ -954,7 +954,7 @@ static int create_sdw_dailinks(struct snd_soc_card *card, /* generate DAI links by each sdw link */ while (sof_dais->initialised) { - int current_be_id; + int current_be_id = 0; ret = create_sdw_dailink(card, sof_dais, dai_links, ¤t_be_id, codec_conf); -- GitLab From d776f016d24816f15033169dcd081f077b6c10f4 Mon Sep 17 00:00:00 2001 From: Alexey Klimov Date: Fri, 21 Feb 2025 04:40:24 +0000 Subject: [PATCH 980/989] ASoC: codecs: wsa884x: report temps to hwmon in millidegree of Celsius MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Temperatures are reported in units of Celsius however hwmon expects values to be in millidegree of Celsius. Userspace tools observe values close to zero and report it as "Not available" or incorrect values like 0C or 1C. Add a simple conversion to fix that. Before the change: wsa884x-virtual-0 Adapter: Virtual device temp1: +0.0°C -- wsa884x-virtual-0 Adapter: Virtual device temp1: +0.0°C Also reported as N/A before first amplifier power on. After this change and initial wsa884x power on: wsa884x-virtual-0 Adapter: Virtual device temp1: +39.0°C -- wsa884x-virtual-0 Adapter: Virtual device temp1: +37.0°C Tested on sm8550 only. Cc: Krzysztof Kozlowski Cc: Srinivas Kandagatla Signed-off-by: Alexey Klimov Link: https://patch.msgid.link/20250221044024.1207921-1-alexey.klimov@linaro.org Signed-off-by: Mark Brown --- sound/soc/codecs/wsa884x.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/wsa884x.c b/sound/soc/codecs/wsa884x.c index 86df5152c547b..560a2c04b6955 100644 --- a/sound/soc/codecs/wsa884x.c +++ b/sound/soc/codecs/wsa884x.c @@ -1875,7 +1875,7 @@ static int wsa884x_get_temp(struct wsa884x_priv *wsa884x, long *temp) * Reading temperature is possible only when Power Amplifier is * off. Report last cached data. */ - *temp = wsa884x->temperature; + *temp = wsa884x->temperature * 1000; return 0; } @@ -1934,7 +1934,7 @@ static int wsa884x_get_temp(struct wsa884x_priv *wsa884x, long *temp) if ((val > WSA884X_LOW_TEMP_THRESHOLD) && (val < WSA884X_HIGH_TEMP_THRESHOLD)) { wsa884x->temperature = val; - *temp = val; + *temp = val * 1000; ret = 0; } else { ret = -EAGAIN; -- GitLab From 3d6c9dd4cb3013fe83524949b914f1497855e3de Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sat, 22 Feb 2025 23:56:59 +0100 Subject: [PATCH 981/989] ASoC: tegra: Fix ADX S24_LE audio format Commit 4204eccc7b2a ("ASoC: tegra: Add support for S24_LE audio format") added support for the S24_LE audio format, but duplicated S16_LE in OUT_DAI() for ADX instead. Fix this by adding support for the S24_LE audio format. Compile-tested only. 
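The duplicate entry compiled silently because OR-ing the same format bit twice is a no-op, so the S24_LE bit was simply never advertised. A stand-alone illustration (bit positions chosen for the example; the authoritative SNDRV_PCM_FMTBIT_* values live in include/uapi/sound/asound.h):

#include <stdio.h>

#define FMTBIT_S16_LE	(1u << 2)
#define FMTBIT_S24_LE	(1u << 6)
#define FMTBIT_S32_LE	(1u << 10)

int main(void)
{
	/* Buggy mask: S16_LE listed twice, so S24_LE is missing. */
	unsigned int buggy = FMTBIT_S16_LE | FMTBIT_S16_LE | FMTBIT_S32_LE;
	/* Fixed mask: 24-bit support is actually advertised. */
	unsigned int fixed = FMTBIT_S16_LE | FMTBIT_S24_LE | FMTBIT_S32_LE;

	printf("S24_LE advertised: buggy=%d fixed=%d\n",
	       !!(buggy & FMTBIT_S24_LE), !!(fixed & FMTBIT_S24_LE));
	return 0;
}
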
Cc: stable@vger.kernel.org Fixes: 4204eccc7b2a ("ASoC: tegra: Add support for S24_LE audio format") Signed-off-by: Thorsten Blum Link: https://patch.msgid.link/20250222225700.539673-2-thorsten.blum@linux.dev Signed-off-by: Mark Brown --- sound/soc/tegra/tegra210_adx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/tegra/tegra210_adx.c b/sound/soc/tegra/tegra210_adx.c index 3e6e8f51f380b..0aa93b948378f 100644 --- a/sound/soc/tegra/tegra210_adx.c +++ b/sound/soc/tegra/tegra210_adx.c @@ -264,7 +264,7 @@ static const struct snd_soc_dai_ops tegra210_adx_out_dai_ops = { .rates = SNDRV_PCM_RATE_8000_192000, \ .formats = SNDRV_PCM_FMTBIT_S8 | \ SNDRV_PCM_FMTBIT_S16_LE | \ - SNDRV_PCM_FMTBIT_S16_LE | \ + SNDRV_PCM_FMTBIT_S24_LE | \ SNDRV_PCM_FMTBIT_S32_LE, \ }, \ .capture = { \ @@ -274,7 +274,7 @@ static const struct snd_soc_dai_ops tegra210_adx_out_dai_ops = { .rates = SNDRV_PCM_RATE_8000_192000, \ .formats = SNDRV_PCM_FMTBIT_S8 | \ SNDRV_PCM_FMTBIT_S16_LE | \ - SNDRV_PCM_FMTBIT_S16_LE | \ + SNDRV_PCM_FMTBIT_S24_LE | \ SNDRV_PCM_FMTBIT_S32_LE, \ }, \ .ops = &tegra210_adx_out_dai_ops, \ -- GitLab From 164b7dd4546b57c08b373e9e3cf315ff98cb032d Mon Sep 17 00:00:00 2001 From: Maciej Strozek Date: Tue, 4 Mar 2025 14:05:04 +0000 Subject: [PATCH 982/989] ASoC: cs42l43: Add jack delay debounce after suspend Hardware reports jack absent after reset/suspension regardless of jack state, so introduce an additional delay only in suspension case to allow proper detection to take place after a short delay. Signed-off-by: Maciej Strozek Reviewed-by: Charles Keepax Link: https://patch.msgid.link/20250304140504.139245-1-mstrozek@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs42l43-jack.c | 13 ++++++++++--- sound/soc/codecs/cs42l43.c | 15 ++++++++++++++- sound/soc/codecs/cs42l43.h | 3 +++ 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/sound/soc/codecs/cs42l43-jack.c b/sound/soc/codecs/cs42l43-jack.c index d9ab003e166bf..ac19a572fe70c 100644 --- a/sound/soc/codecs/cs42l43-jack.c +++ b/sound/soc/codecs/cs42l43-jack.c @@ -167,7 +167,7 @@ int cs42l43_set_jack(struct snd_soc_component *component, autocontrol |= 0x3 << CS42L43_JACKDET_MODE_SHIFT; ret = cs42l43_find_index(priv, "cirrus,tip-fall-db-ms", 500, - NULL, cs42l43_accdet_db_ms, + &priv->tip_fall_db_ms, cs42l43_accdet_db_ms, ARRAY_SIZE(cs42l43_accdet_db_ms)); if (ret < 0) goto error; @@ -175,7 +175,7 @@ int cs42l43_set_jack(struct snd_soc_component *component, tip_deb |= ret << CS42L43_TIPSENSE_FALLING_DB_TIME_SHIFT; ret = cs42l43_find_index(priv, "cirrus,tip-rise-db-ms", 500, - NULL, cs42l43_accdet_db_ms, + &priv->tip_rise_db_ms, cs42l43_accdet_db_ms, ARRAY_SIZE(cs42l43_accdet_db_ms)); if (ret < 0) goto error; @@ -764,6 +764,8 @@ void cs42l43_tip_sense_work(struct work_struct *work) error: mutex_unlock(&priv->jack_lock); + priv->suspend_jack_debounce = false; + pm_runtime_mark_last_busy(priv->dev); pm_runtime_put_autosuspend(priv->dev); } @@ -771,14 +773,19 @@ void cs42l43_tip_sense_work(struct work_struct *work) irqreturn_t cs42l43_tip_sense(int irq, void *data) { struct cs42l43_codec *priv = data; + unsigned int db_delay = priv->tip_debounce_ms; cancel_delayed_work(&priv->bias_sense_timeout); cancel_delayed_work(&priv->tip_sense_work); cancel_delayed_work(&priv->button_press_work); cancel_work(&priv->button_release_work); + // Ensure delay after suspend is long enough to avoid false detection + if (priv->suspend_jack_debounce) + db_delay += priv->tip_fall_db_ms + priv->tip_rise_db_ms; + 
queue_delayed_work(system_long_wq, &priv->tip_sense_work, - msecs_to_jiffies(priv->tip_debounce_ms)); + msecs_to_jiffies(db_delay)); return IRQ_HANDLED; } diff --git a/sound/soc/codecs/cs42l43.c b/sound/soc/codecs/cs42l43.c index d2a2daefc2ec6..4257dbefe9dd1 100644 --- a/sound/soc/codecs/cs42l43.c +++ b/sound/soc/codecs/cs42l43.c @@ -2402,9 +2402,22 @@ static int cs42l43_codec_runtime_resume(struct device *dev) return 0; } +static int cs42l43_codec_runtime_force_suspend(struct device *dev) +{ + struct cs42l43_codec *priv = dev_get_drvdata(dev); + + dev_dbg(priv->dev, "Runtime suspend\n"); + + priv->suspend_jack_debounce = true; + + pm_runtime_force_suspend(dev); + + return 0; +} + static const struct dev_pm_ops cs42l43_codec_pm_ops = { RUNTIME_PM_OPS(NULL, cs42l43_codec_runtime_resume, NULL) - SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume) + SET_SYSTEM_SLEEP_PM_OPS(cs42l43_codec_runtime_force_suspend, pm_runtime_force_resume) }; static const struct platform_device_id cs42l43_codec_id_table[] = { diff --git a/sound/soc/codecs/cs42l43.h b/sound/soc/codecs/cs42l43.h index 9c144e129535f..1cd9d8a71c439 100644 --- a/sound/soc/codecs/cs42l43.h +++ b/sound/soc/codecs/cs42l43.h @@ -78,6 +78,8 @@ struct cs42l43_codec { bool use_ring_sense; unsigned int tip_debounce_ms; + unsigned int tip_fall_db_ms; + unsigned int tip_rise_db_ms; unsigned int bias_low; unsigned int bias_sense_ua; unsigned int bias_ramp_ms; @@ -95,6 +97,7 @@ struct cs42l43_codec { bool button_detect_running; bool jack_present; int jack_override; + bool suspend_jack_debounce; struct work_struct hp_ilimit_work; struct delayed_work hp_ilimit_clear_work; -- GitLab From 927e6bec5cf3624665b0a2e9f64a1d32f3d22cdd Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Wed, 5 Mar 2025 21:41:13 +0800 Subject: [PATCH 983/989] ASoC: rt1320: set wake_capable = 0 explicitly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "generic_new_peripheral_assigned: invalid dev_num 1, wake supported 1" is reported by our internal CI test. Rt1320's wake feature is not used in Linux and that's why it is not in the wake_capable_list[] list in intel_auxdevice.c. However, BIOS may set it as wake-capable. Overwrite wake_capable to 0 in the codec driver to align with wake_capable_list[]. Signed-off-by: Bard Liao Reviewed-by: Péter Ujfalusi Reviewed-by: Ranjani Sridharan Acked-by: Shuming Fan Link: https://patch.msgid.link/20250305134113.201326-1-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt1320-sdw.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/soc/codecs/rt1320-sdw.c b/sound/soc/codecs/rt1320-sdw.c index 3510c3819074b..d83b236a04503 100644 --- a/sound/soc/codecs/rt1320-sdw.c +++ b/sound/soc/codecs/rt1320-sdw.c @@ -535,6 +535,9 @@ static int rt1320_read_prop(struct sdw_slave *slave) /* set the timeout values */ prop->clk_stop_timeout = 64; + /* BIOS may set wake_capable. Make sure it is 0 as wake events are disabled. */ + prop->wake_capable = 0; + return 0; } -- GitLab From 0eba2a7e858907a746ba69cd002eb9eb4dbd7bf3 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Fri, 28 Feb 2025 15:14:56 +0000 Subject: [PATCH 984/989] ASoC: ops: Consistently treat platform_max as control value This reverts commit 9bdd10d57a88 ("ASoC: ops: Shift tested values in snd_soc_put_volsw() by +min"), and makes some additional related updates. There are two ways the platform_max could be interpreted; the maximum register value, or the maximum value the control can be set to. 
The patch moved from treating the value as a control value to a register one. When the patch was applied it was technically correct as snd_soc_limit_volume() also used the register interpretation. However, even then most of the other usages treated platform_max as a control value, and snd_soc_limit_volume() has since been updated to also do so in commit fb9ad24485087 ("ASoC: ops: add correct range check for limiting volume"). That patch however, missed updating snd_soc_put_volsw() back to the control interpretation, and fixing snd_soc_info_volsw_range(). The control interpretation makes more sense as limiting is typically done from the machine driver, so it is appropriate to use the customer facing representation rather than the internal codec representation. Update all the code to consistently use this interpretation of platform_max. Finally, also add some comments to the soc_mixer_control struct to hopefully avoid further patches switching between the two approaches. Fixes: fb9ad24485087 ("ASoC: ops: add correct range check for limiting volume") Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20250228151456.3703342-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/soc.h | 5 ++++- sound/soc/soc-ops.c | 15 +++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/include/sound/soc.h b/include/sound/soc.h index fcdb5adfcd5ec..b3e84bc47c6fd 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -1261,7 +1261,10 @@ void snd_soc_close_delayed_work(struct snd_soc_pcm_runtime *rtd); /* mixer control */ struct soc_mixer_control { - int min, max, platform_max; + /* Minimum and maximum specified as written to the hardware */ + int min, max; + /* Limited maximum value specified as presented through the control */ + int platform_max; int reg, rreg; unsigned int shift, rshift; unsigned int sign_bit; diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c index 19928f098d8dc..b0e4e4168f38d 100644 --- a/sound/soc/soc-ops.c +++ b/sound/soc/soc-ops.c @@ -337,7 +337,7 @@ int snd_soc_put_volsw(struct snd_kcontrol *kcontrol, if (ucontrol->value.integer.value[0] < 0) return -EINVAL; val = ucontrol->value.integer.value[0]; - if (mc->platform_max && ((int)val + min) > mc->platform_max) + if (mc->platform_max && val > mc->platform_max) return -EINVAL; if (val > max - min) return -EINVAL; @@ -350,7 +350,7 @@ int snd_soc_put_volsw(struct snd_kcontrol *kcontrol, if (ucontrol->value.integer.value[1] < 0) return -EINVAL; val2 = ucontrol->value.integer.value[1]; - if (mc->platform_max && ((int)val2 + min) > mc->platform_max) + if (mc->platform_max && val2 > mc->platform_max) return -EINVAL; if (val2 > max - min) return -EINVAL; @@ -503,17 +503,16 @@ int snd_soc_info_volsw_range(struct snd_kcontrol *kcontrol, { struct soc_mixer_control *mc = (struct soc_mixer_control *)kcontrol->private_value; - int platform_max; - int min = mc->min; + int max; - if (!mc->platform_max) - mc->platform_max = mc->max; - platform_max = mc->platform_max; + max = mc->max - mc->min; + if (mc->platform_max && mc->platform_max < max) + max = mc->platform_max; uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; uinfo->count = snd_soc_volsw_is_stereo(mc) ? 
2 : 1; uinfo->value.integer.min = 0; - uinfo->value.integer.max = platform_max - min; + uinfo->value.integer.max = max; return 0; } -- GitLab From e26f1cfeac6712516bfeed80890da664f4f2e88a Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Thu, 6 Mar 2025 13:32:54 +0000 Subject: [PATCH 985/989] ASoC: cs42l43: Fix maximum ADC Volume The range of ADC volume is -1 -> 3 (-6 to 18dB) so the number of levels should actually be 4. Fixes: fc918cbe874e ("ASoC: cs42l43: Add support for the cs42l43") Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20250306133254.1861046-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs42l43.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/cs42l43.c b/sound/soc/codecs/cs42l43.c index 4257dbefe9dd1..d307b56a7f38e 100644 --- a/sound/soc/codecs/cs42l43.c +++ b/sound/soc/codecs/cs42l43.c @@ -1146,7 +1146,7 @@ static const struct snd_kcontrol_new cs42l43_controls[] = { SOC_DOUBLE_R_SX_TLV("ADC Volume", CS42L43_ADC_B_CTRL1, CS42L43_ADC_B_CTRL2, CS42L43_ADC_PGA_GAIN_SHIFT, - 0xF, 5, cs42l43_adc_tlv), + 0xF, 4, cs42l43_adc_tlv), SOC_DOUBLE("PDM1 Invert Switch", CS42L43_DMIC_PDM_CTRL, CS42L43_PDM1L_INV_SHIFT, CS42L43_PDM1R_INV_SHIFT, 1, 0), -- GitLab From 0704a15b930cf97073ce091a0cd7ad32f2304329 Mon Sep 17 00:00:00 2001 From: Thomas Mizrahi Date: Sat, 8 Mar 2025 01:06:28 -0300 Subject: [PATCH 986/989] ASoC: amd: yc: Support mic on another Lenovo ThinkPad E16 Gen 2 model The internal microphone on the Lenovo ThinkPad E16 model requires a quirk entry to work properly. This was fixed in a previous patch (linked below), but depending on the specific variant of the model, the product name may be "21M5" or "21M6". The following patch fixed this issue for the 21M5 variant: https://lore.kernel.org/all/20240725065442.9293-1-tiwai@suse.de/ This patch adds support for the microphone on the 21M6 variant. Link: https://github.com/ramaureirac/thinkpad-e14-linux/issues/31 Cc: stable@vger.kernel.org Signed-off-by: Thomas Mizrahi Link: https://patch.msgid.link/20250308041303.198765-1-thomasmizra@gmail.com Signed-off-by: Mark Brown --- sound/soc/amd/yc/acp6x-mach.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sound/soc/amd/yc/acp6x-mach.c b/sound/soc/amd/yc/acp6x-mach.c index b16587d8f97a8..a7637056972aa 100644 --- a/sound/soc/amd/yc/acp6x-mach.c +++ b/sound/soc/amd/yc/acp6x-mach.c @@ -248,6 +248,13 @@ static const struct dmi_system_id yc_acp_quirk_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "21M5"), } }, + { + .driver_data = &acp6x_card, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "21M6"), + } + }, { .driver_data = &acp6x_card, .matches = { -- GitLab From 247fba13416af65b155949bae582d55c310f58b6 Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Mon, 10 Mar 2025 16:04:40 +0800 Subject: [PATCH 987/989] ASoC: rt722-sdca: add missing readable registers SDW_SDCA_CTL(FUNC_NUM_MIC_ARRAY, RT722_SDCA_ENT_FU15, RT722_SDCA_CTL_FU_CH_GAIN, CH_01) ... SDW_SDCA_CTL(FUNC_NUM_MIC_ARRAY, RT722_SDCA_ENT_FU15, RT722_SDCA_CTL_FU_CH_GAIN, CH_04) are used by the "FU15 Boost Volume" control, but not marked as readable. And the mbq size are 2 for those registers. 
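For background, regmap normally refuses reads of registers that the driver's readable callback does not accept, which is why controls backed by unlisted registers cannot be read. A generic sketch of the predicate style used here (only the two address ranges quoted from the driver are real; the rest is illustrative):

#include <stdbool.h>
#include <stdio.h>

static bool mbq_readable(unsigned int reg)
{
	switch (reg) {
	case 0x6100067:
	case 0x6100070 ... 0x610007c:	/* GCC case-range, as used in the driver */
		return true;
	default:
		return false;		/* reads of anything else are refused */
	}
}

int main(void)
{
	printf("0x6100071 readable: %d\n", mbq_readable(0x6100071));
	printf("0x7100000 readable: %d\n", mbq_readable(0x7100000));
	return 0;
}
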
Fixes: 7f5d6036ca005 ("ASoC: rt722-sdca: Add RT722 SDCA driver") Signed-off-by: Bard Liao Reviewed-by: Ranjani Sridharan Reviewed-by: Shuming Fan Link: https://patch.msgid.link/20250310080440.58797-1-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt722-sdca-sdw.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/soc/codecs/rt722-sdca-sdw.c b/sound/soc/codecs/rt722-sdca-sdw.c index 25fc13687bc83..4d3043627bd04 100644 --- a/sound/soc/codecs/rt722-sdca-sdw.c +++ b/sound/soc/codecs/rt722-sdca-sdw.c @@ -86,6 +86,10 @@ static bool rt722_sdca_mbq_readable_register(struct device *dev, unsigned int re case 0x6100067: case 0x6100070 ... 0x610007c: case 0x6100080: + case SDW_SDCA_CTL(FUNC_NUM_MIC_ARRAY, RT722_SDCA_ENT_FU15, RT722_SDCA_CTL_FU_CH_GAIN, + CH_01) ... + SDW_SDCA_CTL(FUNC_NUM_MIC_ARRAY, RT722_SDCA_ENT_FU15, RT722_SDCA_CTL_FU_CH_GAIN, + CH_04): case SDW_SDCA_CTL(FUNC_NUM_MIC_ARRAY, RT722_SDCA_ENT_USER_FU1E, RT722_SDCA_CTL_FU_VOLUME, CH_01): case SDW_SDCA_CTL(FUNC_NUM_MIC_ARRAY, RT722_SDCA_ENT_USER_FU1E, RT722_SDCA_CTL_FU_VOLUME, -- GitLab From ed92bc5264c4357d4fca292c769ea9967cd3d3b6 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 10 Mar 2025 18:45:36 +0100 Subject: [PATCH 988/989] ASoC: codecs: wm0010: Fix error handling path in wm0010_spi_probe() Free some resources in the error handling path of the probe, as already done in the remove function. Fixes: e3523e01869d ("ASoC: wm0010: Add initial wm0010 DSP driver") Fixes: fd8b96574456 ("ASoC: wm0010: Clear IRQ as wake source and include missing header") Signed-off-by: Christophe JAILLET Reviewed-by: Charles Keepax Link: https://patch.msgid.link/5139ba1ab8c4c157ce04e56096a0f54a1683195c.1741549792.git.christophe.jaillet@wanadoo.fr Signed-off-by: Mark Brown --- sound/soc/codecs/wm0010.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/wm0010.c b/sound/soc/codecs/wm0010.c index edd2cb185c42c..9e67fbfc2ccaf 100644 --- a/sound/soc/codecs/wm0010.c +++ b/sound/soc/codecs/wm0010.c @@ -920,7 +920,7 @@ static int wm0010_spi_probe(struct spi_device *spi) if (ret) { dev_err(wm0010->dev, "Failed to set IRQ %d as wake source: %d\n", irq, ret); - return ret; + goto free_irq; } if (spi->max_speed_hz) @@ -932,9 +932,18 @@ static int wm0010_spi_probe(struct spi_device *spi) &soc_component_dev_wm0010, wm0010_dai, ARRAY_SIZE(wm0010_dai)); if (ret < 0) - return ret; + goto disable_irq_wake; return 0; + +disable_irq_wake: + irq_set_irq_wake(wm0010->irq, 0); + +free_irq: + if (wm0010->irq) + free_irq(wm0010->irq, wm0010); + + return ret; } static void wm0010_spi_remove(struct spi_device *spi) -- GitLab From 658fb7fe8e7f4014ea17a4da0e0c1d9bc319fa35 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 5 Mar 2025 18:27:32 +0100 Subject: [PATCH 989/989] ASoC: cs42l43: convert to SYSTEM_SLEEP_PM_OPS The custom suspend function causes a build warning when CONFIG_PM_SLEEP is disabled: sound/soc/codecs/cs42l43.c:2405:12: error: unused function 'cs42l43_codec_runtime_force_suspend' [-Werror,-Wunused-function] Change SET_SYSTEM_SLEEP_PM_OPS() to the newer SYSTEM_SLEEP_PM_OPS(), to avoid this. 
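A simplified sketch of the difference (the authoritative definitions are in include/linux/pm.h): with CONFIG_PM_SLEEP disabled, the older macro expands to nothing, so a callback referenced only there becomes unused; the newer one keeps a reference the compiler can see via pm_sleep_ptr() and lets the optimizer discard the body.

/* Simplified; the real macros also set freeze/thaw/poweroff/restore. */
#ifdef CONFIG_PM_SLEEP
#define SET_SYSTEM_SLEEP_PM_OPS(s, r)	.suspend = (s), .resume = (r),
#else
#define SET_SYSTEM_SLEEP_PM_OPS(s, r)	/* empty: (s) is never referenced */
#endif

/* pm_sleep_ptr() evaluates to NULL when CONFIG_PM_SLEEP is off, but the
 * function name still appears in the expression, so no unused warning. */
#define pm_sleep_ptr(fn)		(IS_ENABLED(CONFIG_PM_SLEEP) ? (fn) : NULL)
#define SYSTEM_SLEEP_PM_OPS(s, r)	\
	.suspend = pm_sleep_ptr(s),	\
	.resume = pm_sleep_ptr(r),
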
Fixes: 164b7dd4546b ("ASoC: cs42l43: Add jack delay debounce after suspend") Signed-off-by: Arnd Bergmann Reviewed-by: Maciej Strozek Reviewed-by: Charles Keepax Link: https://patch.msgid.link/20250305172738.3437513-1-arnd@kernel.org Signed-off-by: Mark Brown --- sound/soc/codecs/cs42l43.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/cs42l43.c b/sound/soc/codecs/cs42l43.c index d307b56a7f38e..ea84ac64c775e 100644 --- a/sound/soc/codecs/cs42l43.c +++ b/sound/soc/codecs/cs42l43.c @@ -2417,7 +2417,7 @@ static int cs42l43_codec_runtime_force_suspend(struct device *dev) static const struct dev_pm_ops cs42l43_codec_pm_ops = { RUNTIME_PM_OPS(NULL, cs42l43_codec_runtime_resume, NULL) - SET_SYSTEM_SLEEP_PM_OPS(cs42l43_codec_runtime_force_suspend, pm_runtime_force_resume) + SYSTEM_SLEEP_PM_OPS(cs42l43_codec_runtime_force_suspend, pm_runtime_force_resume) }; static const struct platform_device_id cs42l43_codec_id_table[] = { -- GitLab