From 1f1dc442c57ec61c08d21d47e4c5b4f16446fe00 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Fri, 22 Mar 2024 14:10:26 -0700 Subject: mshyperv: Introduce hv_numa_node_to_pxm_info() Factor out logic for converting numa node to hv_proximity_domain_info into a helper function. Change hv_proximity_domain_info to a struct to improve readability. While at it, rename hv_add_logical_processor_* structs to the correct hv_input_/hv_output_ prefix, and remove the flags field which is not present in the ABI. Signed-off-by: Nuno Das Neves Reviewed-by: Wei Liu Link: https://lore.kernel.org/r/1711141826-9458-1-git-send-email-nunodasneves@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1711141826-9458-1-git-send-email-nunodasneves@linux.microsoft.com> --- include/asm-generic/hyperv-tlfs.h | 19 +++++++------------ include/asm-generic/mshyperv.h | 14 ++++++++++++++ 2 files changed, 21 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index fdac4a1714ec..69f3f68dc249 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -512,13 +512,9 @@ struct hv_proximity_domain_flags { u32 proximity_info_valid : 1; } __packed; -/* Not a union in windows but useful for zeroing */ -union hv_proximity_domain_info { - struct { - u32 domain_id; - struct hv_proximity_domain_flags flags; - }; - u64 as_uint64; +struct hv_proximity_domain_info { + u32 domain_id; + struct hv_proximity_domain_flags flags; } __packed; struct hv_lp_startup_status { @@ -532,14 +528,13 @@ struct hv_lp_startup_status { } __packed; /* HvAddLogicalProcessor hypercall */ -struct hv_add_logical_processor_in { +struct hv_input_add_logical_processor { u32 lp_index; u32 apic_id; - union hv_proximity_domain_info proximity_domain_info; - u64 flags; + struct hv_proximity_domain_info proximity_domain_info; } __packed; -struct hv_add_logical_processor_out { +struct hv_output_add_logical_processor { struct hv_lp_startup_status startup_status; } __packed; @@ -560,7 +555,7 @@ struct hv_create_vp { u8 padding[3]; u8 subnode_type; u64 subnode_id; - union hv_proximity_domain_info proximity_domain_info; + struct hv_proximity_domain_info proximity_domain_info; u64 flags; } __packed; diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index 430f0ae0dde2..cfb0b51a0998 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -67,6 +68,19 @@ extern u64 hv_do_fast_hypercall8(u16 control, u64 input8); bool hv_isolation_type_snp(void); bool hv_isolation_type_tdx(void); +static inline struct hv_proximity_domain_info hv_numa_node_to_pxm_info(int node) +{ + struct hv_proximity_domain_info pxm_info = {}; + + if (node != NUMA_NO_NODE) { + pxm_info.domain_id = node_to_pxm(node); + pxm_info.flags.proximity_info_valid = 1; + pxm_info.flags.proximity_preferred = 1; + } + + return pxm_info; +} + /* Helper functions that provide a consistent pattern for checking Hyper-V hypercall status. */ static inline int hv_result(u64 status) { -- cgit v1.2.3 From f9eeb6bb13fb5d7af1ea5b74a10b1f8ead962540 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 21 Mar 2024 15:08:13 +0200 Subject: ALSA: hda: Add pplcllpl/u members to hdac_ext_stream The pplcllpl/u can be used to save the Link Connection Linear Link Position register value to be used for compensation of the LLP register value in case the counter is not reset (after pause/resume or stop/start without closing the stream). The LLP can be used along with PPHCLDP to calculate delay caused by the DSP processing for HDA links. Cc: stable@vger.kernel.org # 6.8 Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Pierre-Louis Bossart Link: https://msgid.link/r/20240321130814.4412-17-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- include/sound/hdaudio_ext.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/sound/hdaudio_ext.h b/include/sound/hdaudio_ext.h index a8bebac1e4b2..957295364a5e 100644 --- a/include/sound/hdaudio_ext.h +++ b/include/sound/hdaudio_ext.h @@ -56,6 +56,9 @@ struct hdac_ext_stream { u32 pphcldpl; u32 pphcldpu; + u32 pplcllpl; + u32 pplcllpu; + bool decoupled:1; bool link_locked:1; bool link_prepared; -- cgit v1.2.3 From e89c928bedd77d181edc2df01cb6672184775140 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 5 Mar 2024 18:48:39 +0000 Subject: KVM: arm64: Fix host-programmed guest events in nVHE Programming PMU events in the host that count during guest execution is a feature supported by perf, e.g. perf stat -e cpu_cycles:G ./lkvm run While this works for VHE, the guest/host event bitmaps are not carried through to the hypervisor in the nVHE configuration. Make kvm_pmu_update_vcpu_events() conditional on whether or not _hardware_ supports PMUv3 rather than if the vCPU as vPMU enabled. Cc: stable@vger.kernel.org Fixes: 84d751a019a9 ("KVM: arm64: Pass pmu events to hyp via vcpu") Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20240305184840.636212-3-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- include/kvm/arm_pmu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index eb4c369a79eb..35d4ca4f6122 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -86,7 +86,7 @@ void kvm_vcpu_pmu_resync_el0(void); */ #define kvm_pmu_update_vcpu_events(vcpu) \ do { \ - if (!has_vhe() && kvm_vcpu_has_pmu(vcpu)) \ + if (!has_vhe() && kvm_arm_support_pmu_v3()) \ vcpu->arch.pmu.events = *kvm_get_pmu_events(); \ } while (0) -- cgit v1.2.3 From 0462c56c290a99a7f03e817ae5b843116dfb575c Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Mon, 25 Mar 2024 16:21:25 +0100 Subject: driver core: Introduce device_link_wait_removal() The commit 80dd33cf72d1 ("drivers: base: Fix device link removal") introduces a workqueue to release the consumer and supplier devices used in the devlink. In the job queued, devices are release and in turn, when all the references to these devices are dropped, the release function of the device itself is called. Nothing is present to provide some synchronisation with this workqueue in order to ensure that all ongoing releasing operations are done and so, some other operations can be started safely. For instance, in the following sequence: 1) of_platform_depopulate() 2) of_overlay_remove() During the step 1, devices are released and related devlinks are removed (jobs pushed in the workqueue). During the step 2, OF nodes are destroyed but, without any synchronisation with devlink removal jobs, of_overlay_remove() can raise warnings related to missing of_node_put(): ERROR: memory leak, expected refcount 1 instead of 2 Indeed, the missing of_node_put() call is going to be done, too late, from the workqueue job execution. Introduce device_link_wait_removal() to offer a way to synchronize operations waiting for the end of devlink removals (i.e. end of workqueue jobs). Also, as a flushing operation is done on the workqueue, the workqueue used is moved from a system-wide workqueue to a local one. Cc: stable@vger.kernel.org Signed-off-by: Herve Codina Tested-by: Luca Ceresoli Reviewed-by: Nuno Sa Reviewed-by: Saravana Kannan Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240325152140.198219-2-herve.codina@bootlin.com Signed-off-by: Rob Herring --- drivers/base/core.c | 26 +++++++++++++++++++++++--- include/linux/device.h | 1 + 2 files changed, 24 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/base/core.c b/drivers/base/core.c index b93f3c5716ae..5f4e03336e68 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -44,6 +44,7 @@ static bool fw_devlink_is_permissive(void); static void __fw_devlink_link_to_consumers(struct device *dev); static bool fw_devlink_drv_reg_done; static bool fw_devlink_best_effort; +static struct workqueue_struct *device_link_wq; /** * __fwnode_link_add - Create a link between two fwnode_handles. @@ -533,12 +534,26 @@ static void devlink_dev_release(struct device *dev) /* * It may take a while to complete this work because of the SRCU * synchronization in device_link_release_fn() and if the consumer or - * supplier devices get deleted when it runs, so put it into the "long" - * workqueue. + * supplier devices get deleted when it runs, so put it into the + * dedicated workqueue. */ - queue_work(system_long_wq, &link->rm_work); + queue_work(device_link_wq, &link->rm_work); } +/** + * device_link_wait_removal - Wait for ongoing devlink removal jobs to terminate + */ +void device_link_wait_removal(void) +{ + /* + * devlink removal jobs are queued in the dedicated work queue. + * To be sure that all removal jobs are terminated, ensure that any + * scheduled work has run to completion. + */ + flush_workqueue(device_link_wq); +} +EXPORT_SYMBOL_GPL(device_link_wait_removal); + static struct class devlink_class = { .name = "devlink", .dev_groups = devlink_groups, @@ -4164,9 +4179,14 @@ int __init devices_init(void) sysfs_dev_char_kobj = kobject_create_and_add("char", dev_kobj); if (!sysfs_dev_char_kobj) goto char_kobj_err; + device_link_wq = alloc_workqueue("device_link_wq", 0, 0); + if (!device_link_wq) + goto wq_err; return 0; + wq_err: + kobject_put(sysfs_dev_char_kobj); char_kobj_err: kobject_put(sysfs_dev_block_kobj); block_kobj_err: diff --git a/include/linux/device.h b/include/linux/device.h index 97c4b046c09d..b9f5464f44ed 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1247,6 +1247,7 @@ void device_link_del(struct device_link *link); void device_link_remove(void *consumer, struct device *supplier); void device_links_supplier_sync_state_pause(void); void device_links_supplier_sync_state_resume(void); +void device_link_wait_removal(void); /* Create alias, so I can be autoloaded. */ #define MODULE_ALIAS_CHARDEV(major,minor) \ -- cgit v1.2.3 From ddd65e19c60140673ea9f7249af0a672f1820623 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 23 Mar 2024 17:11:19 +0100 Subject: block: handle BLK_OPEN_RESTRICT_WRITES correctly Last kernel release we introduce CONFIG_BLK_DEV_WRITE_MOUNTED. By default this option is set. When it is set the long-standing behavior of being able to write to mounted block devices is enabled. But in order to guard against unintended corruption by writing to the block device buffer cache CONFIG_BLK_DEV_WRITE_MOUNTED can be turned off. In that case it isn't possible to write to mounted block devices anymore. A filesystem may open its block devices with BLK_OPEN_RESTRICT_WRITES which disallows concurrent BLK_OPEN_WRITE access. When we still had the bdev handle around we could recognize BLK_OPEN_RESTRICT_WRITES because the mode was passed around. Since we managed to get rid of the bdev handle we changed that logic to recognize BLK_OPEN_RESTRICT_WRITES based on whether the file was opened writable and writes to that block device are blocked. That logic doesn't work because we do allow BLK_OPEN_RESTRICT_WRITES to be specified without BLK_OPEN_WRITE. Fix the detection logic and use an FMODE_* bit. We could've also abused O_EXCL as an indicator that BLK_OPEN_RESTRICT_WRITES has been requested. For userspace open paths O_EXCL will never be retained but for internal opens where we open files that are never installed into a file descriptor table this is fine. But it would be a gamble that this doesn't cause bugs. Note that BLK_OPEN_RESTRICT_WRITES is an internal only flag that cannot directly be raised by userspace. It is implicitly raised during mounting. Passes xftests and blktests with CONFIG_BLK_DEV_WRITE_MOUNTED set and unset. Link: https://lore.kernel.org/r/ZfyyEwu9Uq5Pgb94@casper.infradead.org Link: https://lore.kernel.org/r/20240323-zielbereich-mittragen-6fdf14876c3e@brauner Fixes: 321de651fa56 ("block: don't rely on BLK_OPEN_RESTRICT_WRITES when yielding write access") Reviewed-by: Yu Kuai Reviewed-by: Jan Kara Reported-by: Matthew Wilcox Signed-off-by: Christian Brauner --- block/bdev.c | 14 +++++++------- include/linux/fs.h | 2 ++ 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/block/bdev.c b/block/bdev.c index 7a5f611c3d2e..ab9cae0e05f1 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -821,13 +821,11 @@ static void bdev_yield_write_access(struct file *bdev_file) return; bdev = file_bdev(bdev_file); - /* Yield exclusive or shared write access. */ - if (bdev_file->f_mode & FMODE_WRITE) { - if (bdev_writes_blocked(bdev)) - bdev_unblock_writes(bdev); - else - bdev->bd_writers--; - } + + if (bdev_file->f_mode & FMODE_WRITE_RESTRICTED) + bdev_unblock_writes(bdev); + else if (bdev_file->f_mode & FMODE_WRITE) + bdev->bd_writers--; } /** @@ -907,6 +905,8 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, bdev_file->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; if (bdev_nowait(bdev)) bdev_file->f_mode |= FMODE_NOWAIT; + if (mode & BLK_OPEN_RESTRICT_WRITES) + bdev_file->f_mode |= FMODE_WRITE_RESTRICTED; bdev_file->f_mapping = bdev->bd_inode->i_mapping; bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping); bdev_file->private_data = holder; diff --git a/include/linux/fs.h b/include/linux/fs.h index 00fc429b0af0..8dfd53b52744 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -121,6 +121,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_PWRITE ((__force fmode_t)0x10) /* File is opened for execution with sys_execve / sys_uselib */ #define FMODE_EXEC ((__force fmode_t)0x20) +/* File writes are restricted (block device specific) */ +#define FMODE_WRITE_RESTRICTED ((__force fmode_t)0x40) /* 32bit hashes as llseek() offset (for directories) */ #define FMODE_32BITHASH ((__force fmode_t)0x200) /* 64bit hashes as llseek() offset (for directories) */ -- cgit v1.2.3 From 22650a99821dda3d05f1c334ea90330b4982de56 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 26 Mar 2024 13:47:22 +0100 Subject: fs,block: yield devices early Currently a device is only really released once the umount returns to userspace due to how file closing works. That ultimately could cause an old umount assumption to be violated that concurrent umount and mount don't fail. So an exclusively held device with a temporary holder should be yielded before the filesystem is gone. Add a helper that allows callers to do that. This also allows us to remove the two holder ops that Linus wasn't excited about. Link: https://lore.kernel.org/r/20240326-vfs-bdev-end_holder-v1-1-20af85202918@kernel.org Fixes: f3a608827d1f ("bdev: open block device as files") # mainline only Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Suggested-by: Linus Torvalds Signed-off-by: Christian Brauner --- block/bdev.c | 64 +++++++++++++++++++++++++++++++++++------ drivers/mtd/devices/block2mtd.c | 2 +- fs/bcachefs/super-io.c | 2 +- fs/cramfs/inode.c | 2 +- fs/ext4/super.c | 8 +++--- fs/f2fs/super.c | 2 +- fs/jfs/jfs_logmgr.c | 4 +-- fs/reiserfs/journal.c | 2 +- fs/romfs/super.c | 2 +- fs/super.c | 24 ++-------------- fs/xfs/xfs_buf.c | 2 +- fs/xfs/xfs_super.c | 6 ++-- include/linux/blkdev.h | 11 +------ 13 files changed, 76 insertions(+), 55 deletions(-) (limited to 'include') diff --git a/block/bdev.c b/block/bdev.c index a1946a902df3..b8e32d933a63 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -583,9 +583,6 @@ static void bd_finish_claiming(struct block_device *bdev, void *holder, mutex_unlock(&bdev->bd_holder_lock); bd_clear_claiming(whole, holder); mutex_unlock(&bdev_lock); - - if (hops && hops->get_holder) - hops->get_holder(holder); } /** @@ -608,7 +605,6 @@ EXPORT_SYMBOL(bd_abort_claiming); static void bd_end_claim(struct block_device *bdev, void *holder) { struct block_device *whole = bdev_whole(bdev); - const struct blk_holder_ops *hops = bdev->bd_holder_ops; bool unblock = false; /* @@ -631,9 +627,6 @@ static void bd_end_claim(struct block_device *bdev, void *holder) whole->bd_holder = NULL; mutex_unlock(&bdev_lock); - if (hops && hops->put_holder) - hops->put_holder(holder); - /* * If this was the last claim, remove holder link and unblock evpoll if * it was a write holder. @@ -813,6 +806,11 @@ static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode) bdev->bd_writers++; } +static inline bool bdev_unclaimed(const struct file *bdev_file) +{ + return bdev_file->private_data == BDEV_I(bdev_file->f_mapping->host); +} + static void bdev_yield_write_access(struct file *bdev_file) { struct block_device *bdev; @@ -820,6 +818,9 @@ static void bdev_yield_write_access(struct file *bdev_file) if (bdev_allow_write_mounted) return; + if (bdev_unclaimed(bdev_file)) + return; + bdev = file_bdev(bdev_file); if (bdev_file->f_mode & FMODE_WRITE_RESTRICTED) @@ -1012,6 +1013,20 @@ struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, } EXPORT_SYMBOL(bdev_file_open_by_path); +static inline void bd_yield_claim(struct file *bdev_file) +{ + struct block_device *bdev = file_bdev(bdev_file); + void *holder = bdev_file->private_data; + + lockdep_assert_held(&bdev->bd_disk->open_mutex); + + if (WARN_ON_ONCE(IS_ERR_OR_NULL(holder))) + return; + + if (!bdev_unclaimed(bdev_file)) + bd_end_claim(bdev, holder); +} + void bdev_release(struct file *bdev_file) { struct block_device *bdev = file_bdev(bdev_file); @@ -1036,7 +1051,7 @@ void bdev_release(struct file *bdev_file) bdev_yield_write_access(bdev_file); if (holder) - bd_end_claim(bdev, holder); + bd_yield_claim(bdev_file); /* * Trigger event checking and tell drivers to flush MEDIA_CHANGE @@ -1056,6 +1071,39 @@ put_no_open: blkdev_put_no_open(bdev); } +/** + * bdev_fput - yield claim to the block device and put the file + * @bdev_file: open block device + * + * Yield claim on the block device and put the file. Ensure that the + * block device can be reclaimed before the file is closed which is a + * deferred operation. + */ +void bdev_fput(struct file *bdev_file) +{ + if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops)) + return; + + if (bdev_file->private_data) { + struct block_device *bdev = file_bdev(bdev_file); + struct gendisk *disk = bdev->bd_disk; + + mutex_lock(&disk->open_mutex); + bdev_yield_write_access(bdev_file); + bd_yield_claim(bdev_file); + /* + * Tell release we already gave up our hold on the + * device and if write restrictions are available that + * we already gave up write access to the device. + */ + bdev_file->private_data = BDEV_I(bdev_file->f_mapping->host); + mutex_unlock(&disk->open_mutex); + } + + fput(bdev_file); +} +EXPORT_SYMBOL(bdev_fput); + /** * lookup_bdev() - Look up a struct block_device by name. * @pathname: Name of the block device in the filesystem. diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c index 97a00ec9a4d4..caacdc0a3819 100644 --- a/drivers/mtd/devices/block2mtd.c +++ b/drivers/mtd/devices/block2mtd.c @@ -209,7 +209,7 @@ static void block2mtd_free_device(struct block2mtd_dev *dev) if (dev->bdev_file) { invalidate_mapping_pages(dev->bdev_file->f_mapping, 0, -1); - fput(dev->bdev_file); + bdev_fput(dev->bdev_file); } kfree(dev); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index ad28e370b640..cb7b4de11a49 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -143,7 +143,7 @@ void bch2_free_super(struct bch_sb_handle *sb) { kfree(sb->bio); if (!IS_ERR_OR_NULL(sb->s_bdev_file)) - fput(sb->s_bdev_file); + bdev_fput(sb->s_bdev_file); kfree(sb->holder); kfree(sb->sb_name); diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 39e75131fd5a..9901057a15ba 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -495,7 +495,7 @@ static void cramfs_kill_sb(struct super_block *sb) sb->s_mtd = NULL; } else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) { sync_blockdev(sb->s_bdev); - fput(sb->s_bdev_file); + bdev_fput(sb->s_bdev_file); } kfree(sbi); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index cfb8449c731f..044135796f2b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5668,7 +5668,7 @@ failed_mount: brelse(sbi->s_sbh); if (sbi->s_journal_bdev_file) { invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); - fput(sbi->s_journal_bdev_file); + bdev_fput(sbi->s_journal_bdev_file); } out_fail: invalidate_bdev(sb->s_bdev); @@ -5913,7 +5913,7 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb, out_bh: brelse(bh); out_bdev: - fput(bdev_file); + bdev_fput(bdev_file); return ERR_PTR(errno); } @@ -5952,7 +5952,7 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb, out_journal: jbd2_journal_destroy(journal); out_bdev: - fput(bdev_file); + bdev_fput(bdev_file); return ERR_PTR(errno); } @@ -7327,7 +7327,7 @@ static void ext4_kill_sb(struct super_block *sb) kill_block_super(sb); if (bdev_file) - fput(bdev_file); + bdev_fput(bdev_file); } static struct file_system_type ext4_fs_type = { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a6867f26f141..a4bc26dfdb1a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1558,7 +1558,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) for (i = 0; i < sbi->s_ndevs; i++) { if (i > 0) - fput(FDEV(i).bdev_file); + bdev_fput(FDEV(i).bdev_file); #ifdef CONFIG_BLK_DEV_ZONED kvfree(FDEV(i).blkz_seq); #endif diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 73389c68e251..9609349e92e5 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1141,7 +1141,7 @@ journal_found: lbmLogShutdown(log); close: /* close external log device */ - fput(bdev_file); + bdev_fput(bdev_file); free: /* free log descriptor */ mutex_unlock(&jfs_log_mutex); @@ -1485,7 +1485,7 @@ int lmLogClose(struct super_block *sb) bdev_file = log->bdev_file; rc = lmLogShutdown(log); - fput(bdev_file); + bdev_fput(bdev_file); kfree(log); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 6474529c4253..e539ccd39e1e 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2589,7 +2589,7 @@ static void journal_list_init(struct super_block *sb) static void release_journal_dev(struct reiserfs_journal *journal) { if (journal->j_bdev_file) { - fput(journal->j_bdev_file); + bdev_fput(journal->j_bdev_file); journal->j_bdev_file = NULL; } } diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 2be227532f39..2cbb92462074 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -594,7 +594,7 @@ static void romfs_kill_sb(struct super_block *sb) #ifdef CONFIG_ROMFS_ON_BLOCK if (sb->s_bdev) { sync_blockdev(sb->s_bdev); - fput(sb->s_bdev_file); + bdev_fput(sb->s_bdev_file); } #endif } diff --git a/fs/super.c b/fs/super.c index 71d9779c42b1..69ce6c600968 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1515,29 +1515,11 @@ static int fs_bdev_thaw(struct block_device *bdev) return error; } -static void fs_bdev_super_get(void *data) -{ - struct super_block *sb = data; - - spin_lock(&sb_lock); - sb->s_count++; - spin_unlock(&sb_lock); -} - -static void fs_bdev_super_put(void *data) -{ - struct super_block *sb = data; - - put_super(sb); -} - const struct blk_holder_ops fs_holder_ops = { .mark_dead = fs_bdev_mark_dead, .sync = fs_bdev_sync, .freeze = fs_bdev_freeze, .thaw = fs_bdev_thaw, - .get_holder = fs_bdev_super_get, - .put_holder = fs_bdev_super_put, }; EXPORT_SYMBOL_GPL(fs_holder_ops); @@ -1562,7 +1544,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, * writable from userspace even for a read-only block device. */ if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { - fput(bdev_file); + bdev_fput(bdev_file); return -EACCES; } @@ -1573,7 +1555,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, if (atomic_read(&bdev->bd_fsfreeze_count) > 0) { if (fc) warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); - fput(bdev_file); + bdev_fput(bdev_file); return -EBUSY; } spin_lock(&sb_lock); @@ -1693,7 +1675,7 @@ void kill_block_super(struct super_block *sb) generic_shutdown_super(sb); if (bdev) { sync_blockdev(bdev); - fput(sb->s_bdev_file); + bdev_fput(sb->s_bdev_file); } } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 1a18c381127e..f0fa02264eda 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -2030,7 +2030,7 @@ xfs_free_buftarg( fs_put_dax(btp->bt_daxdev, btp->bt_mount); /* the main block device is closed by kill_block_super */ if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) - fput(btp->bt_bdev_file); + bdev_fput(btp->bt_bdev_file); kfree(btp); } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index c21f10ab0f5d..bce020374c5e 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -485,7 +485,7 @@ xfs_open_devices( mp->m_logdev_targp = mp->m_ddev_targp; /* Handle won't be used, drop it */ if (logdev_file) - fput(logdev_file); + bdev_fput(logdev_file); } return 0; @@ -497,10 +497,10 @@ xfs_open_devices( xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: if (rtdev_file) - fput(rtdev_file); + bdev_fput(rtdev_file); out_close_logdev: if (logdev_file) - fput(logdev_file); + bdev_fput(logdev_file); return error; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c3e8f7cf96be..172c91879999 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1505,16 +1505,6 @@ struct blk_holder_ops { * Thaw the file system mounted on the block device. */ int (*thaw)(struct block_device *bdev); - - /* - * If needed, get a reference to the holder. - */ - void (*get_holder)(void *holder); - - /* - * Release the holder. - */ - void (*put_holder)(void *holder); }; /* @@ -1585,6 +1575,7 @@ static inline int early_lookup_bdev(const char *pathname, dev_t *dev) int bdev_freeze(struct block_device *bdev); int bdev_thaw(struct block_device *bdev); +void bdev_fput(struct file *bdev_file); struct io_comp_batch { struct request *req_list; -- cgit v1.2.3 From 7f1dd39aedfccf60772328c5b88d56dbd39954c3 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Wed, 27 Mar 2024 08:33:10 +0100 Subject: clk: Provide !COMMON_CLK dummy for devm_clk_rate_exclusive_get() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To be able to compile drivers using devm_clk_rate_exclusive_get() also on platforms without the common clk framework, add a dummy implementation that does the same as clk_rate_exclusive_get() in that case (i.e. nothing). Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202403270305.ydvX9xq1-lkp@intel.com/ Fixes: b0cde62e4c54 ("clk: Add a devm variant of clk_rate_exclusive_get()") Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20240327073310.520950-2-u.kleine-koenig@pengutronix.de Signed-off-by: Stephen Boyd --- include/linux/clk.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/clk.h b/include/linux/clk.h index 00623f4de5e1..0fa56d672532 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -286,6 +286,11 @@ static inline int clk_rate_exclusive_get(struct clk *clk) return 0; } +static inline int devm_clk_rate_exclusive_get(struct device *dev, struct clk *clk) +{ + return 0; +} + static inline void clk_rate_exclusive_put(struct clk *clk) {} #endif -- cgit v1.2.3 From 1a80dbcb2dbaf6e4c216e62e30fa7d3daa8001ce Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 27 Mar 2024 22:24:26 -0700 Subject: bpf: support deferring bpf_link dealloc to after RCU grace period BPF link for some program types is passed as a "context" which can be used by those BPF programs to look up additional information. E.g., for multi-kprobes and multi-uprobes, link is used to fetch BPF cookie values. Because of this runtime dependency, when bpf_link refcnt drops to zero there could still be active BPF programs running accessing link data. This patch adds generic support to defer bpf_link dealloc callback to after RCU GP, if requested. This is done by exposing two different deallocation callbacks, one synchronous and one deferred. If deferred one is provided, bpf_link_free() will schedule dealloc_deferred() callback to happen after RCU GP. BPF is using two flavors of RCU: "classic" non-sleepable one and RCU tasks trace one. The latter is used when sleepable BPF programs are used. bpf_link_free() accommodates that by checking underlying BPF program's sleepable flag, and goes either through normal RCU GP only for non-sleepable, or through RCU tasks trace GP *and* then normal RCU GP (taking into account rcu_trace_implies_rcu_gp() optimization), if BPF program is sleepable. We use this for multi-kprobe and multi-uprobe links, which dereference link during program run. We also preventively switch raw_tp link to use deferred dealloc callback, as upcoming changes in bpf-next tree expose raw_tp link data (specifically, cookie value) to BPF program at runtime as well. Fixes: 0dcac2725406 ("bpf: Add multi kprobe link") Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link") Reported-by: syzbot+981935d9485a560bfbcb@syzkaller.appspotmail.com Reported-by: syzbot+2cb5a6c573e98db598cc@syzkaller.appspotmail.com Reported-by: syzbot+62d8b26793e8a2bd0516@syzkaller.appspotmail.com Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20240328052426.3042617-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 16 +++++++++++++++- kernel/bpf/syscall.c | 35 ++++++++++++++++++++++++++++++++--- kernel/trace/bpf_trace.c | 4 ++-- 3 files changed, 49 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4f20f62f9d63..890e152d553e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1574,12 +1574,26 @@ struct bpf_link { enum bpf_link_type type; const struct bpf_link_ops *ops; struct bpf_prog *prog; - struct work_struct work; + /* rcu is used before freeing, work can be used to schedule that + * RCU-based freeing before that, so they never overlap + */ + union { + struct rcu_head rcu; + struct work_struct work; + }; }; struct bpf_link_ops { void (*release)(struct bpf_link *link); + /* deallocate link resources callback, called without RCU grace period + * waiting + */ void (*dealloc)(struct bpf_link *link); + /* deallocate link resources callback, called after RCU grace period; + * if underlying BPF program is sleepable we go through tasks trace + * RCU GP and then "classic" RCU GP + */ + void (*dealloc_deferred)(struct bpf_link *link); int (*detach)(struct bpf_link *link); int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ae2ff73bde7e..c287925471f6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3024,17 +3024,46 @@ void bpf_link_inc(struct bpf_link *link) atomic64_inc(&link->refcnt); } +static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu) +{ + struct bpf_link *link = container_of(rcu, struct bpf_link, rcu); + + /* free bpf_link and its containing memory */ + link->ops->dealloc_deferred(link); +} + +static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu) +{ + if (rcu_trace_implies_rcu_gp()) + bpf_link_defer_dealloc_rcu_gp(rcu); + else + call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp); +} + /* bpf_link_free is guaranteed to be called from process context */ static void bpf_link_free(struct bpf_link *link) { + bool sleepable = false; + bpf_link_free_id(link->id); if (link->prog) { + sleepable = link->prog->sleepable; /* detach BPF program, clean up used resources */ link->ops->release(link); bpf_prog_put(link->prog); } - /* free bpf_link and its containing memory */ - link->ops->dealloc(link); + if (link->ops->dealloc_deferred) { + /* schedule BPF link deallocation; if underlying BPF program + * is sleepable, we need to first wait for RCU tasks trace + * sync, then go through "classic" RCU grace period + */ + if (sleepable) + call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp); + else + call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp); + } + if (link->ops->dealloc) + link->ops->dealloc(link); } static void bpf_link_put_deferred(struct work_struct *work) @@ -3544,7 +3573,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, static const struct bpf_link_ops bpf_raw_tp_link_lops = { .release = bpf_raw_tp_link_release, - .dealloc = bpf_raw_tp_link_dealloc, + .dealloc_deferred = bpf_raw_tp_link_dealloc, .show_fdinfo = bpf_raw_tp_link_show_fdinfo, .fill_link_info = bpf_raw_tp_link_fill_link_info, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0b73fe5f7206..9dc605f08a23 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2728,7 +2728,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, static const struct bpf_link_ops bpf_kprobe_multi_link_lops = { .release = bpf_kprobe_multi_link_release, - .dealloc = bpf_kprobe_multi_link_dealloc, + .dealloc_deferred = bpf_kprobe_multi_link_dealloc, .fill_link_info = bpf_kprobe_multi_link_fill_link_info, }; @@ -3242,7 +3242,7 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { .release = bpf_uprobe_multi_link_release, - .dealloc = bpf_uprobe_multi_link_dealloc, + .dealloc_deferred = bpf_uprobe_multi_link_dealloc, .fill_link_info = bpf_uprobe_multi_link_fill_link_info, }; -- cgit v1.2.3 From 831ec5e3538e989c7995137b5c5c661991a09504 Mon Sep 17 00:00:00 2001 From: Gergo Koteles Date: Thu, 28 Mar 2024 23:47:37 +0100 Subject: ASoC: tas2781: mark dvc_tlv with __maybe_unused Since we put dvc_tlv static variable to a header file it's copied to each module that includes the header. But not all of them are actually used it. Fix this W=1 build warning: include/sound/tas2781-tlv.h:18:35: warning: 'dvc_tlv' defined but not used [-Wunused-const-variable=] Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202403290354.v0StnRpc-lkp@intel.com/ Fixes: ae065d0ce9e3 ("ALSA: hda/tas2781: remove digital gain kcontrol") Signed-off-by: Gergo Koteles Message-ID: <0e461545a2a6e9b6152985143e50526322e5f76b.1711665731.git.soyer@irl.hu> Signed-off-by: Takashi Iwai --- include/sound/tas2781-tlv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/sound/tas2781-tlv.h b/include/sound/tas2781-tlv.h index 4038dd421150..1dc59005d241 100644 --- a/include/sound/tas2781-tlv.h +++ b/include/sound/tas2781-tlv.h @@ -15,7 +15,7 @@ #ifndef __TAS2781_TLV_H__ #define __TAS2781_TLV_H__ -static const DECLARE_TLV_DB_SCALE(dvc_tlv, -10000, 100, 0); +static const __maybe_unused DECLARE_TLV_DB_SCALE(dvc_tlv, -10000, 100, 0); static const DECLARE_TLV_DB_SCALE(amp_vol_tlv, 1100, 50, 0); #endif -- cgit v1.2.3 From 3d010c8031e39f5fa1e8b13ada77e0321091011f Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 26 Mar 2024 12:33:58 +0100 Subject: udp: do not accept non-tunnel GSO skbs landing in a tunnel When rx-udp-gro-forwarding is enabled UDP packets might be GROed when being forwarded. If such packets might land in a tunnel this can cause various issues and udp_gro_receive makes sure this isn't the case by looking for a matching socket. This is performed in udp4/6_gro_lookup_skb but only in the current netns. This is an issue with tunneled packets when the endpoint is in another netns. In such cases the packets will be GROed at the UDP level, which leads to various issues later on. The same thing can happen with rx-gro-list. We saw this with geneve packets being GROed at the UDP level. In such case gso_size is set; later the packet goes through the geneve rx path, the geneve header is pulled, the offset are adjusted and frag_list skbs are not adjusted with regard to geneve. When those skbs hit skb_fragment, it will misbehave. Different outcomes are possible depending on what the GROed skbs look like; from corrupted packets to kernel crashes. One example is a BUG_ON[1] triggered in skb_segment while processing the frag_list. Because gso_size is wrong (geneve header was pulled) skb_segment thinks there is "geneve header size" of data in frag_list, although it's in fact the next packet. The BUG_ON itself has nothing to do with the issue. This is only one of the potential issues. Looking up for a matching socket in udp_gro_receive is fragile: the lookup could be extended to all netns (not speaking about performances) but nothing prevents those packets from being modified in between and we could still not find a matching socket. It's OK to keep the current logic there as it should cover most cases but we also need to make sure we handle tunnel packets being GROed too early. This is done by extending the checks in udp_unexpected_gso: GSO packets lacking the SKB_GSO_UDP_TUNNEL/_CSUM bits and landing in a tunnel must be segmented. [1] kernel BUG at net/core/skbuff.c:4408! RIP: 0010:skb_segment+0xd2a/0xf70 __udp_gso_segment+0xaa/0x560 Fixes: 9fd1ff5d2ac7 ("udp: Support UDP fraglist GRO/GSO.") Fixes: 36707061d6ba ("udp: allow forwarding of plain (non-fraglisted) UDP GRO packets") Signed-off-by: Antoine Tenart Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/udp.h | 28 ++++++++++++++++++++++++++++ net/ipv4/udp.c | 7 +++++++ net/ipv4/udp_offload.c | 6 ++++-- net/ipv6/udp.c | 2 +- 4 files changed, 40 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/udp.h b/include/linux/udp.h index 3748e82b627b..17539d089666 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -150,6 +150,24 @@ static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk, } } +DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key); +#if IS_ENABLED(CONFIG_IPV6) +DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key); +#endif + +static inline bool udp_encap_needed(void) +{ + if (static_branch_unlikely(&udp_encap_needed_key)) + return true; + +#if IS_ENABLED(CONFIG_IPV6) + if (static_branch_unlikely(&udpv6_encap_needed_key)) + return true; +#endif + + return false; +} + static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb) { if (!skb_is_gso(skb)) @@ -163,6 +181,16 @@ static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb) !udp_test_bit(ACCEPT_FRAGLIST, sk)) return true; + /* GSO packets lacking the SKB_GSO_UDP_TUNNEL/_CSUM bits might still + * land in a tunnel as the socket check in udp_gro_receive cannot be + * foolproof. + */ + if (udp_encap_needed() && + READ_ONCE(udp_sk(sk)->encap_rcv) && + !(skb_shinfo(skb)->gso_type & + (SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM))) + return true; + return false; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 661d0e0d273f..c02bf011d4a6 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -582,6 +582,13 @@ static inline bool __udp_is_mcast_sock(struct net *net, const struct sock *sk, } DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); +EXPORT_SYMBOL(udp_encap_needed_key); + +#if IS_ENABLED(CONFIG_IPV6) +DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); +EXPORT_SYMBOL(udpv6_encap_needed_key); +#endif + void udp_encap_enable(void) { static_branch_inc(&udp_encap_needed_key); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index b9880743765c..e9719afe91cf 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -551,8 +551,10 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, unsigned int off = skb_gro_offset(skb); int flush = 1; - /* we can do L4 aggregation only if the packet can't land in a tunnel - * otherwise we could corrupt the inner stream + /* We can do L4 aggregation only if the packet can't land in a tunnel + * otherwise we could corrupt the inner stream. Detecting such packets + * cannot be foolproof and the aggregation might still happen in some + * cases. Such packets should be caught in udp_unexpected_gso later. */ NAPI_GRO_CB(skb)->is_flist = 0; if (!sk || !udp_sk(sk)->gro_receive) { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 7c1e6469d091..8b1dd7f51249 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -447,7 +447,7 @@ csum_copy_err: goto try_again; } -DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); +DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key); void udpv6_encap_enable(void) { static_branch_inc(&udpv6_encap_needed_key); -- cgit v1.2.3 From 39646f29b100566451d37abc4cc8cdd583756dfe Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 20 Mar 2024 08:55:53 +0100 Subject: Bluetooth: add quirk for broken address properties Some Bluetooth controllers lack persistent storage for the device address and instead one can be provided by the boot firmware using the 'local-bd-address' devicetree property. The Bluetooth devicetree bindings clearly states that the address should be specified in little-endian order, but due to a long-standing bug in the Qualcomm driver which reversed the address some boot firmware has been providing the address in big-endian order instead. Add a new quirk that can be set on platforms with broken firmware and use it to reverse the address when parsing the property so that the underlying driver bug can be fixed. Fixes: 5c0a1001c8be ("Bluetooth: hci_qca: Add helper to set device address") Cc: stable@vger.kernel.org # 5.1 Reviewed-by: Douglas Anderson Signed-off-by: Johan Hovold Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 9 +++++++++ net/bluetooth/hci_sync.c | 5 ++++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 8701ca5f31ee..5c12761cbc0e 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -176,6 +176,15 @@ enum { */ HCI_QUIRK_USE_BDADDR_PROPERTY, + /* When this quirk is set, the Bluetooth Device Address provided by + * the 'local-bd-address' fwnode property is incorrectly specified in + * big-endian order. + * + * This quirk can be set before hci_register_dev is called or + * during the hdev->setup vendor callback. + */ + HCI_QUIRK_BDADDR_PROPERTY_BROKEN, + /* When this quirk is set, the duplicate filtering during * scanning is based on Bluetooth devices addresses. To allow * RSSI based updates, restart scanning if needed. diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index f6b662369322..639090b9f4b8 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -3416,7 +3416,10 @@ static void hci_dev_get_bd_addr_from_property(struct hci_dev *hdev) if (ret < 0 || !bacmp(&ba, BDADDR_ANY)) return; - bacpy(&hdev->public_addr, &ba); + if (test_bit(HCI_QUIRK_BDADDR_PROPERTY_BROKEN, &hdev->quirks)) + baswap(&hdev->public_addr, &ba); + else + bacpy(&hdev->public_addr, &ba); } struct hci_init_stage { -- cgit v1.2.3 From aa7cbefe65e455178c33eca308349e687d262ea7 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 31 Mar 2024 10:26:47 -0700 Subject: time/timecounter: Fix inline documentation Fix kernel-doc warnings, text punctuation, and a kernel-doc marker (change '%' to '&' to indicate a struct): timecounter.h:72: warning: No description found for return value of 'cyclecounter_cyc2ns' timecounter.h:85: warning: Function parameter or member 'tc' not described in 'timecounter_adjtime' timecounter.h:111: warning: No description found for return value of 'timecounter_read' timecounter.h:128: warning: No description found for return value of 'timecounter_cyc2time' Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240331172652.14086-2-rdunlap@infradead.org --- include/linux/timecounter.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/timecounter.h b/include/linux/timecounter.h index c6540ceea143..0982d1d52b24 100644 --- a/include/linux/timecounter.h +++ b/include/linux/timecounter.h @@ -22,7 +22,7 @@ * * @read: returns the current cycle value * @mask: bitmask for two's complement - * subtraction of non 64 bit counters, + * subtraction of non-64-bit counters, * see CYCLECOUNTER_MASK() helper macro * @mult: cycle to nanosecond multiplier * @shift: cycle to nanosecond divisor (power of two) @@ -35,7 +35,7 @@ struct cyclecounter { }; /** - * struct timecounter - layer above a %struct cyclecounter which counts nanoseconds + * struct timecounter - layer above a &struct cyclecounter which counts nanoseconds * Contains the state needed by timecounter_read() to detect * cycle counter wrap around. Initialize with * timecounter_init(). Also used to convert cycle counts into the @@ -66,6 +66,8 @@ struct timecounter { * @cycles: Cycles * @mask: bit mask for maintaining the 'frac' field * @frac: pointer to storage for the fractional nanoseconds. + * + * Returns: cycle counter cycles converted to nanoseconds */ static inline u64 cyclecounter_cyc2ns(const struct cyclecounter *cc, u64 cycles, u64 mask, u64 *frac) @@ -79,6 +81,7 @@ static inline u64 cyclecounter_cyc2ns(const struct cyclecounter *cc, /** * timecounter_adjtime - Shifts the time of the clock. + * @tc: The &struct timecounter to adjust * @delta: Desired change in nanoseconds. */ static inline void timecounter_adjtime(struct timecounter *tc, s64 delta) @@ -107,6 +110,8 @@ extern void timecounter_init(struct timecounter *tc, * * In other words, keeps track of time since the same epoch as * the function which generated the initial time stamp. + * + * Returns: nanoseconds since the initial time stamp */ extern u64 timecounter_read(struct timecounter *tc); @@ -123,6 +128,8 @@ extern u64 timecounter_read(struct timecounter *tc); * * This allows conversion of cycle counter values which were generated * in the past. + * + * Returns: cycle counter converted to nanoseconds since the initial time stamp */ extern u64 timecounter_cyc2time(const struct timecounter *tc, u64 cycle_tstamp); -- cgit v1.2.3 From 76f788ee4a7d9f826738a034f9d2ee0bc4cd291b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 31 Mar 2024 10:26:48 -0700 Subject: time/timekeeping: Fix kernel-doc warnings and typos Fix punctuation, spellos, and kernel-doc warnings: timekeeping.h:79: warning: No description found for return value of 'ktime_get_real' timekeeping.h:95: warning: No description found for return value of 'ktime_get_boottime' timekeeping.h:108: warning: No description found for return value of 'ktime_get_clocktai' timekeeping.h:149: warning: Function parameter or struct member 'mono' not described in 'ktime_mono_to_real' timekeeping.h:149: warning: No description found for return value of 'ktime_mono_to_real' timekeeping.h:255: warning: Function parameter or struct member 'cs_id' not described in 'system_time_snapshot' Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240331172652.14086-3-rdunlap@infradead.org --- include/linux/timekeeping.h | 49 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 7e50cbd97f86..0ea7823b7f31 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -22,14 +22,14 @@ extern int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz); /* - * ktime_get() family: read the current time in a multitude of ways, + * ktime_get() family - read the current time in a multitude of ways. * * The default time reference is CLOCK_MONOTONIC, starting at * boot time but not counting the time spent in suspend. * For other references, use the functions with "real", "clocktai", * "boottime" and "raw" suffixes. * - * To get the time in a different format, use the ones wit + * To get the time in a different format, use the ones with * "ns", "ts64" and "seconds" suffix. * * See Documentation/core-api/timekeeping.rst for more details. @@ -74,6 +74,8 @@ extern u32 ktime_get_resolution_ns(void); /** * ktime_get_real - get the real (wall-) time in ktime_t format + * + * Returns: real (wall) time in ktime_t format */ static inline ktime_t ktime_get_real(void) { @@ -86,10 +88,12 @@ static inline ktime_t ktime_get_coarse_real(void) } /** - * ktime_get_boottime - Returns monotonic time since boot in ktime_t format + * ktime_get_boottime - Get monotonic time since boot in ktime_t format * * This is similar to CLOCK_MONTONIC/ktime_get, but also includes the * time spent in suspend. + * + * Returns: monotonic time since boot in ktime_t format */ static inline ktime_t ktime_get_boottime(void) { @@ -102,7 +106,9 @@ static inline ktime_t ktime_get_coarse_boottime(void) } /** - * ktime_get_clocktai - Returns the TAI time of day in ktime_t format + * ktime_get_clocktai - Get the TAI time of day in ktime_t format + * + * Returns: the TAI time of day in ktime_t format */ static inline ktime_t ktime_get_clocktai(void) { @@ -144,32 +150,60 @@ static inline u64 ktime_get_coarse_clocktai_ns(void) /** * ktime_mono_to_real - Convert monotonic time to clock realtime + * @mono: monotonic time to convert + * + * Returns: time converted to realtime clock */ static inline ktime_t ktime_mono_to_real(ktime_t mono) { return ktime_mono_to_any(mono, TK_OFFS_REAL); } +/** + * ktime_get_ns - Get the current time in nanoseconds + * + * Returns: current time converted to nanoseconds + */ static inline u64 ktime_get_ns(void) { return ktime_to_ns(ktime_get()); } +/** + * ktime_get_real_ns - Get the current real/wall time in nanoseconds + * + * Returns: current real time converted to nanoseconds + */ static inline u64 ktime_get_real_ns(void) { return ktime_to_ns(ktime_get_real()); } +/** + * ktime_get_boottime_ns - Get the monotonic time since boot in nanoseconds + * + * Returns: current boottime converted to nanoseconds + */ static inline u64 ktime_get_boottime_ns(void) { return ktime_to_ns(ktime_get_boottime()); } +/** + * ktime_get_clocktai_ns - Get the current TAI time of day in nanoseconds + * + * Returns: current TAI time converted to nanoseconds + */ static inline u64 ktime_get_clocktai_ns(void) { return ktime_to_ns(ktime_get_clocktai()); } +/** + * ktime_get_raw_ns - Get the raw monotonic time in nanoseconds + * + * Returns: current raw monotonic time converted to nanoseconds + */ static inline u64 ktime_get_raw_ns(void) { return ktime_to_ns(ktime_get_raw()); @@ -224,8 +258,8 @@ extern bool timekeeping_rtc_skipresume(void); extern void timekeeping_inject_sleeptime64(const struct timespec64 *delta); -/* - * struct ktime_timestanps - Simultaneous mono/boot/real timestamps +/** + * struct ktime_timestamps - Simultaneous mono/boot/real timestamps * @mono: Monotonic timestamp * @boot: Boottime timestamp * @real: Realtime timestamp @@ -242,7 +276,8 @@ struct ktime_timestamps { * @cycles: Clocksource counter value to produce the system times * @real: Realtime system time * @raw: Monotonic raw system time - * @clock_was_set_seq: The sequence number of clock was set events + * @cs_id: Clocksource ID + * @clock_was_set_seq: The sequence number of clock-was-set events * @cs_was_changed_seq: The sequence number of clocksource change events */ struct system_time_snapshot { -- cgit v1.2.3 From b87752ef5cc15b0bae04583d599e873d92dc0618 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 31 Mar 2024 10:26:49 -0700 Subject: timers: Fix kernel-doc format and add Return values Fix kernel-doc format and warnings: timer.h:26: warning: Cannot understand * @TIMER_DEFERRABLE: A deferrable timer will work normally when the on line 26 - I thought it was a doc line timer.h:146: warning: No description found for return value of 'timer_pending' timer.h:180: warning: No description found for return value of 'del_timer_sync' timer.h:193: warning: No description found for return value of 'del_timer' Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240331172652.14086-4-rdunlap@infradead.org --- include/linux/timer.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/timer.h b/include/linux/timer.h index 14a633ba61d6..e67ecd1cbc97 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -22,7 +22,7 @@ #define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn) #endif -/** +/* * @TIMER_DEFERRABLE: A deferrable timer will work normally when the * system is busy, but will not cause a CPU to come out of idle just * to service it; instead, the timer will be serviced when the CPU @@ -140,7 +140,7 @@ static inline void destroy_timer_on_stack(struct timer_list *timer) { } * or not. Callers must ensure serialization wrt. other operations done * to this timer, eg. interrupt contexts, or other CPUs on SMP. * - * return value: 1 if the timer is pending, 0 if not. + * Returns: 1 if the timer is pending, 0 if not. */ static inline int timer_pending(const struct timer_list * timer) { @@ -175,6 +175,10 @@ extern int timer_shutdown(struct timer_list *timer); * See timer_delete_sync() for detailed explanation. * * Do not use in new code. Use timer_delete_sync() instead. + * + * Returns: + * * %0 - The timer was not pending + * * %1 - The timer was pending and deactivated */ static inline int del_timer_sync(struct timer_list *timer) { @@ -188,6 +192,10 @@ static inline int del_timer_sync(struct timer_list *timer) * See timer_delete() for detailed explanation. * * Do not use in new code. Use timer_delete() instead. + * + * Returns: + * * %0 - The timer was not pending + * * %1 - The timer was pending and deactivated */ static inline int del_timer(struct timer_list *timer) { -- cgit v1.2.3 From 09ab7eff38202159271534d2f5ad45526168f2a5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Mar 2024 10:45:07 -0600 Subject: io_uring/kbuf: get rid of lower BGID lists Just rely on the xarray for any kind of bgid. This simplifies things, and it really doesn't bring us much, if anything. Cc: stable@vger.kernel.org # v6.4+ Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 - io_uring/io_uring.c | 2 -- io_uring/kbuf.c | 70 +++++------------------------------------- 3 files changed, 8 insertions(+), 65 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index e24893625085..05df0e399d7c 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -294,7 +294,6 @@ struct io_ring_ctx { struct io_submit_state submit_state; - struct io_buffer_list *io_bl; struct xarray io_bl_xa; struct io_hash_table cancel_table_locked; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d1defb99b89e..bc730f59265f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -351,7 +351,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) err: kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); - kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); kfree(ctx); return NULL; @@ -2932,7 +2931,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_napi_free(ctx); kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); - kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); kfree(ctx); } diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 693c26da4ee1..8bf0121f00af 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -17,8 +17,6 @@ #define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) -#define BGID_ARRAY 64 - /* BIDs are addressed by a 16-bit field in a CQE */ #define MAX_BIDS_PER_BGID (1 << 16) @@ -40,13 +38,9 @@ struct io_buf_free { int inuse; }; -static struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, - unsigned int bgid) +static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx, + unsigned int bgid) { - if (bl && bgid < BGID_ARRAY) - return &bl[bgid]; - return xa_load(&ctx->io_bl_xa, bgid); } @@ -55,7 +49,7 @@ static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, { lockdep_assert_held(&ctx->uring_lock); - return __io_buffer_get_list(ctx, ctx->io_bl, bgid); + return __io_buffer_get_list(ctx, bgid); } static int io_buffer_add_list(struct io_ring_ctx *ctx, @@ -68,10 +62,6 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, */ bl->bgid = bgid; smp_store_release(&bl->is_ready, 1); - - if (bgid < BGID_ARRAY) - return 0; - return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } @@ -208,24 +198,6 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, return ret; } -static __cold int io_init_bl_list(struct io_ring_ctx *ctx) -{ - struct io_buffer_list *bl; - int i; - - bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), GFP_KERNEL); - if (!bl) - return -ENOMEM; - - for (i = 0; i < BGID_ARRAY; i++) { - INIT_LIST_HEAD(&bl[i].buf_list); - bl[i].bgid = i; - } - - smp_store_release(&ctx->io_bl, bl); - return 0; -} - /* * Mark the given mapped range as free for reuse */ @@ -300,13 +272,6 @@ void io_destroy_buffers(struct io_ring_ctx *ctx) struct list_head *item, *tmp; struct io_buffer *buf; unsigned long index; - int i; - - for (i = 0; i < BGID_ARRAY; i++) { - if (!ctx->io_bl) - break; - __io_remove_buffers(ctx, &ctx->io_bl[i], -1U); - } xa_for_each(&ctx->io_bl_xa, index, bl) { xa_erase(&ctx->io_bl_xa, bl->bgid); @@ -489,12 +454,6 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) io_ring_submit_lock(ctx, issue_flags); - if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) { - ret = io_init_bl_list(ctx); - if (ret) - goto err; - } - bl = io_buffer_get_list(ctx, p->bgid); if (unlikely(!bl)) { bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); @@ -507,14 +466,9 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) if (ret) { /* * Doesn't need rcu free as it was never visible, but - * let's keep it consistent throughout. Also can't - * be a lower indexed array group, as adding one - * where lookup failed cannot happen. + * let's keep it consistent throughout. */ - if (p->bgid >= BGID_ARRAY) - kfree_rcu(bl, rcu); - else - WARN_ON_ONCE(1); + kfree_rcu(bl, rcu); goto err; } } @@ -679,12 +633,6 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (reg.ring_entries >= 65536) return -EINVAL; - if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) { - int ret = io_init_bl_list(ctx); - if (ret) - return ret; - } - bl = io_buffer_get_list(ctx, reg.bgid); if (bl) { /* if mapped buffer ring OR classic exists, don't allow */ @@ -734,10 +682,8 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return -EINVAL; __io_remove_buffers(ctx, bl, -1U); - if (bl->bgid >= BGID_ARRAY) { - xa_erase(&ctx->io_bl_xa, bl->bgid); - kfree_rcu(bl, rcu); - } + xa_erase(&ctx->io_bl_xa, bl->bgid); + kfree_rcu(bl, rcu); return 0; } @@ -771,7 +717,7 @@ void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid) { struct io_buffer_list *bl; - bl = __io_buffer_get_list(ctx, smp_load_acquire(&ctx->io_bl), bgid); + bl = __io_buffer_get_list(ctx, bgid); if (!bl || !bl->is_mmap) return NULL; -- cgit v1.2.3 From cffaefd15a8f423cdee5d8eac15d267bc92de314 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 20 Mar 2024 19:02:15 +0100 Subject: vdso: Use CONFIG_PAGE_SHIFT in vdso/datapage.h Both the vdso rework and the CONFIG_PAGE_SHIFT changes were merged during the v6.9 merge window, so it is now possible to use CONFIG_PAGE_SHIFT instead of including asm/page.h in the vdso. This avoids the workaround for arm64 - commit 8b3843ae3634 ("vdso/datapage: Quick fix - use asm/page-def.h for ARM64") and addresses a build warning for powerpc64: In file included from :4: In file included from /home/arnd/arm-soc/arm-soc/lib/vdso/gettimeofday.c:5: In file included from ../include/vdso/datapage.h:25: arch/powerpc/include/asm/page.h:230:9: error: result of comparison of constant 13835058055282163712 with expression of type 'unsigned long' is always true [-Werror,-Wtautological-constant-out-of-range-compare] 230 | return __pa(kaddr) >> PAGE_SHIFT; | ^~~~~~~~~~~ arch/powerpc/include/asm/page.h:217:37: note: expanded from macro '__pa' 217 | VIRTUAL_WARN_ON((unsigned long)(x) < PAGE_OFFSET); \ | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~ arch/powerpc/include/asm/page.h:202:73: note: expanded from macro 'VIRTUAL_WARN_ON' 202 | #define VIRTUAL_WARN_ON(x) WARN_ON(IS_ENABLED(CONFIG_DEBUG_VIRTUAL) && (x)) | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~ arch/powerpc/include/asm/bug.h:88:25: note: expanded from macro 'WARN_ON' 88 | int __ret_warn_on = !!(x); \ | ^ Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Acked-by: Michael Ellerman (powerpc) Link: https://lore.kernel.org/r/20240320180228.136371-1-arnd@kernel.org --- arch/powerpc/include/asm/vdso/gettimeofday.h | 3 +-- include/vdso/datapage.h | 8 +------- 2 files changed, 2 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h b/arch/powerpc/include/asm/vdso/gettimeofday.h index f0a4cf01e85c..78302f6c2580 100644 --- a/arch/powerpc/include/asm/vdso/gettimeofday.h +++ b/arch/powerpc/include/asm/vdso/gettimeofday.h @@ -4,7 +4,6 @@ #ifndef __ASSEMBLY__ -#include #include #include #include @@ -95,7 +94,7 @@ const struct vdso_data *__arch_get_vdso_data(void); static __always_inline const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) { - return (void *)vd + PAGE_SIZE; + return (void *)vd + (1U << CONFIG_PAGE_SHIFT); } #endif diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index 5d5c0b8efff2..c71ddb6d4691 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -19,12 +19,6 @@ #include #include -#ifdef CONFIG_ARM64 -#include -#else -#include -#endif - #ifdef CONFIG_ARCH_HAS_VDSO_DATA #include #else @@ -132,7 +126,7 @@ extern struct vdso_data _timens_data[CS_BASES] __attribute__((visibility("hidden */ union vdso_data_store { struct vdso_data data[CS_BASES]; - u8 page[PAGE_SIZE]; + u8 page[1U << CONFIG_PAGE_SHIFT]; }; /* -- cgit v1.2.3 From 9c573cd313433f6c1f7236fe64b9b743500c1628 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 9 Mar 2024 12:24:48 -0800 Subject: randomize_kstack: Improve entropy diffusion The kstack_offset variable was really only ever using the low bits for kernel stack offset entropy. Add a ror32() to increase bit diffusion. Suggested-by: Arnd Bergmann Fixes: 39218ff4c625 ("stack: Optionally randomize kernel stack offset each syscall") Link: https://lore.kernel.org/r/20240309202445.work.165-kees@kernel.org Signed-off-by: Kees Cook --- include/linux/randomize_kstack.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/randomize_kstack.h b/include/linux/randomize_kstack.h index 5d868505a94e..6d92b68efbf6 100644 --- a/include/linux/randomize_kstack.h +++ b/include/linux/randomize_kstack.h @@ -80,7 +80,7 @@ DECLARE_PER_CPU(u32, kstack_offset); if (static_branch_maybe(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT, \ &randomize_kstack_offset)) { \ u32 offset = raw_cpu_read(kstack_offset); \ - offset ^= (rand); \ + offset = ror32(offset, 5) ^ (rand); \ raw_cpu_write(kstack_offset, offset); \ } \ } while (0) -- cgit v1.2.3 From c0de6ab920aafb56feab56058e46b688e694a246 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Tue, 2 Apr 2024 12:48:36 -0700 Subject: net: mana: Fix Rx DMA datasize and skb_over_panic mana_get_rxbuf_cfg() aligns the RX buffer's DMA datasize to be multiple of 64. So a packet slightly bigger than mtu+14, say 1536, can be received and cause skb_over_panic. Sample dmesg: [ 5325.237162] skbuff: skb_over_panic: text:ffffffffc043277a len:1536 put:1536 head:ff1100018b517000 data:ff1100018b517100 tail:0x700 end:0x6ea dev: [ 5325.243689] ------------[ cut here ]------------ [ 5325.245748] kernel BUG at net/core/skbuff.c:192! [ 5325.247838] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI [ 5325.258374] RIP: 0010:skb_panic+0x4f/0x60 [ 5325.302941] Call Trace: [ 5325.304389] [ 5325.315794] ? skb_panic+0x4f/0x60 [ 5325.317457] ? asm_exc_invalid_op+0x1f/0x30 [ 5325.319490] ? skb_panic+0x4f/0x60 [ 5325.321161] skb_put+0x4e/0x50 [ 5325.322670] mana_poll+0x6fa/0xb50 [mana] [ 5325.324578] __napi_poll+0x33/0x1e0 [ 5325.326328] net_rx_action+0x12e/0x280 As discussed internally, this alignment is not necessary. To fix this bug, remove it from the code. So oversized packets will be marked as CQE_RX_TRUNCATED by NIC, and dropped. Cc: stable@vger.kernel.org Fixes: 2fbbd712baf1 ("net: mana: Enable RX path to handle various MTU sizes") Signed-off-by: Haiyang Zhang Reviewed-by: Dexuan Cui Link: https://lore.kernel.org/r/1712087316-20886-1-git-send-email-haiyangz@microsoft.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microsoft/mana/mana_en.c | 2 +- include/net/mana/mana.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 59287c6e6cee..d8af5e7e15b4 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -601,7 +601,7 @@ static void mana_get_rxbuf_cfg(int mtu, u32 *datasize, u32 *alloc_size, *alloc_size = mtu + MANA_RXBUF_PAD + *headroom; - *datasize = ALIGN(mtu + ETH_HLEN, MANA_RX_DATA_ALIGN); + *datasize = mtu + ETH_HLEN; } static int mana_pre_alloc_rxbufs(struct mana_port_context *mpc, int new_mtu) diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 76147feb0d10..4eeedf14711b 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -39,7 +39,6 @@ enum TRI_STATE { #define COMP_ENTRY_SIZE 64 #define RX_BUFFERS_PER_QUEUE 512 -#define MANA_RX_DATA_ALIGN 64 #define MAX_SEND_BUFFERS_PER_QUEUE 256 -- cgit v1.2.3 From 592447f6cb3c20d606d6c5d8e6af68e99707b786 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 2 Apr 2024 13:27:01 +0000 Subject: memblock tests: fix undefined reference to `BIT' commit 772dd0342727 ("mm: enumerate all gfp flags") define gfp flags with the help of BIT, while gfp_types.h doesn't include header file for the definition. This through an error on building memblock tests. Let's include linux/bits.h to fix it. Signed-off-by: Wei Yang CC: Suren Baghdasaryan CC: Michal Hocko Link: https://lore.kernel.org/r/20240402132701.29744-4-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- include/linux/gfp_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 868c8fb1bbc1..13becafe41df 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -2,6 +2,8 @@ #ifndef __LINUX_GFP_TYPES_H #define __LINUX_GFP_TYPES_H +#include + /* The typedef is in types.h but we want the documentation here */ #if 0 /** -- cgit v1.2.3 From bc6f707fc0feec72acc2f49c312eb31d257363a3 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 27 Mar 2024 16:43:16 +0100 Subject: x86/cc: Add cc_platform_set/_clear() helpers Add functionality to set and/or clear different attributes of the machine as a confidential computing platform. Add the first one too: whether the machine is running as a host for SEV-SNP guests. Fixes: 216d106c7ff7 ("x86/sev: Add SEV-SNP host initialization support") Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Tested-by: Srikanth Aithal Link: https://lore.kernel.org/r/20240327154317.29909-5-bp@alien8.de --- arch/x86/coco/core.c | 52 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/cc_platform.h | 12 +++++++++++ 2 files changed, 64 insertions(+) (limited to 'include') diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index ddd4efdc79d6..b31ef2424d19 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -20,6 +20,11 @@ enum cc_vendor cc_vendor __ro_after_init = CC_VENDOR_NONE; u64 cc_mask __ro_after_init; +static struct cc_attr_flags { + __u64 host_sev_snp : 1, + __resv : 63; +} cc_flags; + static bool noinstr intel_cc_platform_has(enum cc_attr attr) { switch (attr) { @@ -93,6 +98,9 @@ static bool noinstr amd_cc_platform_has(enum cc_attr attr) case CC_ATTR_GUEST_SEV_SNP: return sev_status & MSR_AMD64_SEV_SNP_ENABLED; + case CC_ATTR_HOST_SEV_SNP: + return cc_flags.host_sev_snp; + default: return false; } @@ -153,6 +161,50 @@ u64 cc_mkdec(u64 val) } EXPORT_SYMBOL_GPL(cc_mkdec); +static void amd_cc_platform_clear(enum cc_attr attr) +{ + switch (attr) { + case CC_ATTR_HOST_SEV_SNP: + cc_flags.host_sev_snp = 0; + break; + default: + break; + } +} + +void cc_platform_clear(enum cc_attr attr) +{ + switch (cc_vendor) { + case CC_VENDOR_AMD: + amd_cc_platform_clear(attr); + break; + default: + break; + } +} + +static void amd_cc_platform_set(enum cc_attr attr) +{ + switch (attr) { + case CC_ATTR_HOST_SEV_SNP: + cc_flags.host_sev_snp = 1; + break; + default: + break; + } +} + +void cc_platform_set(enum cc_attr attr) +{ + switch (cc_vendor) { + case CC_VENDOR_AMD: + amd_cc_platform_set(attr); + break; + default: + break; + } +} + __init void cc_random_init(void) { /* diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h index cb0d6cd1c12f..60693a145894 100644 --- a/include/linux/cc_platform.h +++ b/include/linux/cc_platform.h @@ -90,6 +90,14 @@ enum cc_attr { * Examples include TDX Guest. */ CC_ATTR_HOTPLUG_DISABLED, + + /** + * @CC_ATTR_HOST_SEV_SNP: AMD SNP enabled on the host. + * + * The host kernel is running with the necessary features + * enabled to run SEV-SNP guests. + */ + CC_ATTR_HOST_SEV_SNP, }; #ifdef CONFIG_ARCH_HAS_CC_PLATFORM @@ -107,10 +115,14 @@ enum cc_attr { * * FALSE - Specified Confidential Computing attribute is not active */ bool cc_platform_has(enum cc_attr attr); +void cc_platform_set(enum cc_attr attr); +void cc_platform_clear(enum cc_attr attr); #else /* !CONFIG_ARCH_HAS_CC_PLATFORM */ static inline bool cc_platform_has(enum cc_attr attr) { return false; } +static inline void cc_platform_set(enum cc_attr attr) { } +static inline void cc_platform_clear(enum cc_attr attr) { } #endif /* CONFIG_ARCH_HAS_CC_PLATFORM */ -- cgit v1.2.3 From 8130b05c559d1aa83d0c8971b422ba0da18ef24a Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 4 Apr 2024 12:42:00 +0200 Subject: PM: EM: fix wrong utilization estimation in em_cpu_energy() Commit 1b600da51073 ("PM: EM: Optimize em_cpu_energy() and remove division") has added back map_util_perf() in em_cpu_energy() computation which has been removed with the rework of scheduler/cpufreq interface. This is wrong because sugov_effective_cpu_perf() already takes care of mapping the utilization to a performance level. Fixes: 1b600da51073 ("PM: EM: Optimize em_cpu_energy() and remove division") Signed-off-by: Vincent Guittot Reviewed-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 770755df852f..70cd7258cd29 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -245,7 +245,6 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * max utilization to the allowed CPU capacity before calculating * effective performance. */ - max_util = map_util_perf(max_util); max_util = min(max_util, allowed_cpu_cap); /* -- cgit v1.2.3 From 07ed11afb68d94eadd4ffc082b97c2331307c5ea Mon Sep 17 00:00:00 2001 From: Alex Constantino Date: Thu, 4 Apr 2024 19:14:48 +0100 Subject: Revert "drm/qxl: simplify qxl_fence_wait" This reverts commit 5a838e5d5825c85556011478abde708251cc0776. Changes from commit 5a838e5d5825 ("drm/qxl: simplify qxl_fence_wait") would result in a '[TTM] Buffer eviction failed' exception whenever it reached a timeout. Due to a dependency to DMA_FENCE_WARN this also restores some code deleted by commit d72277b6c37d ("dma-buf: nuke DMA_FENCE_TRACE macros v2"). Fixes: 5a838e5d5825 ("drm/qxl: simplify qxl_fence_wait") Link: https://lore.kernel.org/regressions/ZTgydqRlK6WX_b29@eldamar.lan/ Reported-by: Timo Lindfors Closes: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1054514 Signed-off-by: Alex Constantino Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20240404181448.1643-2-dreaming.about.electric.sheep@gmail.com --- drivers/gpu/drm/qxl/qxl_release.c | 50 +++++++++++++++++++++++++++++++++++---- include/linux/dma-fence.h | 7 ++++++ 2 files changed, 52 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/qxl/qxl_release.c b/drivers/gpu/drm/qxl/qxl_release.c index 368d26da0d6a..9febc8b73f09 100644 --- a/drivers/gpu/drm/qxl/qxl_release.c +++ b/drivers/gpu/drm/qxl/qxl_release.c @@ -58,16 +58,56 @@ static long qxl_fence_wait(struct dma_fence *fence, bool intr, signed long timeout) { struct qxl_device *qdev; + struct qxl_release *release; + int count = 0, sc = 0; + bool have_drawable_releases; unsigned long cur, end = jiffies + timeout; qdev = container_of(fence->lock, struct qxl_device, release_lock); + release = container_of(fence, struct qxl_release, base); + have_drawable_releases = release->type == QXL_RELEASE_DRAWABLE; - if (!wait_event_timeout(qdev->release_event, - (dma_fence_is_signaled(fence) || - (qxl_io_notify_oom(qdev), 0)), - timeout)) - return 0; +retry: + sc++; + + if (dma_fence_is_signaled(fence)) + goto signaled; + + qxl_io_notify_oom(qdev); + + for (count = 0; count < 11; count++) { + if (!qxl_queue_garbage_collect(qdev, true)) + break; + + if (dma_fence_is_signaled(fence)) + goto signaled; + } + + if (dma_fence_is_signaled(fence)) + goto signaled; + + if (have_drawable_releases || sc < 4) { + if (sc > 2) + /* back off */ + usleep_range(500, 1000); + + if (time_after(jiffies, end)) + return 0; + + if (have_drawable_releases && sc > 300) { + DMA_FENCE_WARN(fence, + "failed to wait on release %llu after spincount %d\n", + fence->context & ~0xf0000000, sc); + goto signaled; + } + goto retry; + } + /* + * yeah, original sync_obj_wait gave up after 3 spins when + * have_drawable_releases is not set. + */ +signaled: cur = jiffies; if (time_after(cur, end)) return 0; diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index e06bad467f55..c3f9bb6602ba 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -682,4 +682,11 @@ static inline bool dma_fence_is_container(struct dma_fence *fence) return dma_fence_is_array(fence) || dma_fence_is_chain(fence); } +#define DMA_FENCE_WARN(f, fmt, args...) \ + do { \ + struct dma_fence *__ff = (f); \ + pr_warn("f %llu#%llu: " fmt, __ff->context, __ff->seqno,\ + ##args); \ + } while (0) + #endif /* __LINUX_DMA_FENCE_H */ -- cgit v1.2.3 From 65291dcfcf8936e1b23cfd7718fdfde7cfaf7706 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 26 Mar 2024 15:32:08 +0100 Subject: mm/secretmem: fix GUP-fast succeeding on secretmem folios folio_is_secretmem() currently relies on secretmem folios being LRU folios, to save some cycles. However, folios might reside in a folio batch without the LRU flag set, or temporarily have their LRU flag cleared. Consequently, the LRU flag is unreliable for this purpose. In particular, this is the case when secretmem_fault() allocates a fresh page and calls filemap_add_folio()->folio_add_lru(). The folio might be added to the per-cpu folio batch and won't get the LRU flag set until the batch was drained using e.g., lru_add_drain(). Consequently, folio_is_secretmem() might not detect secretmem folios and GUP-fast can succeed in grabbing a secretmem folio, crashing the kernel when we would later try reading/writing to the folio, because the folio has been unmapped from the directmap. Fix it by removing that unreliable check. Link: https://lkml.kernel.org/r/20240326143210.291116-2-david@redhat.com Fixes: 1507f51255c9 ("mm: introduce memfd_secret system call to create "secret" memory areas") Signed-off-by: David Hildenbrand Reported-by: xingwei lee Reported-by: yue sun Closes: https://lore.kernel.org/lkml/CABOYnLyevJeravW=QrH0JUPYEcDN160aZFb7kwndm-J2rmz0HQ@mail.gmail.com/ Debugged-by: Miklos Szeredi Tested-by: Miklos Szeredi Reviewed-by: Mike Rapoport (IBM) Cc: Lorenzo Stoakes Cc: Signed-off-by: Andrew Morton --- include/linux/secretmem.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h index 35f3a4a8ceb1..acf7e1a3f3de 100644 --- a/include/linux/secretmem.h +++ b/include/linux/secretmem.h @@ -13,10 +13,10 @@ static inline bool folio_is_secretmem(struct folio *folio) /* * Using folio_mapping() is quite slow because of the actual call * instruction. - * We know that secretmem pages are not compound and LRU so we can + * We know that secretmem pages are not compound, so we can * save a couple of cycles here. */ - if (folio_test_large(folio) || !folio_test_lru(folio)) + if (folio_test_large(folio)) return false; mapping = (struct address_space *) -- cgit v1.2.3 From a6c1d9cb9a68bfa4512248419c4f4d880d19fe90 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Mon, 1 Apr 2024 17:14:58 -0700 Subject: stackdepot: rename pool_index to pool_index_plus_1 Commit 3ee34eabac2a ("lib/stackdepot: fix first entry having a 0-handle") changed the meaning of the pool_index field to mean "the pool index plus 1". This made the code accessing this field less self-documenting, as well as causing debuggers such as drgn to not be able to easily remain compatible with both old and new kernels, because they typically do that by testing for presence of the new field. Because stackdepot is a debugging tool, we should make sure that it is debugger friendly. Therefore, give the field a different name to improve readability as well as enabling debugger backwards compatibility. This is needed in 6.9, which would otherwise become an odd release with the new semantics and old name so debuggers wouldn't recognize the new semantics there. Fixes: 3ee34eabac2a ("lib/stackdepot: fix first entry having a 0-handle") Link: https://lkml.kernel.org/r/20240402001500.53533-1-pcc@google.com Link: https://linux-review.googlesource.com/id/Ib3e70c36c1d230dd0a118dc22649b33e768b9f88 Signed-off-by: Peter Collingbourne Acked-by: Vlastimil Babka Reviewed-by: Alexander Potapenko Acked-by: Marco Elver Acked-by: Oscar Salvador Cc: Andrey Konovalov Cc: Michal Hocko Cc: Omar Sandoval Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 7 +++---- lib/stackdepot.c | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 3c6caa5abc7c..e9ec32fb97d4 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -44,10 +44,9 @@ typedef u32 depot_stack_handle_t; union handle_parts { depot_stack_handle_t handle; struct { - /* pool_index is offset by 1 */ - u32 pool_index : DEPOT_POOL_INDEX_BITS; - u32 offset : DEPOT_OFFSET_BITS; - u32 extra : STACK_DEPOT_EXTRA_BITS; + u32 pool_index_plus_1 : DEPOT_POOL_INDEX_BITS; + u32 offset : DEPOT_OFFSET_BITS; + u32 extra : STACK_DEPOT_EXTRA_BITS; }; }; diff --git a/lib/stackdepot.c b/lib/stackdepot.c index af6cc19a2003..68c97387aa54 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -330,7 +330,7 @@ static struct stack_record *depot_pop_free_pool(void **prealloc, size_t size) stack = current_pool + pool_offset; /* Pre-initialize handle once. */ - stack->handle.pool_index = pool_index + 1; + stack->handle.pool_index_plus_1 = pool_index + 1; stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN; stack->handle.extra = 0; INIT_LIST_HEAD(&stack->hash_list); @@ -441,7 +441,7 @@ static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle) const int pools_num_cached = READ_ONCE(pools_num); union handle_parts parts = { .handle = handle }; void *pool; - u32 pool_index = parts.pool_index - 1; + u32 pool_index = parts.pool_index_plus_1 - 1; size_t offset = parts.offset << DEPOT_STACK_ALIGN; struct stack_record *stack; -- cgit v1.2.3 From 38a15d0a50e0a43778561a5861403851f0b0194c Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Thu, 4 Apr 2024 09:57:40 +0200 Subject: u64_stats: fix u64_stats_init() for lockdep when used repeatedly in one file Fix bogus lockdep warnings if multiple u64_stats_sync variables are initialized in the same file. With CONFIG_LOCKDEP, seqcount_init() is a macro which declares: static struct lock_class_key __key; Since u64_stats_init() is a function (albeit an inline one), all calls within the same file end up using the same instance, effectively treating them all as a single lock-class. Fixes: 9464ca650008 ("net: make u64_stats_init() a function") Closes: https://lore.kernel.org/netdev/ea1567d9-ce66-45e6-8168-ac40a47d1821@roeck-us.net/ Signed-off-by: Petr Tesarik Reviewed-by: Simon Horman Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240404075740.30682-1-petr@tesarici.cz Signed-off-by: Jakub Kicinski --- include/linux/u64_stats_sync.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index ffe48e69b3f3..457879938fc1 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -135,10 +135,11 @@ static inline void u64_stats_inc(u64_stats_t *p) p->v++; } -static inline void u64_stats_init(struct u64_stats_sync *syncp) -{ - seqcount_init(&syncp->seq); -} +#define u64_stats_init(syncp) \ + do { \ + struct u64_stats_sync *__s = (syncp); \ + seqcount_init(&__s->seq); \ + } while (0) static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp) { -- cgit v1.2.3 From ffe6176b7f53ca0c99355f13e14a33a40cf49406 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 31 Mar 2024 10:43:48 +0200 Subject: virtio: store owner from modules with register_virtio_driver() Modules registering driver with register_virtio_driver() might forget to set .owner field. i2c-virtio.c for example has it missing. The field is used by some other kernel parts for reference counting (try_module_get()), so it is expected that drivers will set it. Solve the problem by moving this task away from the drivers to the core virtio code, just like we did for platform_driver in commit 9447057eaff8 ("platform_device: use a macro instead of platform_driver_register"). Fixes: 3cfc88380413 ("i2c: virtio: add a virtio i2c frontend driver") Cc: "Jie Deng" Signed-off-by: Krzysztof Kozlowski Message-Id: <20240331-module-owner-virtio-v2-1-98f04bfaf46a@linaro.org> Signed-off-by: Michael S. Tsirkin --- Documentation/driver-api/virtio/writing_virtio_drivers.rst | 1 - drivers/virtio/virtio.c | 6 ++++-- include/linux/virtio.h | 7 +++++-- 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/Documentation/driver-api/virtio/writing_virtio_drivers.rst b/Documentation/driver-api/virtio/writing_virtio_drivers.rst index e14c58796d25..e5de6f5d061a 100644 --- a/Documentation/driver-api/virtio/writing_virtio_drivers.rst +++ b/Documentation/driver-api/virtio/writing_virtio_drivers.rst @@ -97,7 +97,6 @@ like this:: static struct virtio_driver virtio_dummy_driver = { .driver.name = KBUILD_MODNAME, - .driver.owner = THIS_MODULE, .id_table = id_table, .probe = virtio_dummy_probe, .remove = virtio_dummy_remove, diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index f173587893cb..9510c551dce8 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -362,14 +362,16 @@ static const struct bus_type virtio_bus = { .remove = virtio_dev_remove, }; -int register_virtio_driver(struct virtio_driver *driver) +int __register_virtio_driver(struct virtio_driver *driver, struct module *owner) { /* Catch this early. */ BUG_ON(driver->feature_table_size && !driver->feature_table); driver->driver.bus = &virtio_bus; + driver->driver.owner = owner; + return driver_register(&driver->driver); } -EXPORT_SYMBOL_GPL(register_virtio_driver); +EXPORT_SYMBOL_GPL(__register_virtio_driver); void unregister_virtio_driver(struct virtio_driver *driver) { diff --git a/include/linux/virtio.h b/include/linux/virtio.h index b0201747a263..26c4325aa373 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -170,7 +170,7 @@ size_t virtio_max_dma_size(const struct virtio_device *vdev); /** * struct virtio_driver - operations for a virtio I/O driver - * @driver: underlying device driver (populate name and owner). + * @driver: underlying device driver (populate name). * @id_table: the ids serviced by this driver. * @feature_table: an array of feature numbers supported by this driver. * @feature_table_size: number of entries in the feature table array. @@ -208,7 +208,10 @@ static inline struct virtio_driver *drv_to_virtio(struct device_driver *drv) return container_of(drv, struct virtio_driver, driver); } -int register_virtio_driver(struct virtio_driver *drv); +/* use a macro to avoid include chaining to get THIS_MODULE */ +#define register_virtio_driver(drv) \ + __register_virtio_driver(drv, THIS_MODULE) +int __register_virtio_driver(struct virtio_driver *drv, struct module *owner); void unregister_virtio_driver(struct virtio_driver *drv); /* module_virtio_driver() - Helper macro for drivers that don't do -- cgit v1.2.3 From 2855c2a7820bc8198ae937a9a67dbdc3990e9d2c Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 2 Apr 2024 17:21:43 -0400 Subject: vhost-vdpa: change ioctl # for VDPA_GET_VRING_SIZE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VDPA_GET_VRING_SIZE by mistake uses the already occupied ioctl # 0x80 and we never noticed - it happens to work because the direction and size are different, but confuses tools such as perf which like to look at just the number, and breaks the extra robustness of the ioctl numbering macros. To fix, sort the entries and renumber the ioctl - not too late since it wasn't in any released kernels yet. Cc: Arnaldo Carvalho de Melo Reported-by: Namhyung Kim Fixes: 1496c47065f9 ("vhost-vdpa: uapi to support reporting per vq size") Cc: "Zhu Lingshan" Signed-off-by: Michael S. Tsirkin Message-Id: <41c1c5489688abe5bfef9f7cf15584e3fb872ac5.1712092759.git.mst@redhat.com> Reviewed-by: Eugenio Pérez Reviewed-by: Zhu Lingshan Reviewed-by: Stefano Garzarella Acked-by: Jason Wang --- include/uapi/linux/vhost.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index bea697390613..b95dd84eef2d 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -179,12 +179,6 @@ /* Get the config size */ #define VHOST_VDPA_GET_CONFIG_SIZE _IOR(VHOST_VIRTIO, 0x79, __u32) -/* Get the count of all virtqueues */ -#define VHOST_VDPA_GET_VQS_COUNT _IOR(VHOST_VIRTIO, 0x80, __u32) - -/* Get the number of virtqueue groups. */ -#define VHOST_VDPA_GET_GROUP_NUM _IOR(VHOST_VIRTIO, 0x81, __u32) - /* Get the number of address spaces. */ #define VHOST_VDPA_GET_AS_NUM _IOR(VHOST_VIRTIO, 0x7A, unsigned int) @@ -228,10 +222,17 @@ #define VHOST_VDPA_GET_VRING_DESC_GROUP _IOWR(VHOST_VIRTIO, 0x7F, \ struct vhost_vring_state) + +/* Get the count of all virtqueues */ +#define VHOST_VDPA_GET_VQS_COUNT _IOR(VHOST_VIRTIO, 0x80, __u32) + +/* Get the number of virtqueue groups. */ +#define VHOST_VDPA_GET_GROUP_NUM _IOR(VHOST_VIRTIO, 0x81, __u32) + /* Get the queue size of a specific virtqueue. * userspace set the vring index in vhost_vring_state.index * kernel set the queue size in vhost_vring_state.num */ -#define VHOST_VDPA_GET_VRING_SIZE _IOWR(VHOST_VIRTIO, 0x80, \ +#define VHOST_VDPA_GET_VRING_SIZE _IOWR(VHOST_VIRTIO, 0x82, \ struct vhost_vring_state) #endif -- cgit v1.2.3 From d8a6213d70accb403b82924a1c229e733433a5ef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Apr 2024 10:30:34 +0000 Subject: geneve: fix header validation in geneve[6]_xmit_skb syzbot is able to trigger an uninit-value in geneve_xmit() [1] Problem : While most ip tunnel helpers (like ip_tunnel_get_dsfield()) uses skb_protocol(skb, true), pskb_inet_may_pull() is only using skb->protocol. If anything else than ETH_P_IPV6 or ETH_P_IP is found in skb->protocol, pskb_inet_may_pull() does nothing at all. If a vlan tag was provided by the caller (af_packet in the syzbot case), the network header might not point to the correct location, and skb linear part could be smaller than expected. Add skb_vlan_inet_prepare() to perform a complete mac validation. Use this in geneve for the moment, I suspect we need to adopt this more broadly. v4 - Jakub reported v3 broke l2_tos_ttl_inherit.sh selftest - Only call __vlan_get_protocol() for vlan types. Link: https://lore.kernel.org/netdev/20240404100035.3270a7d5@kernel.org/ v2,v3 - Addressed Sabrina comments on v1 and v2 Link: https://lore.kernel.org/netdev/Zg1l9L2BNoZWZDZG@hog/ [1] BUG: KMSAN: uninit-value in geneve_xmit_skb drivers/net/geneve.c:910 [inline] BUG: KMSAN: uninit-value in geneve_xmit+0x302d/0x5420 drivers/net/geneve.c:1030 geneve_xmit_skb drivers/net/geneve.c:910 [inline] geneve_xmit+0x302d/0x5420 drivers/net/geneve.c:1030 __netdev_start_xmit include/linux/netdevice.h:4903 [inline] netdev_start_xmit include/linux/netdevice.h:4917 [inline] xmit_one net/core/dev.c:3531 [inline] dev_hard_start_xmit+0x247/0xa20 net/core/dev.c:3547 __dev_queue_xmit+0x348d/0x52c0 net/core/dev.c:4335 dev_queue_xmit include/linux/netdevice.h:3091 [inline] packet_xmit+0x9c/0x6c0 net/packet/af_packet.c:276 packet_snd net/packet/af_packet.c:3081 [inline] packet_sendmsg+0x8bb0/0x9ef0 net/packet/af_packet.c:3113 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:745 __sys_sendto+0x685/0x830 net/socket.c:2191 __do_sys_sendto net/socket.c:2203 [inline] __se_sys_sendto net/socket.c:2199 [inline] __x64_sys_sendto+0x125/0x1d0 net/socket.c:2199 do_syscall_64+0xd5/0x1f0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 Uninit was created at: slab_post_alloc_hook mm/slub.c:3804 [inline] slab_alloc_node mm/slub.c:3845 [inline] kmem_cache_alloc_node+0x613/0xc50 mm/slub.c:3888 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:577 __alloc_skb+0x35b/0x7a0 net/core/skbuff.c:668 alloc_skb include/linux/skbuff.h:1318 [inline] alloc_skb_with_frags+0xc8/0xbf0 net/core/skbuff.c:6504 sock_alloc_send_pskb+0xa81/0xbf0 net/core/sock.c:2795 packet_alloc_skb net/packet/af_packet.c:2930 [inline] packet_snd net/packet/af_packet.c:3024 [inline] packet_sendmsg+0x722d/0x9ef0 net/packet/af_packet.c:3113 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:745 __sys_sendto+0x685/0x830 net/socket.c:2191 __do_sys_sendto net/socket.c:2203 [inline] __se_sys_sendto net/socket.c:2199 [inline] __x64_sys_sendto+0x125/0x1d0 net/socket.c:2199 do_syscall_64+0xd5/0x1f0 entry_SYSCALL_64_after_hwframe+0x6d/0x75 CPU: 0 PID: 5033 Comm: syz-executor346 Not tainted 6.9.0-rc1-syzkaller-00005-g928a87efa423 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/29/2024 Fixes: d13f048dd40e ("net: geneve: modify IP header check in geneve6_xmit_skb and geneve_xmit_skb") Reported-by: syzbot+9ee20ec1de7b3168db09@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/000000000000d19c3a06152f9ee4@google.com/ Signed-off-by: Eric Dumazet Cc: Phillip Potter Cc: Sabrina Dubroca Reviewed-by: Sabrina Dubroca Reviewed-by: Phillip Potter Signed-off-by: David S. Miller --- drivers/net/geneve.c | 4 ++-- include/net/ip_tunnels.h | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 2f6739fe78af..6c2835086b57 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -822,7 +822,7 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be16 sport; int err; - if (!pskb_inet_may_pull(skb)) + if (!skb_vlan_inet_prepare(skb)) return -EINVAL; if (!gs4) @@ -929,7 +929,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be16 sport; int err; - if (!pskb_inet_may_pull(skb)) + if (!skb_vlan_inet_prepare(skb)) return -EINVAL; if (!gs6) diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 5cd64bb2104d..c286cc2e766e 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -361,6 +361,39 @@ static inline bool pskb_inet_may_pull(struct sk_buff *skb) return pskb_network_may_pull(skb, nhlen); } +/* Variant of pskb_inet_may_pull(). + */ +static inline bool skb_vlan_inet_prepare(struct sk_buff *skb) +{ + int nhlen = 0, maclen = ETH_HLEN; + __be16 type = skb->protocol; + + /* Essentially this is skb_protocol(skb, true) + * And we get MAC len. + */ + if (eth_type_vlan(type)) + type = __vlan_get_protocol(skb, type, &maclen); + + switch (type) { +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + nhlen = sizeof(struct ipv6hdr); + break; +#endif + case htons(ETH_P_IP): + nhlen = sizeof(struct iphdr); + break; + } + /* For ETH_P_IPV6/ETH_P_IP we make sure to pull + * a base network header in skb->head. + */ + if (!pskb_may_pull(skb, maclen + nhlen)) + return false; + + skb_set_network_header(skb, maclen); + return true; +} + static inline int ip_encap_hlen(struct ip_tunnel_encap *e) { const struct ip_tunnel_encap_ops *ops; -- cgit v1.2.3 From c1d11fc2c8320871b40730991071dd0a0b405bc8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Apr 2024 09:46:01 +0200 Subject: irqflags: Explicitly ignore lockdep_hrtimer_exit() argument When building with 'make W=1' but CONFIG_TRACE_IRQFLAGS=n, the unused argument to lockdep_hrtimer_exit() causes a warning: kernel/time/hrtimer.c:1655:14: error: variable 'expires_in_hardirq' set but not used [-Werror=unused-but-set-variable] This is intentional behavior, so add a cast to void to shut up the warning. Fixes: 73d20564e0dc ("hrtimer: Don't dereference the hrtimer pointer after the callback") Reported-by: kernel test robot Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Reviewed-by: Sebastian Andrzej Siewior Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20240408074609.3170807-1-arnd@kernel.org Closes: https://lore.kernel.org/oe-kbuild-all/202311191229.55QXHVc6-lkp@intel.com/ --- include/linux/irqflags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 147feebd508c..3f003d5fde53 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -114,7 +114,7 @@ do { \ # define lockdep_softirq_enter() do { } while (0) # define lockdep_softirq_exit() do { } while (0) # define lockdep_hrtimer_enter(__hrtimer) false -# define lockdep_hrtimer_exit(__context) do { } while (0) +# define lockdep_hrtimer_exit(__context) do { (void)(__context); } while (0) # define lockdep_posixtimer_enter() do { } while (0) # define lockdep_posixtimer_exit() do { } while (0) # define lockdep_irq_work_enter(__work) do { } while (0) -- cgit v1.2.3 From fa1f51162338b3e2f520d4bfedc42b3b2e00da6d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 19 Mar 2024 19:20:50 +0100 Subject: locking: Make rwsem_assert_held_write_nolockdep() build with PREEMPT_RT=y The commit cited below broke the build for PREEMPT_RT because rwsem_assert_held_write_nolockdep() passes a struct rw_semaphore but rw_base_assert_held_write() expects struct rwbase_rt. Fixing the type alone leads to the problem that WARN_ON() is not found because bug.h is missing. In order to resolve this: - Keep the assert (WARN_ON()) in rwsem.h (not rwbase_rt.h) - Make rwsem_assert_held_write_nolockdep() do the implementation specific (rw_base) writer check. - Replace the "inline" with __always_inline which was used before. Fixes: f70405afc99b1 ("locking: Add rwsem_assert_held() and rwsem_assert_held_write()") Reported-by: Clark Williams Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Reviewed-by: Waiman Long Link: https://lore.kernel.org/r/20240319182050.U4AzUF3I@linutronix.de --- include/linux/rwbase_rt.h | 4 ++-- include/linux/rwsem.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/rwbase_rt.h b/include/linux/rwbase_rt.h index 29c4e4f243e4..f2394a409c9d 100644 --- a/include/linux/rwbase_rt.h +++ b/include/linux/rwbase_rt.h @@ -31,9 +31,9 @@ static __always_inline bool rw_base_is_locked(const struct rwbase_rt *rwb) return atomic_read(&rwb->readers) != READER_BIAS; } -static inline void rw_base_assert_held_write(const struct rwbase_rt *rwb) +static __always_inline bool rw_base_is_write_locked(const struct rwbase_rt *rwb) { - WARN_ON(atomic_read(&rwb->readers) != WRITER_BIAS); + return atomic_read(&rwb->readers) == WRITER_BIAS; } static __always_inline bool rw_base_is_contended(const struct rwbase_rt *rwb) diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 4f1c18992f76..c8b543d428b0 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -167,14 +167,14 @@ static __always_inline int rwsem_is_locked(const struct rw_semaphore *sem) return rw_base_is_locked(&sem->rwbase); } -static inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem) +static __always_inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem) { WARN_ON(!rwsem_is_locked(sem)); } -static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem) +static __always_inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem) { - rw_base_assert_held_write(sem); + WARN_ON(!rw_base_is_write_locked(&sem->rwbase)); } static __always_inline int rwsem_is_contended(struct rw_semaphore *sem) -- cgit v1.2.3 From aca1a5287ea328fd1f7e2bfa6806646486d86a70 Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Thu, 28 Mar 2024 09:25:40 +0530 Subject: ACPI: bus: allow _UID matching for integer zero Commit b2b32a173881 ("ACPI: bus: update acpi_dev_hid_uid_match() to support multiple types") added _UID matching support for both integer and string types, which satisfies NULL @uid2 argument for string types using inversion, but this logic prevents _UID comparision in case the argument is integer 0, which may result in false positives. Fix this using _Generic(), which will allow NULL @uid2 argument for string types as well as _UID matching for all possible integer values. Fixes: b2b32a173881 ("ACPI: bus: update acpi_dev_hid_uid_match() to support multiple types") Signed-off-by: Raag Jadav [ rjw: Comment adjustment ] Signed-off-by: Rafael J. Wysocki --- include/acpi/acpi_bus.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 5de954e2b18a..e7796f373d0d 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -911,17 +911,19 @@ static inline bool acpi_int_uid_match(struct acpi_device *adev, u64 uid2) * acpi_dev_hid_uid_match - Match device by supplied HID and UID * @adev: ACPI device to match. * @hid2: Hardware ID of the device. - * @uid2: Unique ID of the device, pass 0 or NULL to not check _UID. + * @uid2: Unique ID of the device, pass NULL to not check _UID. * * Matches HID and UID in @adev with given @hid2 and @uid2. Absence of @uid2 * will be treated as a match. If user wants to validate @uid2, it should be * done before calling this function. * - * Returns: %true if matches or @uid2 is 0 or NULL, %false otherwise. + * Returns: %true if matches or @uid2 is NULL, %false otherwise. */ #define acpi_dev_hid_uid_match(adev, hid2, uid2) \ (acpi_dev_hid_match(adev, hid2) && \ - (!(uid2) || acpi_dev_uid_match(adev, uid2))) + /* Distinguish integer 0 from NULL @uid2 */ \ + (_Generic(uid2, ACPI_STR_TYPES(!(uid2)), default: 0) || \ + acpi_dev_uid_match(adev, uid2))) void acpi_dev_clear_dependencies(struct acpi_device *supplier); bool acpi_dev_ready_for_enumeration(const struct acpi_device *device); -- cgit v1.2.3 From c722cea208789d9e2660992bcd05fb9fac3adb56 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 8 Apr 2024 21:43:58 -0700 Subject: fs/proc: Skip bootloader comment if no embedded kernel parameters If the "bootconfig" kernel command-line argument was specified or if the kernel was built with CONFIG_BOOT_CONFIG_FORCE, but if there are no embedded kernel parameter, omit the "# Parameters from bootloader:" comment from the /proc/bootconfig file. This will cause automation to fall back to the /proc/cmdline file, which will be identical to the comment in this no-embedded-kernel-parameters case. Link: https://lore.kernel.org/all/20240409044358.1156477-2-paulmck@kernel.org/ Fixes: 8b8ce6c75430 ("fs/proc: remove redundant comments from /proc/bootconfig") Signed-off-by: Masami Hiramatsu Signed-off-by: Paul E. McKenney Cc: stable@vger.kernel.org Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- fs/proc/bootconfig.c | 2 +- include/linux/bootconfig.h | 1 + init/main.c | 5 +++++ 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c index e5635a6b127b..87dcaae32ff8 100644 --- a/fs/proc/bootconfig.c +++ b/fs/proc/bootconfig.c @@ -63,7 +63,7 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size) dst += ret; } } - if (ret >= 0 && boot_command_line[0]) { + if (cmdline_has_extra_options() && ret >= 0 && boot_command_line[0]) { ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n", boot_command_line); if (ret > 0) diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index ca73940e26df..e5ee2c694401 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -10,6 +10,7 @@ #ifdef __KERNEL__ #include #include +bool __init cmdline_has_extra_options(void); #else /* !__KERNEL__ */ /* * NOTE: This is only for tools/bootconfig, because tools/bootconfig will diff --git a/init/main.c b/init/main.c index 2ca52474d0c3..881f6230ee59 100644 --- a/init/main.c +++ b/init/main.c @@ -487,6 +487,11 @@ static int __init warn_bootconfig(char *str) early_param("bootconfig", warn_bootconfig); +bool __init cmdline_has_extra_options(void) +{ + return extra_command_line || extra_init_args; +} + /* Change NUL term back to "=", to make "param" the whole string. */ static void __init repair_env_string(char *param, char *val) { -- cgit v1.2.3 From d7a62d0a9a17e97ce2d4d40431094e09956a0568 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 9 Apr 2024 17:46:23 +0200 Subject: compiler.h: Add missing quote in macro comment Add a missing doublequote in the __is_constexpr() macro comment. Signed-off-by: Thorsten Blum Signed-off-by: Linus Torvalds --- include/linux/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index c00cc6c0878a..8c252e073bd8 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -268,7 +268,7 @@ static inline void *offset_to_ptr(const int *off) * - When one operand is a null pointer constant (i.e. when x is an integer * constant expression) and the other is an object pointer (i.e. our * third operand), the conditional operator returns the type of the - * object pointer operand (i.e. "int *). Here, within the sizeof(), we + * object pointer operand (i.e. "int *"). Here, within the sizeof(), we * would then get: * sizeof(*((int *)(...)) == sizeof(int) == 4 * - When one operand is a void pointer (i.e. when x is not an integer -- cgit v1.2.3 From ff81dade48608363136d52bb2493a6df76458b28 Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Wed, 10 Apr 2024 01:35:28 +0800 Subject: io-uring: correct typo in comment for IOU_F_TWQ_LAZY_WAKE The 'r' key is near to 't' key, that makes 'with' to be 'wirh' ? :) Signed-off-by: Haiyue Wang Link: https://lore.kernel.org/r/20240409173531.846714-1-haiyue.wang@intel.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 05df0e399d7c..ac333ea81d31 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -13,7 +13,7 @@ enum { * A hint to not wake right away but delay until there are enough of * tw's queued to match the number of CQEs the task is waiting for. * - * Must not be used wirh requests generating more than one CQE. + * Must not be used with requests generating more than one CQE. * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set. */ IOU_F_TWQ_LAZY_WAKE = 1, -- cgit v1.2.3 From 6309863b31dd80317cd7d6824820b44e254e2a9c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 8 Apr 2024 08:28:43 +0000 Subject: net: add copy_safe_from_sockptr() helper copy_from_sockptr() helper is unsafe, unless callers did the prior check against user provided optlen. Too many callers get this wrong, lets add a helper to fix them and avoid future copy/paste bugs. Instead of : if (optlen < sizeof(opt)) { err = -EINVAL; break; } if (copy_from_sockptr(&opt, optval, sizeof(opt)) { err = -EFAULT; break; } Use : err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20240408082845.3957374-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/sockptr.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include') diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index 307961b41541..317200cd3a60 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -50,11 +50,36 @@ static inline int copy_from_sockptr_offset(void *dst, sockptr_t src, return 0; } +/* Deprecated. + * This is unsafe, unless caller checked user provided optlen. + * Prefer copy_safe_from_sockptr() instead. + */ static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size) { return copy_from_sockptr_offset(dst, src, 0, size); } +/** + * copy_safe_from_sockptr: copy a struct from sockptr + * @dst: Destination address, in kernel space. This buffer must be @ksize + * bytes long. + * @ksize: Size of @dst struct. + * @optval: Source address. (in user or kernel space) + * @optlen: Size of @optval data. + * + * Returns: + * * -EINVAL: @optlen < @ksize + * * -EFAULT: access to userspace failed. + * * 0 : @ksize bytes were copied + */ +static inline int copy_safe_from_sockptr(void *dst, size_t ksize, + sockptr_t optval, unsigned int optlen) +{ + if (optlen < ksize) + return -EINVAL; + return copy_from_sockptr(dst, optval, ksize); +} + static inline int copy_struct_from_sockptr(void *dst, size_t ksize, sockptr_t src, size_t usize) { -- cgit v1.2.3 From 7633c4da919ad51164acbf1aa322cc1a3ead6129 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Mon, 8 Apr 2024 16:18:21 +0200 Subject: ipv6: fix race condition between ipv6_get_ifaddr and ipv6_del_addr Although ipv6_get_ifaddr walks inet6_addr_lst under the RCU lock, it still means hlist_for_each_entry_rcu can return an item that got removed from the list. The memory itself of such item is not freed thanks to RCU but nothing guarantees the actual content of the memory is sane. In particular, the reference count can be zero. This can happen if ipv6_del_addr is called in parallel. ipv6_del_addr removes the entry from inet6_addr_lst (hlist_del_init_rcu(&ifp->addr_lst)) and drops all references (__in6_ifa_put(ifp) + in6_ifa_put(ifp)). With bad enough timing, this can happen: 1. In ipv6_get_ifaddr, hlist_for_each_entry_rcu returns an entry. 2. Then, the whole ipv6_del_addr is executed for the given entry. The reference count drops to zero and kfree_rcu is scheduled. 3. ipv6_get_ifaddr continues and tries to increments the reference count (in6_ifa_hold). 4. The rcu is unlocked and the entry is freed. 5. The freed entry is returned. Prevent increasing of the reference count in such case. The name in6_ifa_hold_safe is chosen to mimic the existing fib6_info_hold_safe. [ 41.506330] refcount_t: addition on 0; use-after-free. [ 41.506760] WARNING: CPU: 0 PID: 595 at lib/refcount.c:25 refcount_warn_saturate+0xa5/0x130 [ 41.507413] Modules linked in: veth bridge stp llc [ 41.507821] CPU: 0 PID: 595 Comm: python3 Not tainted 6.9.0-rc2.main-00208-g49563be82afa #14 [ 41.508479] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) [ 41.509163] RIP: 0010:refcount_warn_saturate+0xa5/0x130 [ 41.509586] Code: ad ff 90 0f 0b 90 90 c3 cc cc cc cc 80 3d c0 30 ad 01 00 75 a0 c6 05 b7 30 ad 01 01 90 48 c7 c7 38 cc 7a 8c e8 cc 18 ad ff 90 <0f> 0b 90 90 c3 cc cc cc cc 80 3d 98 30 ad 01 00 0f 85 75 ff ff ff [ 41.510956] RSP: 0018:ffffbda3c026baf0 EFLAGS: 00010282 [ 41.511368] RAX: 0000000000000000 RBX: ffff9e9c46914800 RCX: 0000000000000000 [ 41.511910] RDX: ffff9e9c7ec29c00 RSI: ffff9e9c7ec1c900 RDI: ffff9e9c7ec1c900 [ 41.512445] RBP: ffff9e9c43660c9c R08: 0000000000009ffb R09: 00000000ffffdfff [ 41.512998] R10: 00000000ffffdfff R11: ffffffff8ca58a40 R12: ffff9e9c4339a000 [ 41.513534] R13: 0000000000000001 R14: ffff9e9c438a0000 R15: ffffbda3c026bb48 [ 41.514086] FS: 00007fbc4cda1740(0000) GS:ffff9e9c7ec00000(0000) knlGS:0000000000000000 [ 41.514726] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 41.515176] CR2: 000056233b337d88 CR3: 000000000376e006 CR4: 0000000000370ef0 [ 41.515713] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 41.516252] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 41.516799] Call Trace: [ 41.517037] [ 41.517249] ? __warn+0x7b/0x120 [ 41.517535] ? refcount_warn_saturate+0xa5/0x130 [ 41.517923] ? report_bug+0x164/0x190 [ 41.518240] ? handle_bug+0x3d/0x70 [ 41.518541] ? exc_invalid_op+0x17/0x70 [ 41.520972] ? asm_exc_invalid_op+0x1a/0x20 [ 41.521325] ? refcount_warn_saturate+0xa5/0x130 [ 41.521708] ipv6_get_ifaddr+0xda/0xe0 [ 41.522035] inet6_rtm_getaddr+0x342/0x3f0 [ 41.522376] ? __pfx_inet6_rtm_getaddr+0x10/0x10 [ 41.522758] rtnetlink_rcv_msg+0x334/0x3d0 [ 41.523102] ? netlink_unicast+0x30f/0x390 [ 41.523445] ? __pfx_rtnetlink_rcv_msg+0x10/0x10 [ 41.523832] netlink_rcv_skb+0x53/0x100 [ 41.524157] netlink_unicast+0x23b/0x390 [ 41.524484] netlink_sendmsg+0x1f2/0x440 [ 41.524826] __sys_sendto+0x1d8/0x1f0 [ 41.525145] __x64_sys_sendto+0x1f/0x30 [ 41.525467] do_syscall_64+0xa5/0x1b0 [ 41.525794] entry_SYSCALL_64_after_hwframe+0x72/0x7a [ 41.526213] RIP: 0033:0x7fbc4cfcea9a [ 41.526528] Code: d8 64 89 02 48 c7 c0 ff ff ff ff eb b8 0f 1f 00 f3 0f 1e fa 41 89 ca 64 8b 04 25 18 00 00 00 85 c0 75 15 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 7e c3 0f 1f 44 00 00 41 54 48 83 ec 30 44 89 [ 41.527942] RSP: 002b:00007ffcf54012a8 EFLAGS: 00000246 ORIG_RAX: 000000000000002c [ 41.528593] RAX: ffffffffffffffda RBX: 00007ffcf5401368 RCX: 00007fbc4cfcea9a [ 41.529173] RDX: 000000000000002c RSI: 00007fbc4b9d9bd0 RDI: 0000000000000005 [ 41.529786] RBP: 00007fbc4bafb040 R08: 00007ffcf54013e0 R09: 000000000000000c [ 41.530375] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 [ 41.530977] R13: ffffffffc4653600 R14: 0000000000000001 R15: 00007fbc4ca85d1b [ 41.531573] Fixes: 5c578aedcb21d ("IPv6: convert addrconf hash list to RCU") Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: Jiri Benc Link: https://lore.kernel.org/r/8ab821e36073a4a406c50ec83c9e8dc586c539e4.1712585809.git.jbenc@redhat.com Signed-off-by: Jakub Kicinski --- include/net/addrconf.h | 4 ++++ net/ipv6/addrconf.c | 7 ++++--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 9d06eb945509..62a407db1bf5 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -438,6 +438,10 @@ static inline void in6_ifa_hold(struct inet6_ifaddr *ifp) refcount_inc(&ifp->refcnt); } +static inline bool in6_ifa_hold_safe(struct inet6_ifaddr *ifp) +{ + return refcount_inc_not_zero(&ifp->refcnt); +} /* * compute link-local solicited-node multicast address diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 92db9b474f2b..779aa6ecdd49 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2091,9 +2091,10 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *add if (ipv6_addr_equal(&ifp->addr, addr)) { if (!dev || ifp->idev->dev == dev || !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) { - result = ifp; - in6_ifa_hold(ifp); - break; + if (in6_ifa_hold_safe(ifp)) { + result = ifp; + break; + } } } } -- cgit v1.2.3 From 0871bc0129d403747ea0272a4384895d7ad37a6c Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 10 Apr 2024 21:08:51 +0800 Subject: mm: Move lowmem_page_address() a little later LoongArch will override page_to_virt() which use page_address() in the KFENCE case (by defining WANT_PAGE_VIRTUAL/HASHED_PAGE_VIRTUAL). So move lowmem_page_address() a little later to avoid such build errors: error: implicit declaration of function 'page_address'. Acked-by: Andrew Morton Signed-off-by: Huacai Chen --- include/linux/mm.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0436b919f1c7..7b0ee64225de 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2207,11 +2207,6 @@ static inline int arch_make_folio_accessible(struct folio *folio) */ #include -static __always_inline void *lowmem_page_address(const struct page *page) -{ - return page_to_virt(page); -} - #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) #define HASHED_PAGE_VIRTUAL #endif @@ -2234,6 +2229,11 @@ void set_page_address(struct page *page, void *virtual); void page_address_init(void); #endif +static __always_inline void *lowmem_page_address(const struct page *page) +{ + return page_to_virt(page); +} + #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) #define page_address(page) lowmem_page_address(page) #define set_page_address(page, address) do { } while(0) -- cgit v1.2.3 From 51eda36d33e43201e7a4fd35232e069b2c850b01 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 5 Apr 2024 15:41:52 -0400 Subject: Bluetooth: SCO: Fix not validating setsockopt user input syzbot reported sco_sock_setsockopt() is copying data without checking user input length. BUG: KASAN: slab-out-of-bounds in copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr include/linux/sockptr.h:55 [inline] BUG: KASAN: slab-out-of-bounds in sco_sock_setsockopt+0xc0b/0xf90 net/bluetooth/sco.c:893 Read of size 4 at addr ffff88805f7b15a3 by task syz-executor.5/12578 Fixes: ad10b1a48754 ("Bluetooth: Add Bluetooth socket voice option") Fixes: b96e9c671b05 ("Bluetooth: Add BT_DEFER_SETUP option to sco socket") Fixes: 00398e1d5183 ("Bluetooth: Add support for BT_PKT_STATUS CMSG data for SCO connections") Fixes: f6873401a608 ("Bluetooth: Allow setting of codec for HFP offload use case") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/bluetooth.h | 9 +++++++++ net/bluetooth/sco.c | 23 ++++++++++------------- 2 files changed, 19 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 9fe95a22abeb..eaec5d6caa29 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -585,6 +585,15 @@ static inline struct sk_buff *bt_skb_sendmmsg(struct sock *sk, return skb; } +static inline int bt_copy_from_sockptr(void *dst, size_t dst_size, + sockptr_t src, size_t src_size) +{ + if (dst_size > src_size) + return -EINVAL; + + return copy_from_sockptr(dst, src, dst_size); +} + int bt_to_errno(u16 code); __u8 bt_status(int err); diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 43daf965a01e..368e026f4d15 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -824,7 +824,7 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; - int len, err = 0; + int err = 0; struct bt_voice voice; u32 opt; struct bt_codecs *codecs; @@ -843,10 +843,9 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt) set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); @@ -863,11 +862,10 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, voice.setting = sco_pi(sk)->setting; - len = min_t(unsigned int, sizeof(voice), optlen); - if (copy_from_sockptr(&voice, optval, len)) { - err = -EFAULT; + err = bt_copy_from_sockptr(&voice, sizeof(voice), optval, + optlen); + if (err) break; - } /* Explicitly check for these values */ if (voice.setting != BT_VOICE_TRANSPARENT && @@ -890,10 +888,9 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, break; case BT_PKT_STATUS: - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt) set_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); @@ -934,9 +931,9 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(buffer, optval, optlen)) { + err = bt_copy_from_sockptr(buffer, optlen, optval, optlen); + if (err) { hci_dev_put(hdev); - err = -EFAULT; break; } -- cgit v1.2.3 From 5284984a4fbacb0883bfebe905902cdda2891a07 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 10 Apr 2024 18:32:12 +0300 Subject: bug: Fix no-return-statement warning with !CONFIG_BUG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BUG() does not return, and arch implementations of BUG() use unreachable() or other non-returning code. However with !CONFIG_BUG, the default implementation is often used instead, and that does not do that. x86 always uses its own implementation, but powerpc with !CONFIG_BUG gives a build error: kernel/time/timekeeping.c: In function ‘timekeeping_debug_get_ns’: kernel/time/timekeeping.c:286:1: error: no return statement in function returning non-void [-Werror=return-type] Add unreachable() to default !CONFIG_BUG BUG() implementation. Fixes: e8e9d21a5df6 ("timekeeping: Refactor timekeeping helpers") Reported-by: Naresh Kamboju Signed-off-by: Adrian Hunter Signed-off-by: Thomas Gleixner Tested-by: Linux Kernel Functional Testing Link: https://lore.kernel.org/r/20240410153212.127477-1-adrian.hunter@intel.com Closes: https://lore.kernel.org/all/CA+G9fYvjdZCW=7ZGxS6A_3bysjQ56YF7S-+PNLQ_8a4DKh1Bhg@mail.gmail.com/ --- include/asm-generic/bug.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 6e794420bd39..b7de3a4eade1 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -156,7 +156,10 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...); #else /* !CONFIG_BUG */ #ifndef HAVE_ARCH_BUG -#define BUG() do {} while (1) +#define BUG() do { \ + do {} while (1); \ + unreachable(); \ +} while (0) #endif #ifndef HAVE_ARCH_BUG_ON -- cgit v1.2.3 From 211f514ebf1ef5de37b1cf6df9d28a56cfd242ca Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Mon, 11 Mar 2024 09:15:55 -0700 Subject: Drivers: hv: vmbus: Track decrypted status in vmbus_gpadl In CoCo VMs it is possible for the untrusted host to cause set_memory_encrypted() or set_memory_decrypted() to fail such that an error is returned and the resulting memory is shared. Callers need to take care to handle these errors to avoid returning decrypted (shared) memory to the page allocator, which could lead to functional or security issues. In order to make sure callers of vmbus_establish_gpadl() and vmbus_teardown_gpadl() don't return decrypted/shared pages to allocators, add a field in struct vmbus_gpadl to keep track of the decryption status of the buffers. This will allow the callers to know if they should free or leak the pages. Signed-off-by: Rick Edgecombe Signed-off-by: Michael Kelley Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/r/20240311161558.1310-3-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20240311161558.1310-3-mhklinux@outlook.com> --- drivers/hv/channel.c | 25 +++++++++++++++++++++---- include/linux/hyperv.h | 1 + 2 files changed, 22 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index adbf674355b2..98259b492502 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -436,9 +436,18 @@ static int __vmbus_establish_gpadl(struct vmbus_channel *channel, (atomic_inc_return(&vmbus_connection.next_gpadl_handle) - 1); ret = create_gpadl_header(type, kbuffer, size, send_offset, &msginfo); - if (ret) + if (ret) { + gpadl->decrypted = false; return ret; + } + /* + * Set the "decrypted" flag to true for the set_memory_decrypted() + * success case. In the failure case, the encryption state of the + * memory is unknown. Leave "decrypted" as true to ensure the + * memory will be leaked instead of going back on the free list. + */ + gpadl->decrypted = true; ret = set_memory_decrypted((unsigned long)kbuffer, PFN_UP(size)); if (ret) { @@ -527,9 +536,15 @@ cleanup: kfree(msginfo); - if (ret) - set_memory_encrypted((unsigned long)kbuffer, - PFN_UP(size)); + if (ret) { + /* + * If set_memory_encrypted() fails, the decrypted flag is + * left as true so the memory is leaked instead of being + * put back on the free list. + */ + if (!set_memory_encrypted((unsigned long)kbuffer, PFN_UP(size))) + gpadl->decrypted = false; + } return ret; } @@ -850,6 +865,8 @@ post_msg_err: if (ret) pr_warn("Fail to set mem host visibility in GPADL teardown %d.\n", ret); + gpadl->decrypted = ret; + return ret; } EXPORT_SYMBOL_GPL(vmbus_teardown_gpadl); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 6ef0557b4bff..96ceb4095425 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -832,6 +832,7 @@ struct vmbus_gpadl { u32 gpadl_handle; u32 size; void *buffer; + bool decrypted; }; struct vmbus_channel { -- cgit v1.2.3 From a4833e3abae132d613ce7da0e0c9a9465d1681fa Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 10 Apr 2024 12:38:13 -0400 Subject: SUNRPC: Fix rpcgss_context trace event acceptor field The rpcgss_context trace event acceptor field is a dynamically sized string that records the "data" parameter. But this parameter is also dependent on the "len" field to determine the size of the data. It needs to use __string_len() helper macro where the length can be passed in. It also incorrectly uses strncpy() to save it instead of __assign_str(). As these macros can change, it is not wise to open code them in trace events. As of commit c759e609030c ("tracing: Remove __assign_str_len()"), __assign_str() can be used for both __string() and __string_len() fields. Before that commit, __assign_str_len() is required to be used. This needs to be noted for backporting. (In actuality, commit c1fa617caeb0 ("tracing: Rework __assign_str() and __string() to not duplicate getting the string") is the commit that makes __string_str_len() obsolete). Cc: stable@vger.kernel.org Fixes: 0c77668ddb4e ("SUNRPC: Introduce trace points in rpc_auth_gss.ko") Signed-off-by: Steven Rostedt (Google) Signed-off-by: Chuck Lever --- include/trace/events/rpcgss.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/trace/events/rpcgss.h b/include/trace/events/rpcgss.h index ba2d96a1bc2f..f50fcafc69de 100644 --- a/include/trace/events/rpcgss.h +++ b/include/trace/events/rpcgss.h @@ -609,7 +609,7 @@ TRACE_EVENT(rpcgss_context, __field(unsigned int, timeout) __field(u32, window_size) __field(int, len) - __string(acceptor, data) + __string_len(acceptor, data, len) ), TP_fast_assign( @@ -618,7 +618,7 @@ TRACE_EVENT(rpcgss_context, __entry->timeout = timeout; __entry->window_size = window_size; __entry->len = len; - strncpy(__get_str(acceptor), data, len); + __assign_str(acceptor, data); ), TP_printk("win_size=%u expiry=%lu now=%lu timeout=%u acceptor=%.*s", -- cgit v1.2.3 From 29b359cf6d95fd60730533f7f10464e95bd17c73 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 10 Apr 2024 18:50:45 +0200 Subject: netfilter: nft_set_pipapo: walk over current view on netlink dump The generation mask can be updated while netlink dump is in progress. The pipapo set backend walk iterator cannot rely on it to infer what view of the datastructure is to be used. Add notation to specify if user wants to read/update the set. Based on patch from Florian Westphal. Fixes: 2b84e215f874 ("netfilter: nft_set_pipapo: .walk does not deal with generations") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 14 ++++++++++++++ net/netfilter/nf_tables_api.c | 6 ++++++ net/netfilter/nft_set_pipapo.c | 5 +++-- 3 files changed, 23 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index e27c28b612e4..3f1ed467f951 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -307,9 +307,23 @@ static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv) return (void *)priv; } + +/** + * enum nft_iter_type - nftables set iterator type + * + * @NFT_ITER_READ: read-only iteration over set elements + * @NFT_ITER_UPDATE: iteration under mutex to update set element state + */ +enum nft_iter_type { + NFT_ITER_UNSPEC, + NFT_ITER_READ, + NFT_ITER_UPDATE, +}; + struct nft_set; struct nft_set_iter { u8 genmask; + enum nft_iter_type type:8; unsigned int count; unsigned int skip; int err; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f11d0c0a2c73..a7a34db62ea9 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -626,6 +626,7 @@ static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set) { struct nft_set_iter iter = { .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, .fn = nft_mapelem_deactivate, }; @@ -5445,6 +5446,7 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, } iter.genmask = nft_genmask_next(ctx->net); + iter.type = NFT_ITER_UPDATE; iter.skip = 0; iter.count = 0; iter.err = 0; @@ -5518,6 +5520,7 @@ static void nft_map_activate(const struct nft_ctx *ctx, struct nft_set *set) { struct nft_set_iter iter = { .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, .fn = nft_mapelem_activate, }; @@ -5892,6 +5895,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) args.skb = skb; args.reset = dump_ctx->reset; args.iter.genmask = nft_genmask_cur(net); + args.iter.type = NFT_ITER_READ; args.iter.skip = cb->args[0]; args.iter.count = 0; args.iter.err = 0; @@ -7376,6 +7380,7 @@ static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask) { struct nft_set_iter iter = { .genmask = genmask, + .type = NFT_ITER_UPDATE, .fn = nft_setelem_flush, }; @@ -10879,6 +10884,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, continue; iter.genmask = nft_genmask_next(ctx->net); + iter.type = NFT_ITER_UPDATE; iter.skip = 0; iter.count = 0; iter.err = 0; diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index df8de5090246..11e44e4dfb1f 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -2115,13 +2115,14 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter) { struct nft_pipapo *priv = nft_set_priv(set); - struct net *net = read_pnet(&set->net); const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; unsigned int i, r; + WARN_ON_ONCE(iter->type == NFT_ITER_UNSPEC); + rcu_read_lock(); - if (iter->genmask == nft_genmask_cur(net)) + if (iter->type == NFT_ITER_READ) m = rcu_dereference(priv->match); else m = priv->clone; -- cgit v1.2.3 From 87b3593bed1868b2d9fe096c01bcdf0ea86cbebf Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Apr 2024 13:47:33 +0200 Subject: netfilter: flowtable: validate pppoe header Ensure there is sufficient room to access the protocol field of the PPPoe header. Validate it once before the flowtable lookup, then use a helper function to access protocol field. Reported-by: syzbot+b6f07e1c07ef40199081@syzkaller.appspotmail.com Fixes: 72efd585f714 ("netfilter: flowtable: add pppoe support") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 12 +++++++++++- net/netfilter/nf_flow_table_inet.c | 3 ++- net/netfilter/nf_flow_table_ip.c | 8 +++++--- 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index a763dd327c6e..9abb7ee40d72 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -336,7 +336,7 @@ int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow, int nf_flow_table_offload_init(void); void nf_flow_table_offload_exit(void); -static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb) +static inline __be16 __nf_flow_pppoe_proto(const struct sk_buff *skb) { __be16 proto; @@ -352,6 +352,16 @@ static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb) return 0; } +static inline bool nf_flow_pppoe_proto(struct sk_buff *skb, __be16 *inner_proto) +{ + if (!pskb_may_pull(skb, PPPOE_SES_HLEN)) + return false; + + *inner_proto = __nf_flow_pppoe_proto(skb); + + return true; +} + #define NF_FLOW_TABLE_STAT_INC(net, count) __this_cpu_inc((net)->ft.stat->count) #define NF_FLOW_TABLE_STAT_DEC(net, count) __this_cpu_dec((net)->ft.stat->count) #define NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count) \ diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 9505f9d188ff..6eef15648b7b 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -21,7 +21,8 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, proto = veth->h_vlan_encapsulated_proto; break; case htons(ETH_P_PPP_SES): - proto = nf_flow_pppoe_proto(skb); + if (!nf_flow_pppoe_proto(skb, &proto)) + return NF_ACCEPT; break; default: proto = skb->protocol; diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index e45fade76409..9e9e105052da 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -273,10 +273,11 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, return NF_STOLEN; } -static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto, +static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, u32 *offset) { struct vlan_ethhdr *veth; + __be16 inner_proto; switch (skb->protocol) { case htons(ETH_P_8021Q): @@ -287,7 +288,8 @@ static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto, } break; case htons(ETH_P_PPP_SES): - if (nf_flow_pppoe_proto(skb) == proto) { + if (nf_flow_pppoe_proto(skb, &inner_proto) && + inner_proto == proto) { *offset += PPPOE_SES_HLEN; return true; } @@ -316,7 +318,7 @@ static void nf_flow_encap_pop(struct sk_buff *skb, skb_reset_network_header(skb); break; case htons(ETH_P_PPP_SES): - skb->protocol = nf_flow_pppoe_proto(skb); + skb->protocol = __nf_flow_pppoe_proto(skb); skb_pull(skb, PPPOE_SES_HLEN); skb_reset_network_header(skb); break; -- cgit v1.2.3 From ebaed6d4def877d2035786ff318379eb750044c8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 29 Mar 2024 11:29:10 -0700 Subject: peci: linux/peci.h: fix Excess kernel-doc description warning Remove the @controller: line to prevent the kernel-doc warning: include/linux/peci.h:84: warning: Excess struct member 'controller' description in 'peci_device' Signed-off-by: Randy Dunlap Cc: Iwona Winiarska Cc: openbmc@lists.ozlabs.org Reviewed-by: Iwona Winiarska Fixes: 6523d3b2ffa2 ("peci: Add core infrastructure") Cc: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240329182910.29495-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman --- include/linux/peci.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/peci.h b/include/linux/peci.h index 9b3d36aff431..90e241458ef6 100644 --- a/include/linux/peci.h +++ b/include/linux/peci.h @@ -58,7 +58,6 @@ static inline struct peci_controller *to_peci_controller(void *d) /** * struct peci_device - PECI device * @dev: device object to register PECI device to the device model - * @controller: manages the bus segment hosting this PECI device * @info: PECI device characteristics * @info.family: device family * @info.model: device model -- cgit v1.2.3 From 89f9a1e876b5a7ad884918c03a46831af202c8a0 Mon Sep 17 00:00:00 2001 From: Qiang Zhang Date: Sun, 14 Apr 2024 19:49:45 +0800 Subject: bootconfig: use memblock_free_late to free xbc memory to buddy On the time to free xbc memory in xbc_exit(), memblock may has handed over memory to buddy allocator. So it doesn't make sense to free memory back to memblock. memblock_free() called by xbc_exit() even causes UAF bugs on architectures with CONFIG_ARCH_KEEP_MEMBLOCK disabled like x86. Following KASAN logs shows this case. This patch fixes the xbc memory free problem by calling memblock_free() in early xbc init error rewind path and calling memblock_free_late() in xbc exit path to free memory to buddy allocator. [ 9.410890] ================================================================== [ 9.418962] BUG: KASAN: use-after-free in memblock_isolate_range+0x12d/0x260 [ 9.426850] Read of size 8 at addr ffff88845dd30000 by task swapper/0/1 [ 9.435901] CPU: 9 PID: 1 Comm: swapper/0 Tainted: G U 6.9.0-rc3-00208-g586b5dfb51b9 #5 [ 9.446403] Hardware name: Intel Corporation RPLP LP5 (CPU:RaptorLake)/RPLP LP5 (ID:13), BIOS IRPPN02.01.01.00.00.19.015.D-00000000 Dec 28 2023 [ 9.460789] Call Trace: [ 9.463518] [ 9.465859] dump_stack_lvl+0x53/0x70 [ 9.469949] print_report+0xce/0x610 [ 9.473944] ? __virt_addr_valid+0xf5/0x1b0 [ 9.478619] ? memblock_isolate_range+0x12d/0x260 [ 9.483877] kasan_report+0xc6/0x100 [ 9.487870] ? memblock_isolate_range+0x12d/0x260 [ 9.493125] memblock_isolate_range+0x12d/0x260 [ 9.498187] memblock_phys_free+0xb4/0x160 [ 9.502762] ? __pfx_memblock_phys_free+0x10/0x10 [ 9.508021] ? mutex_unlock+0x7e/0xd0 [ 9.512111] ? __pfx_mutex_unlock+0x10/0x10 [ 9.516786] ? kernel_init_freeable+0x2d4/0x430 [ 9.521850] ? __pfx_kernel_init+0x10/0x10 [ 9.526426] xbc_exit+0x17/0x70 [ 9.529935] kernel_init+0x38/0x1e0 [ 9.533829] ? _raw_spin_unlock_irq+0xd/0x30 [ 9.538601] ret_from_fork+0x2c/0x50 [ 9.542596] ? __pfx_kernel_init+0x10/0x10 [ 9.547170] ret_from_fork_asm+0x1a/0x30 [ 9.551552] [ 9.555649] The buggy address belongs to the physical page: [ 9.561875] page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x1 pfn:0x45dd30 [ 9.570821] flags: 0x200000000000000(node=0|zone=2) [ 9.576271] page_type: 0xffffffff() [ 9.580167] raw: 0200000000000000 ffffea0011774c48 ffffea0012ba1848 0000000000000000 [ 9.588823] raw: 0000000000000001 0000000000000000 00000000ffffffff 0000000000000000 [ 9.597476] page dumped because: kasan: bad access detected [ 9.605362] Memory state around the buggy address: [ 9.610714] ffff88845dd2ff00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 9.618786] ffff88845dd2ff80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 9.626857] >ffff88845dd30000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 9.634930] ^ [ 9.638534] ffff88845dd30080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 9.646605] ffff88845dd30100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff [ 9.654675] ================================================================== Link: https://lore.kernel.org/all/20240414114944.1012359-1-qiang4.zhang@linux.intel.com/ Fixes: 40caa127f3c7 ("init: bootconfig: Remove all bootconfig data when the init memory is removed") Cc: Stable@vger.kernel.org Signed-off-by: Qiang Zhang Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- include/linux/bootconfig.h | 7 ++++++- lib/bootconfig.c | 19 +++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index e5ee2c694401..3f4b4ac527ca 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -288,7 +288,12 @@ int __init xbc_init(const char *buf, size_t size, const char **emsg, int *epos); int __init xbc_get_info(int *node_size, size_t *data_size); /* XBC cleanup data structures */ -void __init xbc_exit(void); +void __init _xbc_exit(bool early); + +static inline void xbc_exit(void) +{ + _xbc_exit(false); +} /* XBC embedded bootconfig data in kernel */ #ifdef CONFIG_BOOT_CONFIG_EMBED diff --git a/lib/bootconfig.c b/lib/bootconfig.c index c59d26068a64..8841554432d5 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -61,9 +61,12 @@ static inline void * __init xbc_alloc_mem(size_t size) return memblock_alloc(size, SMP_CACHE_BYTES); } -static inline void __init xbc_free_mem(void *addr, size_t size) +static inline void __init xbc_free_mem(void *addr, size_t size, bool early) { - memblock_free(addr, size); + if (early) + memblock_free(addr, size); + else if (addr) + memblock_free_late(__pa(addr), size); } #else /* !__KERNEL__ */ @@ -73,7 +76,7 @@ static inline void *xbc_alloc_mem(size_t size) return malloc(size); } -static inline void xbc_free_mem(void *addr, size_t size) +static inline void xbc_free_mem(void *addr, size_t size, bool early) { free(addr); } @@ -904,13 +907,13 @@ static int __init xbc_parse_tree(void) * If you need to reuse xbc_init() with new boot config, you can * use this. */ -void __init xbc_exit(void) +void __init _xbc_exit(bool early) { - xbc_free_mem(xbc_data, xbc_data_size); + xbc_free_mem(xbc_data, xbc_data_size, early); xbc_data = NULL; xbc_data_size = 0; xbc_node_num = 0; - xbc_free_mem(xbc_nodes, sizeof(struct xbc_node) * XBC_NODE_MAX); + xbc_free_mem(xbc_nodes, sizeof(struct xbc_node) * XBC_NODE_MAX, early); xbc_nodes = NULL; brace_index = 0; } @@ -963,7 +966,7 @@ int __init xbc_init(const char *data, size_t size, const char **emsg, int *epos) if (!xbc_nodes) { if (emsg) *emsg = "Failed to allocate bootconfig nodes"; - xbc_exit(); + _xbc_exit(true); return -ENOMEM; } memset(xbc_nodes, 0, sizeof(struct xbc_node) * XBC_NODE_MAX); @@ -977,7 +980,7 @@ int __init xbc_init(const char *data, size_t size, const char **emsg, int *epos) *epos = xbc_err_pos; if (emsg) *emsg = xbc_err_msg; - xbc_exit(); + _xbc_exit(true); } else ret = xbc_node_num; -- cgit v1.2.3 From 1382e3b6a3500c245e5278c66d210c02926f804f Mon Sep 17 00:00:00 2001 From: Yuri Benditovich Date: Thu, 11 Apr 2024 08:11:24 +0300 Subject: net: change maximum number of UDP segments to 128 The commit fc8b2a619469 ("net: more strict VIRTIO_NET_HDR_GSO_UDP_L4 validation") adds check of potential number of UDP segments vs UDP_MAX_SEGMENTS in linux/virtio_net.h. After this change certification test of USO guest-to-guest transmit on Windows driver for virtio-net device fails, for example with packet size of ~64K and mss of 536 bytes. In general the USO should not be more restrictive than TSO. Indeed, in case of unreasonably small mss a lot of segments can cause queue overflow and packet loss on the destination. Limit of 128 segments is good for any practical purpose, with minimal meaningful mss of 536 the maximal UDP packet will be divided to ~120 segments. The number of segments for UDP packets is validated vs UDP_MAX_SEGMENTS also in udp.c (v4,v6), this does not affect quest-to-guest path but does affect packets sent to host, for example. It is important to mention that UDP_MAX_SEGMENTS is kernel-only define and not available to user mode socket applications. In order to request MSS smaller than MTU the applications just uses setsockopt with SOL_UDP and UDP_SEGMENT and there is no limitations on socket API level. Fixes: fc8b2a619469 ("net: more strict VIRTIO_NET_HDR_GSO_UDP_L4 validation") Signed-off-by: Yuri Benditovich Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/udp.h | 2 +- tools/testing/selftests/net/udpgso.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/udp.h b/include/linux/udp.h index 17539d089666..e398e1dbd2d3 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -108,7 +108,7 @@ struct udp_sock { #define udp_assign_bit(nr, sk, val) \ assign_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags, val) -#define UDP_MAX_SEGMENTS (1 << 6UL) +#define UDP_MAX_SEGMENTS (1 << 7UL) #define udp_sk(ptr) container_of_const(ptr, struct udp_sock, inet.sk) diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c index 1d975bf52af3..85b3baa3f7f3 100644 --- a/tools/testing/selftests/net/udpgso.c +++ b/tools/testing/selftests/net/udpgso.c @@ -34,7 +34,7 @@ #endif #ifndef UDP_MAX_SEGMENTS -#define UDP_MAX_SEGMENTS (1 << 6UL) +#define UDP_MAX_SEGMENTS (1 << 7UL) #endif #define CONST_MTU_TEST 1500 -- cgit v1.2.3 From fe90f3967bdb3e13f133e5f44025e15f943a99c5 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 15 Apr 2024 11:21:13 -0400 Subject: sched: Add missing memory barrier in switch_mm_cid Many architectures' switch_mm() (e.g. arm64) do not have an smp_mb() which the core scheduler code has depended upon since commit: commit 223baf9d17f25 ("sched: Fix performance regression introduced by mm_cid") If switch_mm() doesn't call smp_mb(), sched_mm_cid_remote_clear() can unset the actively used cid when it fails to observe active task after it sets lazy_put. There *is* a memory barrier between storing to rq->curr and _return to userspace_ (as required by membarrier), but the rseq mm_cid has stricter requirements: the barrier needs to be issued between store to rq->curr and switch_mm_cid(), which happens earlier than: - spin_unlock(), - switch_to(). So it's fine when the architecture switch_mm() happens to have that barrier already, but less so when the architecture only provides the full barrier in switch_to() or spin_unlock(). It is a bug in the rseq switch_mm_cid() implementation. All architectures that don't have memory barriers in switch_mm(), but rather have the full barrier either in finish_lock_switch() or switch_to() have them too late for the needs of switch_mm_cid(). Introduce a new smp_mb__after_switch_mm(), defined as smp_mb() in the generic barrier.h header, and use it in switch_mm_cid() for scheduler transitions where switch_mm() is expected to provide a memory barrier. Architectures can override smp_mb__after_switch_mm() if their switch_mm() implementation provides an implicit memory barrier. Override it with a no-op on x86 which implicitly provide this memory barrier by writing to CR3. Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid") Reported-by: levi.yun Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar Reviewed-by: Catalin Marinas # for arm64 Acked-by: Dave Hansen # for x86 Cc: # 6.4.x Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240415152114.59122-2-mathieu.desnoyers@efficios.com --- arch/x86/include/asm/barrier.h | 3 +++ include/asm-generic/barrier.h | 8 ++++++++ kernel/sched/sched.h | 20 ++++++++++++++------ 3 files changed, 25 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index fe1e7e3cc844..63bdc6b85219 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -79,6 +79,9 @@ do { \ #define __smp_mb__before_atomic() do { } while (0) #define __smp_mb__after_atomic() do { } while (0) +/* Writing to CR3 provides a full memory barrier in switch_mm(). */ +#define smp_mb__after_switch_mm() do { } while (0) + #include #endif /* _ASM_X86_BARRIER_H */ diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 0c0695763bea..d4f581c1e21d 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -294,5 +294,13 @@ do { \ #define io_stop_wc() do { } while (0) #endif +/* + * Architectures that guarantee an implicit smp_mb() in switch_mm() + * can override smp_mb__after_switch_mm. + */ +#ifndef smp_mb__after_switch_mm +# define smp_mb__after_switch_mm() smp_mb() +#endif + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_GENERIC_BARRIER_H */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d2242679239e..ae50f212775e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -79,6 +79,8 @@ # include #endif +#include + #include "cpupri.h" #include "cpudeadline.h" @@ -3445,13 +3447,19 @@ static inline void switch_mm_cid(struct rq *rq, * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu]. * Provide it here. */ - if (!prev->mm) // from kernel + if (!prev->mm) { // from kernel smp_mb(); - /* - * user -> user transition guarantees a memory barrier through - * switch_mm() when current->mm changes. If current->mm is - * unchanged, no barrier is needed. - */ + } else { // from user + /* + * user->user transition relies on an implicit + * memory barrier in switch_mm() when + * current->mm changes. If the architecture + * switch_mm() does not have an implicit memory + * barrier, it is emitted here. If current->mm + * is unchanged, no barrier is needed. + */ + smp_mb__after_switch_mm(); + } } if (prev->mm_cid_active) { mm_cid_snapshot_time(rq, prev->mm); -- cgit v1.2.3 From 07a57a338adb6ec9e766d6a6790f76527f45ceb5 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Sun, 7 Apr 2024 15:05:37 +0200 Subject: mm,swapops: update check in is_pfn_swap_entry for hwpoison entries Tony reported that the Machine check recovery was broken in v6.9-rc1, as he was hitting a VM_BUG_ON when injecting uncorrectable memory errors to DRAM. After some more digging and debugging on his side, he realized that this went back to v6.1, with the introduction of 'commit 0d206b5d2e0d ("mm/swap: add swp_offset_pfn() to fetch PFN from swap entry")'. That commit, among other things, introduced swp_offset_pfn(), replacing hwpoison_entry_to_pfn() in its favour. The patch also introduced a VM_BUG_ON() check for is_pfn_swap_entry(), but is_pfn_swap_entry() never got updated to cover hwpoison entries, which means that we would hit the VM_BUG_ON whenever we would call swp_offset_pfn() for such entries on environments with CONFIG_DEBUG_VM set. Fix this by updating the check to cover hwpoison entries as well, and update the comment while we are it. Link: https://lkml.kernel.org/r/20240407130537.16977-1-osalvador@suse.de Fixes: 0d206b5d2e0d ("mm/swap: add swp_offset_pfn() to fetch PFN from swap entry") Signed-off-by: Oscar Salvador Reported-by: Tony Luck Closes: https://lore.kernel.org/all/Zg8kLSl2yAlA3o5D@agluck-desk3/ Tested-by: Tony Luck Reviewed-by: Peter Xu Reviewed-by: David Hildenbrand Acked-by: Miaohe Lin Cc: [6.1.x] Signed-off-by: Andrew Morton --- include/linux/swapops.h | 65 +++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 48b700ba1d18..a5c560a2f8c2 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -390,6 +390,35 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry) } #endif /* CONFIG_MIGRATION */ +#ifdef CONFIG_MEMORY_FAILURE + +/* + * Support for hardware poisoned pages + */ +static inline swp_entry_t make_hwpoison_entry(struct page *page) +{ + BUG_ON(!PageLocked(page)); + return swp_entry(SWP_HWPOISON, page_to_pfn(page)); +} + +static inline int is_hwpoison_entry(swp_entry_t entry) +{ + return swp_type(entry) == SWP_HWPOISON; +} + +#else + +static inline swp_entry_t make_hwpoison_entry(struct page *page) +{ + return swp_entry(0, 0); +} + +static inline int is_hwpoison_entry(swp_entry_t swp) +{ + return 0; +} +#endif + typedef unsigned long pte_marker; #define PTE_MARKER_UFFD_WP BIT(0) @@ -483,8 +512,9 @@ static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry) /* * A pfn swap entry is a special type of swap entry that always has a pfn stored - * in the swap offset. They are used to represent unaddressable device memory - * and to restrict access to a page undergoing migration. + * in the swap offset. They can either be used to represent unaddressable device + * memory, to restrict access to a page undergoing migration or to represent a + * pfn which has been hwpoisoned and unmapped. */ static inline bool is_pfn_swap_entry(swp_entry_t entry) { @@ -492,7 +522,7 @@ static inline bool is_pfn_swap_entry(swp_entry_t entry) BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS); return is_migration_entry(entry) || is_device_private_entry(entry) || - is_device_exclusive_entry(entry); + is_device_exclusive_entry(entry) || is_hwpoison_entry(entry); } struct page_vma_mapped_walk; @@ -561,35 +591,6 @@ static inline int is_pmd_migration_entry(pmd_t pmd) } #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ -#ifdef CONFIG_MEMORY_FAILURE - -/* - * Support for hardware poisoned pages - */ -static inline swp_entry_t make_hwpoison_entry(struct page *page) -{ - BUG_ON(!PageLocked(page)); - return swp_entry(SWP_HWPOISON, page_to_pfn(page)); -} - -static inline int is_hwpoison_entry(swp_entry_t entry) -{ - return swp_type(entry) == SWP_HWPOISON; -} - -#else - -static inline swp_entry_t make_hwpoison_entry(struct page *page) -{ - return swp_entry(0, 0); -} - -static inline int is_hwpoison_entry(swp_entry_t swp) -{ - return 0; -} -#endif - static inline int non_swap_entry(swp_entry_t entry) { return swp_type(entry) >= MAX_SWAPFILES; -- cgit v1.2.3 From 1f737846aa3c45f07a06fa0d018b39e1afb8084a Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Tue, 9 Apr 2024 17:54:07 +0200 Subject: mm/shmem: inline shmem_is_huge() for disabled transparent hugepages In order to minimize code size (CONFIG_CC_OPTIMIZE_FOR_SIZE=y), compiler might choose to make a regular function call (out-of-line) for shmem_is_huge() instead of inlining it. When transparent hugepages are disabled (CONFIG_TRANSPARENT_HUGEPAGE=n), it can cause compilation error. mm/shmem.c: In function `shmem_getattr': ./include/linux/huge_mm.h:383:27: note: in expansion of macro `BUILD_BUG' 383 | #define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; }) | ^~~~~~~~~ mm/shmem.c:1148:33: note: in expansion of macro `HPAGE_PMD_SIZE' 1148 | stat->blksize = HPAGE_PMD_SIZE; To prevent the possible error, always inline shmem_is_huge() when transparent hugepages are disabled. Link: https://lkml.kernel.org/r/20240409155407.2322714-1-sumanthk@linux.ibm.com Signed-off-by: Sumanth Korikkar Acked-by: David Hildenbrand Cc: Alexander Gordeev Cc: Heiko Carstens Cc: Hugh Dickins Cc: Ilya Leoshkevich Cc: Vasily Gorbik Cc: Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 9 +++++++++ mm/shmem.c | 6 ------ 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a4c15db2f5e5..3fb18f7eb73e 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -110,8 +110,17 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, struct mm_struct *mm, unsigned long vm_flags); +#else +static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, + struct mm_struct *mm, unsigned long vm_flags) +{ + return false; +} +#endif + #ifdef CONFIG_SHMEM extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); #else diff --git a/mm/shmem.c b/mm/shmem.c index 0aad0d9a621b..94ab99b6b574 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -748,12 +748,6 @@ static long shmem_unused_huge_count(struct super_block *sb, #define shmem_huge SHMEM_HUGE_DENY -bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, - struct mm_struct *mm, unsigned long vm_flags) -{ - return false; -} - static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_split) { -- cgit v1.2.3 From 69ffed4b62523bbc85511f150500329d28aba356 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 17 Apr 2024 17:19:13 +0300 Subject: gpiolib: swnode: Remove wrong header inclusion The flags in the software node properties are supposed to be the GPIO lookup flags, which are provided by gpio/machine.h, as the software nodes are the kernel internal thing and doesn't need to rely to any of ABIs. Fixes: e7f9ff5dc90c ("gpiolib: add support for software nodes") Signed-off-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/property.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/gpio/property.h b/include/linux/gpio/property.h index 6c75c8bd44a0..1a14e239221f 100644 --- a/include/linux/gpio/property.h +++ b/include/linux/gpio/property.h @@ -2,7 +2,6 @@ #ifndef __LINUX_GPIO_PROPERTY_H #define __LINUX_GPIO_PROPERTY_H -#include /* for GPIO_* flags */ #include #define PROPERTY_ENTRY_GPIO(_name_, _chip_node_, _idx_, _flags_) \ -- cgit v1.2.3 From 0f022d32c3eca477fbf79a205243a6123ed0fe11 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Apr 2024 18:07:28 -0300 Subject: net/sched: Fix mirred deadlock on device recursion When the mirred action is used on a classful egress qdisc and a packet is mirrored or redirected to self we hit a qdisc lock deadlock. See trace below. [..... other info removed for brevity....] [ 82.890906] [ 82.890906] ============================================ [ 82.890906] WARNING: possible recursive locking detected [ 82.890906] 6.8.0-05205-g77fadd89fe2d-dirty #213 Tainted: G W [ 82.890906] -------------------------------------------- [ 82.890906] ping/418 is trying to acquire lock: [ 82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at: __dev_queue_xmit+0x1778/0x3550 [ 82.890906] [ 82.890906] but task is already holding lock: [ 82.890906] ffff888006994110 (&sch->q.lock){+.-.}-{3:3}, at: __dev_queue_xmit+0x1778/0x3550 [ 82.890906] [ 82.890906] other info that might help us debug this: [ 82.890906] Possible unsafe locking scenario: [ 82.890906] [ 82.890906] CPU0 [ 82.890906] ---- [ 82.890906] lock(&sch->q.lock); [ 82.890906] lock(&sch->q.lock); [ 82.890906] [ 82.890906] *** DEADLOCK *** [ 82.890906] [..... other info removed for brevity....] Example setup (eth0->eth0) to recreate tc qdisc add dev eth0 root handle 1: htb default 30 tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \ action mirred egress redirect dev eth0 Another example(eth0->eth1->eth0) to recreate tc qdisc add dev eth0 root handle 1: htb default 30 tc filter add dev eth0 handle 1: protocol ip prio 2 matchall \ action mirred egress redirect dev eth1 tc qdisc add dev eth1 root handle 1: htb default 30 tc filter add dev eth1 handle 1: protocol ip prio 2 matchall \ action mirred egress redirect dev eth0 We fix this by adding an owner field (CPU id) to struct Qdisc set after root qdisc is entered. When the softirq enters it a second time, if the qdisc owner is the same CPU, the packet is dropped to break the loop. Reported-by: Mingshuai Ren Closes: https://lore.kernel.org/netdev/20240314111713.5979-1-renmingshuai@huawei.com/ Fixes: 3bcb846ca4cf ("net: get rid of spin_trylock() in net_tx_action()") Fixes: e578d9c02587 ("net: sched: use counter to break reclassify loops") Signed-off-by: Eric Dumazet Reviewed-by: Victor Nogueira Reviewed-by: Pedro Tammela Tested-by: Jamal Hadi Salim Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20240415210728.36949-1-victor@mojatatu.com Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 1 + net/core/dev.c | 6 ++++++ net/sched/sch_generic.c | 1 + 3 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index cefe0c4bdae3..41ca14e81d55 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -117,6 +117,7 @@ struct Qdisc { struct qdisc_skb_head q; struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; + int owner; unsigned long state; unsigned long state2; /* must be written under qdisc spinlock */ struct Qdisc *next_sched; diff --git a/net/core/dev.c b/net/core/dev.c index 984ff8b9d0e1..331848eca7d3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3775,6 +3775,10 @@ no_lock_out: return rc; } + if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) { + kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP); + return NET_XMIT_DROP; + } /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. @@ -3814,7 +3818,9 @@ no_lock_out: qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { + WRITE_ONCE(q->owner, smp_processor_id()); rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + WRITE_ONCE(q->owner, -1); if (qdisc_run_begin(q)) { if (unlikely(contended)) { spin_unlock(&q->busylock); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index ff5336493777..4a2c763e2d11 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -974,6 +974,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; + sch->owner = -1; netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL); refcount_set(&sch->refcnt, 1); -- cgit v1.2.3 From 752863bddacab6b5c5164b1df8c8b2e3a175ee28 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 17 Apr 2024 16:47:43 +0200 Subject: block: propagate partition scanning errors to the BLKRRPART ioctl Commit 4601b4b130de ("block: reopen the device in blkdev_reread_part") lost the propagation of I/O errors from the low-level read of the partition table to the user space caller of the BLKRRPART. Apparently some user space relies on, so restore the propagation. This isn't exactly pretty as other block device open calls explicitly do not are about these errors, so add a new BLK_OPEN_STRICT_SCAN to opt into the error propagation. Fixes: 4601b4b130de ("block: reopen the device in blkdev_reread_part") Reported-by: Saranya Muruganandam Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Shin'ichiro Kawasaki Tested-by: Shin'ichiro Kawasaki Link: https://lore.kernel.org/r/20240417144743.2277601-1-hch@lst.de Signed-off-by: Jens Axboe --- block/bdev.c | 29 +++++++++++++++++++---------- block/ioctl.c | 3 ++- include/linux/blkdev.h | 2 ++ 3 files changed, 23 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/block/bdev.c b/block/bdev.c index 7a5f611c3d2e..cea51dca8753 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -652,6 +652,14 @@ static void blkdev_flush_mapping(struct block_device *bdev) bdev_write_inode(bdev); } +static void blkdev_put_whole(struct block_device *bdev) +{ + if (atomic_dec_and_test(&bdev->bd_openers)) + blkdev_flush_mapping(bdev); + if (bdev->bd_disk->fops->release) + bdev->bd_disk->fops->release(bdev->bd_disk); +} + static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode) { struct gendisk *disk = bdev->bd_disk; @@ -670,20 +678,21 @@ static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode) if (!atomic_read(&bdev->bd_openers)) set_init_blocksize(bdev); - if (test_bit(GD_NEED_PART_SCAN, &disk->state)) - bdev_disk_changed(disk, false); atomic_inc(&bdev->bd_openers); + if (test_bit(GD_NEED_PART_SCAN, &disk->state)) { + /* + * Only return scanning errors if we are called from contexts + * that explicitly want them, e.g. the BLKRRPART ioctl. + */ + ret = bdev_disk_changed(disk, false); + if (ret && (mode & BLK_OPEN_STRICT_SCAN)) { + blkdev_put_whole(bdev); + return ret; + } + } return 0; } -static void blkdev_put_whole(struct block_device *bdev) -{ - if (atomic_dec_and_test(&bdev->bd_openers)) - blkdev_flush_mapping(bdev); - if (bdev->bd_disk->fops->release) - bdev->bd_disk->fops->release(bdev->bd_disk); -} - static int blkdev_get_part(struct block_device *part, blk_mode_t mode) { struct gendisk *disk = part->bd_disk; diff --git a/block/ioctl.c b/block/ioctl.c index a9028a2c2db5..f505f9c341eb 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -563,7 +563,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode, return -EACCES; if (bdev_is_partition(bdev)) return -EINVAL; - return disk_scan_partitions(bdev->bd_disk, mode); + return disk_scan_partitions(bdev->bd_disk, + mode | BLK_OPEN_STRICT_SCAN); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACETEARDOWN: diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c3e8f7cf96be..d16320852c4b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -128,6 +128,8 @@ typedef unsigned int __bitwise blk_mode_t; #define BLK_OPEN_WRITE_IOCTL ((__force blk_mode_t)(1 << 4)) /* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */ #define BLK_OPEN_RESTRICT_WRITES ((__force blk_mode_t)(1 << 5)) +/* return partition scanning errors */ +#define BLK_OPEN_STRICT_SCAN ((__force blk_mode_t)(1 << 6)) struct gendisk { /* -- cgit v1.2.3