From d97708701434ce72968e771976aaf9d3438fcafd Mon Sep 17 00:00:00 2001 From: Matt Evans Date: Wed, 15 Apr 2026 11:17:52 -0700 Subject: vfio/pci: Clean up DMABUFs before disabling function On device shutdown, make vfio_pci_core_close_device() call vfio_pci_dma_buf_cleanup() before the function is disabled via vfio_pci_core_disable(). This ensures that all access via DMABUFs is revoked before the function's BARs become inaccessible. This fixes an issue where, if the function is disabled first, a tiny window exists in which the function's MSE is cleared and yet BARs could still be accessed via the DMABUF. The resources would also be freed and up for grabs by a different driver. Fixes: 5d74781ebc86c ("vfio/pci: Add dma-buf export support for MMIO regions") Signed-off-by: Matt Evans Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20260415181752.1027604-1-mattev@meta.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index ad52abc46c04..3f8d093aacf8 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -734,10 +734,10 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) #if IS_ENABLED(CONFIG_EEH) eeh_dev_release(vdev->pdev); #endif - vfio_pci_core_disable(vdev); - vfio_pci_dma_buf_cleanup(vdev); + vfio_pci_core_disable(vdev); + mutex_lock(&vdev->igate); vfio_pci_eventfd_replace_locked(vdev, &vdev->err_trigger, NULL); vfio_pci_eventfd_replace_locked(vdev, &vdev->req_trigger, NULL); -- cgit v1.2.3 From 903570835f12b7436ca0edb0a9ed351c0349121e Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 14 Apr 2026 14:06:19 -0600 Subject: vfio/virtio: Convert list_lock from spinlock to mutex The list_lock spinlock with IRQ disabling was copied from the mlx5 vfio-pci variant driver, where it is justified by a hardirq async command completion callback that accesses the protected lists. The virtio driver has no such interrupt context usage; all list_lock acquisitions occur in process context via file read/write operations or state transitions under state_mutex. Convert list_lock to a mutex to be consistent with peer vfio-pci variant drivers (hisilicon, pds, qat, xe) which all use mutexes for equivalent migration data protection. This also fixes a mismatched spin_lock()/spin_unlock_irq() pair in virtiovf_read_device_context_chunk() that could incorrectly enable interrupts. Reported-by: Jinhui Guo Closes: https://lore.kernel.org/all/20260413073603.30538-1-guojinhui.liam@bytedance.com Fixes: 0bbc82e4ec79 ("vfio/virtio: Add support for the basic live migration functionality") Cc: stable@vger.kernel.org Assisted-by: Claude:claude-opus-4-6 Signed-off-by: Alex Williamson Reviewed-by: Yishai Hadas Link: https://lore.kernel.org/r/20260414200625.3601509-2-alex.williamson@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/virtio/common.h | 2 +- drivers/vfio/pci/virtio/migrate.c | 33 +++++++++++++++++---------------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/drivers/vfio/pci/virtio/common.h b/drivers/vfio/pci/virtio/common.h index cb3d5e57d3a3..3ccbd49e6abe 100644 --- a/drivers/vfio/pci/virtio/common.h +++ b/drivers/vfio/pci/virtio/common.h @@ -68,7 +68,7 @@ struct virtiovf_migration_file { enum virtiovf_migf_state state; enum virtiovf_load_state load_state; /* synchronize access to the lists */ - spinlock_t list_lock; + struct mutex list_lock; struct list_head buf_list; struct list_head avail_list; struct virtiovf_data_buffer *buf; diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c index 7e11834ad512..ac35e5881741 100644 --- a/drivers/vfio/pci/virtio/migrate.c +++ b/drivers/vfio/pci/virtio/migrate.c @@ -142,9 +142,9 @@ end: static void virtiovf_put_data_buffer(struct virtiovf_data_buffer *buf) { - spin_lock_irq(&buf->migf->list_lock); + mutex_lock(&buf->migf->list_lock); list_add_tail(&buf->buf_elm, &buf->migf->avail_list); - spin_unlock_irq(&buf->migf->list_lock); + mutex_unlock(&buf->migf->list_lock); } static int @@ -170,21 +170,21 @@ virtiovf_get_data_buffer(struct virtiovf_migration_file *migf, size_t length) INIT_LIST_HEAD(&free_list); - spin_lock_irq(&migf->list_lock); + mutex_lock(&migf->list_lock); list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) { list_del_init(&buf->buf_elm); if (buf->allocated_length >= length) { - spin_unlock_irq(&migf->list_lock); + mutex_unlock(&migf->list_lock); goto found; } /* * Prevent holding redundant buffers. Put in a free - * list and call at the end not under the spin lock + * list and call at the end not under the mutex * (&migf->list_lock) to minimize its scope usage. */ list_add(&buf->buf_elm, &free_list); } - spin_unlock_irq(&migf->list_lock); + mutex_unlock(&migf->list_lock); buf = virtiovf_alloc_data_buffer(migf, length); found: @@ -295,6 +295,7 @@ static int virtiovf_release_file(struct inode *inode, struct file *filp) struct virtiovf_migration_file *migf = filp->private_data; virtiovf_disable_fd(migf); + mutex_destroy(&migf->list_lock); mutex_destroy(&migf->lock); kfree(migf); return 0; @@ -308,7 +309,7 @@ virtiovf_get_data_buff_from_pos(struct virtiovf_migration_file *migf, bool found = false; *end_of_data = false; - spin_lock_irq(&migf->list_lock); + mutex_lock(&migf->list_lock); if (list_empty(&migf->buf_list)) { *end_of_data = true; goto end; @@ -329,7 +330,7 @@ virtiovf_get_data_buff_from_pos(struct virtiovf_migration_file *migf, migf->state = VIRTIOVF_MIGF_STATE_ERROR; end: - spin_unlock_irq(&migf->list_lock); + mutex_unlock(&migf->list_lock); return found ? buf : NULL; } @@ -369,10 +370,10 @@ static ssize_t virtiovf_buf_read(struct virtiovf_data_buffer *vhca_buf, } if (*pos >= vhca_buf->start_pos + vhca_buf->length) { - spin_lock_irq(&vhca_buf->migf->list_lock); + mutex_lock(&vhca_buf->migf->list_lock); list_del_init(&vhca_buf->buf_elm); list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list); - spin_unlock_irq(&vhca_buf->migf->list_lock); + mutex_unlock(&vhca_buf->migf->list_lock); } return done; @@ -549,9 +550,9 @@ virtiovf_add_buf_header(struct virtiovf_data_buffer *header_buf, header_buf->length = sizeof(header); header_buf->start_pos = header_buf->migf->max_pos; migf->max_pos += header_buf->length; - spin_lock_irq(&migf->list_lock); + mutex_lock(&migf->list_lock); list_add_tail(&header_buf->buf_elm, &migf->buf_list); - spin_unlock_irq(&migf->list_lock); + mutex_unlock(&migf->list_lock); return 0; } @@ -616,9 +617,9 @@ virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf, buf->start_pos = buf->migf->max_pos; migf->max_pos += buf->length; - spin_lock(&migf->list_lock); + mutex_lock(&migf->list_lock); list_add_tail(&buf->buf_elm, &migf->buf_list); - spin_unlock_irq(&migf->list_lock); + mutex_unlock(&migf->list_lock); return 0; out_header: @@ -687,7 +688,7 @@ virtiovf_pci_save_device_data(struct virtiovf_pci_core_device *virtvdev, mutex_init(&migf->lock); INIT_LIST_HEAD(&migf->buf_list); INIT_LIST_HEAD(&migf->avail_list); - spin_lock_init(&migf->list_lock); + mutex_init(&migf->list_lock); migf->virtvdev = virtvdev; lockdep_assert_held(&virtvdev->state_mutex); @@ -1077,7 +1078,7 @@ virtiovf_pci_resume_device_data(struct virtiovf_pci_core_device *virtvdev) mutex_init(&migf->lock); INIT_LIST_HEAD(&migf->buf_list); INIT_LIST_HEAD(&migf->avail_list); - spin_lock_init(&migf->list_lock); + mutex_init(&migf->list_lock); buf = virtiovf_alloc_data_buffer(migf, VIRTIOVF_TARGET_INITIAL_BUF_SIZE); if (IS_ERR(buf)) { -- cgit v1.2.3 From 61fcb51fc9d576ec367e8aea9c03dc6a746e395e Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 14 Apr 2026 14:06:20 -0600 Subject: vfio/virtio: Use guard() for list_lock where applicable Convert list_lock mutex acquisitions to use guard() and scoped_guard() where the lock scope aligns with the function or block scope. This simplifies virtiovf_get_data_buff_from_pos() by replacing goto-based unwinding with direct returns. Assisted-by: Claude:claude-opus-4-6 Signed-off-by: Alex Williamson Reviewed-by: Yishai Hadas Link: https://lore.kernel.org/r/20260414200625.3601509-3-alex.williamson@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/virtio/migrate.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c index ac35e5881741..601ef4865ed4 100644 --- a/drivers/vfio/pci/virtio/migrate.c +++ b/drivers/vfio/pci/virtio/migrate.c @@ -142,9 +142,8 @@ end: static void virtiovf_put_data_buffer(struct virtiovf_data_buffer *buf) { - mutex_lock(&buf->migf->list_lock); + guard(mutex)(&buf->migf->list_lock); list_add_tail(&buf->buf_elm, &buf->migf->avail_list); - mutex_unlock(&buf->migf->list_lock); } static int @@ -306,32 +305,27 @@ virtiovf_get_data_buff_from_pos(struct virtiovf_migration_file *migf, loff_t pos, bool *end_of_data) { struct virtiovf_data_buffer *buf; - bool found = false; *end_of_data = false; - mutex_lock(&migf->list_lock); + guard(mutex)(&migf->list_lock); + if (list_empty(&migf->buf_list)) { *end_of_data = true; - goto end; + return NULL; } buf = list_first_entry(&migf->buf_list, struct virtiovf_data_buffer, buf_elm); if (pos >= buf->start_pos && - pos < buf->start_pos + buf->length) { - found = true; - goto end; - } + pos < buf->start_pos + buf->length) + return buf; /* * As we use a stream based FD we may expect having the data always * on first chunk */ migf->state = VIRTIOVF_MIGF_STATE_ERROR; - -end: - mutex_unlock(&migf->list_lock); - return found ? buf : NULL; + return NULL; } static ssize_t virtiovf_buf_read(struct virtiovf_data_buffer *vhca_buf, @@ -370,10 +364,9 @@ static ssize_t virtiovf_buf_read(struct virtiovf_data_buffer *vhca_buf, } if (*pos >= vhca_buf->start_pos + vhca_buf->length) { - mutex_lock(&vhca_buf->migf->list_lock); + guard(mutex)(&vhca_buf->migf->list_lock); list_del_init(&vhca_buf->buf_elm); list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list); - mutex_unlock(&vhca_buf->migf->list_lock); } return done; @@ -550,9 +543,10 @@ virtiovf_add_buf_header(struct virtiovf_data_buffer *header_buf, header_buf->length = sizeof(header); header_buf->start_pos = header_buf->migf->max_pos; migf->max_pos += header_buf->length; - mutex_lock(&migf->list_lock); - list_add_tail(&header_buf->buf_elm, &migf->buf_list); - mutex_unlock(&migf->list_lock); + + scoped_guard(mutex, &migf->list_lock) + list_add_tail(&header_buf->buf_elm, &migf->buf_list); + return 0; } @@ -617,9 +611,10 @@ virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf, buf->start_pos = buf->migf->max_pos; migf->max_pos += buf->length; - mutex_lock(&migf->list_lock); - list_add_tail(&buf->buf_elm, &migf->buf_list); - mutex_unlock(&migf->list_lock); + + scoped_guard(mutex, &migf->list_lock) + list_add_tail(&buf->buf_elm, &migf->buf_list); + return 0; out_header: -- cgit v1.2.3 From b5b268cb7868b598e53eeebd36174c8b27d4cd86 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 14 Apr 2026 14:06:21 -0600 Subject: vfio/virtio: Use guard() for migf->lock where applicable Convert migf->lock acquisitions in virtiovf_disable_fd() and virtiovf_save_read() to use guard(). In virtiovf_save_read() this eliminates the out_unlock label and multiple goto paths by allowing direct returns, and removes the need for the done variable to double as an error carrier. Assisted-by: Claude:claude-opus-4-6 Signed-off-by: Alex Williamson Reviewed-by: Yishai Hadas Link: https://lore.kernel.org/r/20260414200625.3601509-4-alex.williamson@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/virtio/migrate.c | 40 ++++++++++++++------------------------- 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c index 601ef4865ed4..4b1e5502f0a7 100644 --- a/drivers/vfio/pci/virtio/migrate.c +++ b/drivers/vfio/pci/virtio/migrate.c @@ -224,10 +224,9 @@ static void virtiovf_clean_migf_resources(struct virtiovf_migration_file *migf) static void virtiovf_disable_fd(struct virtiovf_migration_file *migf) { - mutex_lock(&migf->lock); + guard(mutex)(&migf->lock); migf->state = VIRTIOVF_MIGF_STATE_ERROR; migf->filp->f_pos = 0; - mutex_unlock(&migf->lock); } static void virtiovf_disable_fds(struct virtiovf_pci_core_device *virtvdev) @@ -385,11 +384,10 @@ static ssize_t virtiovf_save_read(struct file *filp, char __user *buf, size_t le return -ESPIPE; pos = &filp->f_pos; - mutex_lock(&migf->lock); - if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) { - done = -ENODEV; - goto out_unlock; - } + guard(mutex)(&migf->lock); + + if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) + return -ENODEV; while (len) { ssize_t count; @@ -398,34 +396,24 @@ static ssize_t virtiovf_save_read(struct file *filp, char __user *buf, size_t le if (first_loop_call) { first_loop_call = false; /* Temporary end of file as part of PRE_COPY */ - if (end_of_data && migf->state == VIRTIOVF_MIGF_STATE_PRECOPY) { - done = -ENOMSG; - goto out_unlock; - } - if (end_of_data && migf->state != VIRTIOVF_MIGF_STATE_COMPLETE) { - done = -EINVAL; - goto out_unlock; - } + if (end_of_data && migf->state == VIRTIOVF_MIGF_STATE_PRECOPY) + return -ENOMSG; + if (end_of_data && migf->state != VIRTIOVF_MIGF_STATE_COMPLETE) + return -EINVAL; } if (end_of_data) - goto out_unlock; + return done; - if (!vhca_buf) { - done = -EINVAL; - goto out_unlock; - } + if (!vhca_buf) + return -EINVAL; count = virtiovf_buf_read(vhca_buf, &buf, &len, pos); - if (count < 0) { - done = count; - goto out_unlock; - } + if (count < 0) + return count; done += count; } -out_unlock: - mutex_unlock(&migf->lock); return done; } -- cgit v1.2.3 From b0eab97305ae97190a605117095d84f12ecef187 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 14 Apr 2026 14:06:22 -0600 Subject: vfio/virtio: Use guard() for bar_mutex in legacy I/O Convert the bar_mutex acquisition in virtiovf_issue_legacy_rw_cmd() to use guard(), eliminating the out label and goto-based error paths in favor of direct returns. Assisted-by: Claude:claude-opus-4-6 Signed-off-by: Alex Williamson Reviewed-by: Yishai Hadas Link: https://lore.kernel.org/r/20260414200625.3601509-5-alex.williamson@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/virtio/legacy_io.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/vfio/pci/virtio/legacy_io.c b/drivers/vfio/pci/virtio/legacy_io.c index 1ed349a55629..f022301e60d6 100644 --- a/drivers/vfio/pci/virtio/legacy_io.c +++ b/drivers/vfio/pci/virtio/legacy_io.c @@ -34,7 +34,9 @@ virtiovf_issue_legacy_rw_cmd(struct virtiovf_pci_core_device *virtvdev, common = pos < VIRTIO_PCI_CONFIG_OFF(msix_enabled); /* offset within the relevant configuration area */ offset = common ? pos : pos - VIRTIO_PCI_CONFIG_OFF(msix_enabled); - mutex_lock(&virtvdev->bar_mutex); + + guard(mutex)(&virtvdev->bar_mutex); + if (read) { if (common) ret = virtio_pci_admin_legacy_common_io_read(pdev, offset, @@ -43,14 +45,12 @@ virtiovf_issue_legacy_rw_cmd(struct virtiovf_pci_core_device *virtvdev, ret = virtio_pci_admin_legacy_device_io_read(pdev, offset, count, bar0_buf + pos); if (ret) - goto out; + return ret; if (copy_to_user(buf, bar0_buf + pos, count)) - ret = -EFAULT; + return -EFAULT; } else { - if (copy_from_user(bar0_buf + pos, buf, count)) { - ret = -EFAULT; - goto out; - } + if (copy_from_user(bar0_buf + pos, buf, count)) + return -EFAULT; if (common) ret = virtio_pci_admin_legacy_common_io_write(pdev, offset, @@ -59,8 +59,7 @@ virtiovf_issue_legacy_rw_cmd(struct virtiovf_pci_core_device *virtvdev, ret = virtio_pci_admin_legacy_device_io_write(pdev, offset, count, bar0_buf + pos); } -out: - mutex_unlock(&virtvdev->bar_mutex); + return ret; } -- cgit v1.2.3 From 64965b8a4274b82330433fe8888d999506a81a94 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 17 Apr 2026 09:28:12 -0600 Subject: vfio: replace vfio->device_class with a const struct class The class_create() call has been deprecated in favor of class_register() as the driver core now allows for a struct class to be in read-only memory. Replace vfio->device_class with a const struct class and drop the class_create() call. Compile tested with both CONFIG_VFIO_DEVICE_CDEV on and off (and CONFIG_VFIO on); found no errors/warns in dmesg. Link: https://lore.kernel.org/all/2023040244-duffel-pushpin-f738@gregkh/ Suggested-by: Greg Kroah-Hartman Signed-off-by: Jori Koolstra [Remove unused vfio_cdev_init() args] Signed-off-by: Alex Williamson Link: https://lore.kernel.org/r/20260417152814.18026-1-alex.williamson@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/device_cdev.c | 8 +------- drivers/vfio/vfio.h | 4 ++-- drivers/vfio/vfio_main.c | 27 ++++++++++++++++----------- 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c index 8ceca24ac136..54abf312cf04 100644 --- a/drivers/vfio/device_cdev.c +++ b/drivers/vfio/device_cdev.c @@ -293,14 +293,8 @@ int vfio_df_ioctl_detach_pt(struct vfio_device_file *df, return 0; } -static char *vfio_device_devnode(const struct device *dev, umode_t *mode) +int vfio_cdev_init(void) { - return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev)); -} - -int vfio_cdev_init(struct class *device_class) -{ - device_class->devnode = vfio_device_devnode; return alloc_chrdev_region(&device_devt, 0, MINORMASK + 1, "vfio-dev"); } diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index 0854f3fa1a22..e4b72e79b7e3 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -377,7 +377,7 @@ int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep); long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df, struct vfio_device_bind_iommufd __user *arg); void vfio_df_unbind_iommufd(struct vfio_device_file *df); -int vfio_cdev_init(struct class *device_class); +int vfio_cdev_init(void); void vfio_cdev_cleanup(void); #else static inline void vfio_init_device_cdev(struct vfio_device *device) @@ -410,7 +410,7 @@ static inline void vfio_df_unbind_iommufd(struct vfio_device_file *df) { } -static inline int vfio_cdev_init(struct class *device_class) +static inline int vfio_cdev_init(void) { return 0; } diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 8666f35fb3f0..6222376ab6ab 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -49,7 +49,6 @@ #define VFIO_MAGIC 0x5646494f /* "VFIO" */ static struct vfio { - struct class *device_class; struct ida device_ida; struct vfsmount *vfs_mount; int fs_count; @@ -64,6 +63,16 @@ MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. Thi static DEFINE_XARRAY(vfio_device_set_xa); +static char *vfio_device_devnode(const struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev)); +} + +static const struct class vfio_device_class = { + .name = "vfio-dev", + .devnode = vfio_device_devnode +}; + int vfio_assign_device_set(struct vfio_device *device, void *set_id) { unsigned long idx = (unsigned long)set_id; @@ -299,7 +308,7 @@ static int vfio_init_device(struct vfio_device *device, struct device *dev, device_initialize(&device->device); device->device.release = vfio_device_release; - device->device.class = vfio.device_class; + device->device.class = &vfio_device_class; device->device.parent = device->dev; return 0; @@ -1804,13 +1813,11 @@ static int __init vfio_init(void) goto err_virqfd; /* /sys/class/vfio-dev/vfioX */ - vfio.device_class = class_create("vfio-dev"); - if (IS_ERR(vfio.device_class)) { - ret = PTR_ERR(vfio.device_class); + ret = class_register(&vfio_device_class); + if (ret) goto err_dev_class; - } - ret = vfio_cdev_init(vfio.device_class); + ret = vfio_cdev_init(); if (ret) goto err_alloc_dev_chrdev; @@ -1819,8 +1826,7 @@ static int __init vfio_init(void) return 0; err_alloc_dev_chrdev: - class_destroy(vfio.device_class); - vfio.device_class = NULL; + class_unregister(&vfio_device_class); err_dev_class: vfio_virqfd_exit(); err_virqfd: @@ -1833,8 +1839,7 @@ static void __exit vfio_cleanup(void) vfio_debugfs_remove_root(); ida_destroy(&vfio.device_ida); vfio_cdev_cleanup(); - class_destroy(vfio.device_class); - vfio.device_class = NULL; + class_unregister(&vfio_device_class); vfio_virqfd_exit(); vfio_group_cleanup(); xa_destroy(&vfio_device_set_xa); -- cgit v1.2.3 From 5ea5880764cbb164afb17a62e76ca75dc371409d Mon Sep 17 00:00:00 2001 From: Prasanna Kumar T S M Date: Fri, 17 Apr 2026 14:27:56 -0600 Subject: vfio/cdx: Fix NULL pointer dereference in interrupt trigger path Add validation to ensure MSI is configured before accessing cdx_irqs array in vfio_cdx_set_msi_trigger(). Without this check, userspace can trigger a NULL pointer dereference by calling VFIO_DEVICE_SET_IRQS with VFIO_IRQ_SET_DATA_BOOL or VFIO_IRQ_SET_DATA_NONE flags before ever setting up interrupts via VFIO_IRQ_SET_DATA_EVENTFD. The vfio_cdx_msi_enable() function allocates the cdx_irqs array and sets config_msi to 1 only when called through the EVENTFD path. The trigger loop (for DATA_BOOL/DATA_NONE) assumed this had already been done, but there was no enforcement of this call ordering. This matches the protection used in the PCI VFIO driver where vfio_pci_set_msi_trigger() checks irq_is() before the trigger loop. Fixes: 848e447e000c ("vfio/cdx: add interrupt support") Cc: stable@vger.kernel.org Signed-off-by: Prasanna Kumar T S M Acked-by: Nipun Gupta Signed-off-by: Alex Williamson Acked-by: Nikhil Agarwal Link: https://lore.kernel.org/r/20260417202800.88287-2-alex.williamson@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/cdx/intr.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/vfio/cdx/intr.c b/drivers/vfio/cdx/intr.c index 8f4402cec9c5..c0eed065e8ef 100644 --- a/drivers/vfio/cdx/intr.c +++ b/drivers/vfio/cdx/intr.c @@ -175,6 +175,10 @@ static int vfio_cdx_set_msi_trigger(struct vfio_cdx_device *vdev, return ret; } + /* Ensure MSI is configured before accessing cdx_irqs */ + if (!vdev->config_msi) + return -EINVAL; + for (i = start; i < start + count; i++) { if (!vdev->cdx_irqs[i].trigger) continue; -- cgit v1.2.3 From 670e8864b1a218d72f08db40d0103adf38fa1d9b Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 17 Apr 2026 14:27:57 -0600 Subject: vfio/cdx: Serialize VFIO_DEVICE_SET_IRQS with a per-device mutex vfio_cdx_set_msi_trigger() reads vdev->config_msi and operates on the vdev->cdx_irqs array based on its value, but provides no serialization against concurrent VFIO_DEVICE_SET_IRQS ioctls. Two callers can race such that one observes config_msi as set while another clears it and frees cdx_irqs via vfio_cdx_msi_disable(), resulting in a use-after-free of the cdx_irqs array. Add a cdx_irqs_lock mutex to struct vfio_cdx_device and acquire it in vfio_cdx_set_msi_trigger(), which is the single chokepoint through which all updates to config_msi, cdx_irqs, and msi_count flow, covering both the ioctl path and the close-device cleanup path. This keeps the test of config_msi atomic with the subsequent enable, disable, or trigger operations. Drop the pre-call !cdx_irqs test from vfio_cdx_irqs_cleanup() as part of this change: the optimization it provided is redundant with the !config_msi early-return inside vfio_cdx_msi_disable(), and leaving the test in place would be an unsynchronized read of state the new lock is meant to protect. Fixes: 848e447e000c ("vfio/cdx: add interrupt support") Cc: stable@vger.kernel.org Assisted-by: Claude:claude-opus-4-7 Signed-off-by: Alex Williamson Acked-by: Nikhil Agarwal Link: https://lore.kernel.org/r/20260417202800.88287-3-alex.williamson@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/cdx/intr.c | 9 ++------- drivers/vfio/cdx/main.c | 19 +++++++++++++++++++ drivers/vfio/cdx/private.h | 3 +++ 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/drivers/vfio/cdx/intr.c b/drivers/vfio/cdx/intr.c index c0eed065e8ef..6dfe0ced3bdd 100644 --- a/drivers/vfio/cdx/intr.c +++ b/drivers/vfio/cdx/intr.c @@ -152,6 +152,8 @@ static int vfio_cdx_set_msi_trigger(struct vfio_cdx_device *vdev, if (start + count > cdx_dev->num_msi) return -EINVAL; + guard(mutex)(&vdev->cdx_irqs_lock); + if (!count && (flags & VFIO_IRQ_SET_DATA_NONE)) { vfio_cdx_msi_disable(vdev); return 0; @@ -210,12 +212,5 @@ int vfio_cdx_set_irqs_ioctl(struct vfio_cdx_device *vdev, /* Free All IRQs for the given device */ void vfio_cdx_irqs_cleanup(struct vfio_cdx_device *vdev) { - /* - * Device does not support any interrupt or the interrupts - * were not configured - */ - if (!vdev->cdx_irqs) - return; - vfio_cdx_set_msi_trigger(vdev, 0, 0, 0, VFIO_IRQ_SET_DATA_NONE, NULL); } diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c index 8ab97405b2bd..b31ed4be7bdc 100644 --- a/drivers/vfio/cdx/main.c +++ b/drivers/vfio/cdx/main.c @@ -8,6 +8,23 @@ #include "private.h" +static int vfio_cdx_init_dev(struct vfio_device *core_vdev) +{ + struct vfio_cdx_device *vdev = + container_of(core_vdev, struct vfio_cdx_device, vdev); + + mutex_init(&vdev->cdx_irqs_lock); + return 0; +} + +static void vfio_cdx_release_dev(struct vfio_device *core_vdev) +{ + struct vfio_cdx_device *vdev = + container_of(core_vdev, struct vfio_cdx_device, vdev); + + mutex_destroy(&vdev->cdx_irqs_lock); +} + static int vfio_cdx_open_device(struct vfio_device *core_vdev) { struct vfio_cdx_device *vdev = @@ -273,6 +290,8 @@ static int vfio_cdx_mmap(struct vfio_device *core_vdev, static const struct vfio_device_ops vfio_cdx_ops = { .name = "vfio-cdx", + .init = vfio_cdx_init_dev, + .release = vfio_cdx_release_dev, .open_device = vfio_cdx_open_device, .close_device = vfio_cdx_close_device, .ioctl = vfio_cdx_ioctl, diff --git a/drivers/vfio/cdx/private.h b/drivers/vfio/cdx/private.h index 172e48caa3a0..94374b5fc989 100644 --- a/drivers/vfio/cdx/private.h +++ b/drivers/vfio/cdx/private.h @@ -6,6 +6,8 @@ #ifndef VFIO_CDX_PRIVATE_H #define VFIO_CDX_PRIVATE_H +#include + #define VFIO_CDX_OFFSET_SHIFT 40 static inline u64 vfio_cdx_index_to_offset(u32 index) @@ -31,6 +33,7 @@ struct vfio_cdx_region { struct vfio_cdx_device { struct vfio_device vdev; struct vfio_cdx_region *regions; + struct mutex cdx_irqs_lock; struct vfio_cdx_irq *cdx_irqs; u32 flags; #define BME_SUPPORT BIT(0) -- cgit v1.2.3 From 30471982cd667972ba93ce894765d4b8544958e6 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 17 Apr 2026 14:27:58 -0600 Subject: vfio/cdx: Consolidate MSI configured state onto cdx_irqs struct vfio_cdx_device carries three fields that track whether MSI has been configured: vdev->cdx_irqs (the allocated vector array), vdev-> msi_count (the array length), and vdev->config_msi (a boolean flag). The three are set together when vfio_cdx_msi_enable() succeeds and cleared together by vfio_cdx_msi_disable(). However, the error paths in vfio_cdx_msi_enable() free the cdx_irqs allocation on failure without resetting the pointer, leaving it stale and skewed from the other two fields until the next enable call overwrites it. Clear vdev->cdx_irqs to NULL alongside the kfree() in both error paths so the pointer consistently reflects the configured state. With that invariant restored and access to the MSI state serialized by cdx_irqs_lock, vdev->config_msi is fully redundant with (vdev->cdx_irqs != NULL). Drop the config_msi field and switch all readers to test cdx_irqs directly. Assisted-by: Claude:claude-opus-4-7 Signed-off-by: Alex Williamson Acked-by: Nikhil Agarwal Link: https://lore.kernel.org/r/20260417202800.88287-4-alex.williamson@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/cdx/intr.c | 29 ++++++++++++++--------------- drivers/vfio/cdx/private.h | 1 - 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/drivers/vfio/cdx/intr.c b/drivers/vfio/cdx/intr.c index 6dfe0ced3bdd..4439481fe633 100644 --- a/drivers/vfio/cdx/intr.c +++ b/drivers/vfio/cdx/intr.c @@ -32,26 +32,27 @@ static int vfio_cdx_msi_enable(struct vfio_cdx_device *vdev, int nvec) return -ENOMEM; ret = cdx_enable_msi(cdx_dev); - if (ret) { - kfree(vdev->cdx_irqs); - return ret; - } + if (ret) + goto err_free; /* Allocate cdx MSIs */ ret = msi_domain_alloc_irqs(dev, MSI_DEFAULT_DOMAIN, nvec); - if (ret) { - cdx_disable_msi(cdx_dev); - kfree(vdev->cdx_irqs); - return ret; - } + if (ret) + goto err_disable; for (msi_idx = 0; msi_idx < nvec; msi_idx++) vdev->cdx_irqs[msi_idx].irq_no = msi_get_virq(dev, msi_idx); vdev->msi_count = nvec; - vdev->config_msi = 1; return 0; + +err_disable: + cdx_disable_msi(cdx_dev); +err_free: + kfree(vdev->cdx_irqs); + vdev->cdx_irqs = NULL; + return ret; } static int vfio_cdx_msi_set_vector_signal(struct vfio_cdx_device *vdev, @@ -129,7 +130,7 @@ static void vfio_cdx_msi_disable(struct vfio_cdx_device *vdev) vfio_cdx_msi_set_block(vdev, 0, vdev->msi_count, NULL); - if (!vdev->config_msi) + if (!vdev->cdx_irqs) return; msi_domain_free_irqs_all(dev, MSI_DEFAULT_DOMAIN); @@ -138,7 +139,6 @@ static void vfio_cdx_msi_disable(struct vfio_cdx_device *vdev) vdev->cdx_irqs = NULL; vdev->msi_count = 0; - vdev->config_msi = 0; } static int vfio_cdx_set_msi_trigger(struct vfio_cdx_device *vdev, @@ -163,7 +163,7 @@ static int vfio_cdx_set_msi_trigger(struct vfio_cdx_device *vdev, s32 *fds = data; int ret; - if (vdev->config_msi) + if (vdev->cdx_irqs) return vfio_cdx_msi_set_block(vdev, start, count, fds); ret = vfio_cdx_msi_enable(vdev, cdx_dev->num_msi); @@ -177,8 +177,7 @@ static int vfio_cdx_set_msi_trigger(struct vfio_cdx_device *vdev, return ret; } - /* Ensure MSI is configured before accessing cdx_irqs */ - if (!vdev->config_msi) + if (!vdev->cdx_irqs) return -EINVAL; for (i = start; i < start + count; i++) { diff --git a/drivers/vfio/cdx/private.h b/drivers/vfio/cdx/private.h index 94374b5fc989..4c00bf633356 100644 --- a/drivers/vfio/cdx/private.h +++ b/drivers/vfio/cdx/private.h @@ -38,7 +38,6 @@ struct vfio_cdx_device { u32 flags; #define BME_SUPPORT BIT(0) u32 msi_count; - u8 config_msi; }; #ifdef CONFIG_GENERIC_MSI_IRQ -- cgit v1.2.3