Merge tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd

Pull iommufd updates from Jason Gunthorpe:
 "Two significant new items:

   - Allow reporting IOMMU HW events to userspace when the events are
     clearly linked to a device.

     This is linked to the VIOMMU object and is intended to be used by a
     VMM to forward HW events to the virtual machine as part of
     emulating a vIOMMU. ARM SMMUv3 is the first driver to use this
     mechanism. Like the existing fault events the data is delivered
     through a simple FD returning event records on read().

   - PASID support in VFIO.

     The "Process Address Space ID" is a PCI feature that allows the
     device to tag all PCI DMA operations with an ID. The IOMMU will
     then use the ID to select a unique translation for those DMAs. This
     is part of Intel's vIOMMU support as VT-D HW requires the
     hypervisor to manage each PASID entry.

     The support is generic so any VFIO user could attach any
     translation to a PASID, and the support should work on ARM SMMUv3
     as well. AMD requires additional driver work.

  Some minor updates, along with fixes:

   - Prevent using nested parents with faults, no driver support today

   - Put a single "cookie_type" value in the iommu_domain to indicate
     what owns the various opaque owner fields"

* tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd: (49 commits)
  iommufd: Test attach before detaching pasid
  iommufd: Fix iommu_vevent_header tables markup
  iommu: Convert unreachable() to BUG()
  iommufd: Balance veventq->num_events inc/dec
  iommufd: Initialize the flags of vevent in iommufd_viommu_report_event()
  iommufd/selftest: Add coverage for reporting max_pasid_log2 via IOMMU_HW_INFO
  iommufd: Extend IOMMU_GET_HW_INFO to report PASID capability
  vfio: VFIO_DEVICE_[AT|DE]TACH_IOMMUFD_PT support pasid
  vfio-iommufd: Support pasid [at|de]tach for physical VFIO devices
  ida: Add ida_find_first_range()
  iommufd/selftest: Add coverage for iommufd pasid attach/detach
  iommufd/selftest: Add test ops to test pasid attach/detach
  iommufd/selftest: Add a helper to get test device
  iommufd/selftest: Add set_dev_pasid in mock iommu
  iommufd: Allow allocating PASID-compatible domain
  iommu/vt-d: Add IOMMU_HWPT_ALLOC_PASID support
  iommufd: Enforce PASID-compatible domain for RID
  iommufd: Support pasid attach/replace
  iommufd: Enforce PASID-compatible domain in PASID path
  iommufd/device: Add pasid_attach array to track per-PASID attach
  ...
This commit is contained in:
Linus Torvalds
2025-04-01 18:03:46 -07:00
39 changed files with 3155 additions and 837 deletions

View File

@@ -55,6 +55,7 @@ enum {
IOMMUFD_CMD_VIOMMU_ALLOC = 0x90,
IOMMUFD_CMD_VDEVICE_ALLOC = 0x91,
IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92,
IOMMUFD_CMD_VEVENTQ_ALLOC = 0x93,
};
/**
@@ -392,6 +393,9 @@ struct iommu_vfio_ioas {
* Any domain attached to the non-PASID part of the
* device must also be flagged, otherwise attaching a
* PASID will be blocked.
* For the user that wants to attach PASID, ioas is
* not recommended for both the non-PASID part
* and PASID part of the device.
* If IOMMU does not support PASID it will return
* error (-EOPNOTSUPP).
*/
@@ -608,9 +612,17 @@ enum iommu_hw_info_type {
* IOMMU_HWPT_GET_DIRTY_BITMAP
* IOMMU_HWPT_SET_DIRTY_TRACKING
*
* @IOMMU_HW_CAP_PCI_PASID_EXEC: Execute Permission Supported, user ignores it
* when the struct
* iommu_hw_info::out_max_pasid_log2 is zero.
* @IOMMU_HW_CAP_PCI_PASID_PRIV: Privileged Mode Supported, user ignores it
* when the struct
* iommu_hw_info::out_max_pasid_log2 is zero.
*/
/*
 * Capability bits reported through iommu_hw_info::out_capabilities.
 * Each enumerator occupies its own bit.
 */
enum iommufd_hw_capabilities {
/* NOTE(review): relates to the IOMMU_HWPT_*DIRTY* operations listed in the kernel-doc above — confirm */
IOMMU_HW_CAP_DIRTY_TRACKING = 1 << 0,
/* Execute Permission Supported; ignored when iommu_hw_info::out_max_pasid_log2 is zero */
IOMMU_HW_CAP_PCI_PASID_EXEC = 1 << 1,
/* Privileged Mode Supported; ignored when iommu_hw_info::out_max_pasid_log2 is zero */
IOMMU_HW_CAP_PCI_PASID_PRIV = 1 << 2,
};
/**
@@ -626,6 +638,9 @@ enum iommufd_hw_capabilities {
* iommu_hw_info_type.
* @out_capabilities: Output the generic iommu capability info type as defined
* in the enum iommufd_hw_capabilities.
* @out_max_pasid_log2: Output the width of PASIDs. 0 means no PASID support.
* PCI devices turn to out_capabilities to check whether
* a specific capability is supported or not.
* @__reserved: Must be 0
*
* Query an iommu type specific hardware information data from an iommu behind
@@ -649,7 +664,8 @@ struct iommu_hw_info {
__u32 data_len;
__aligned_u64 data_uptr;
__u32 out_data_type;
__u32 __reserved;
__u8 out_max_pasid_log2;
__u8 __reserved[3];
__aligned_u64 out_capabilities;
};
#define IOMMU_GET_HW_INFO _IO(IOMMUFD_TYPE, IOMMUFD_CMD_GET_HW_INFO)
@@ -1014,4 +1030,115 @@ struct iommu_ioas_change_process {
#define IOMMU_IOAS_CHANGE_PROCESS \
_IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_CHANGE_PROCESS)
/**
* enum iommu_veventq_flag - flags for struct iommufd_vevent_header::flags
* @IOMMU_VEVENTQ_FLAG_LOST_EVENTS: vEVENTQ has lost vEVENTs. A header carrying
*                                  this flag is not followed by any event data.
*/
enum iommu_veventq_flag {
IOMMU_VEVENTQ_FLAG_LOST_EVENTS = (1U << 0),
};
/**
* struct iommufd_vevent_header - Virtual Event Header for a vEVENTQ Status
* @flags: Combination of enum iommu_veventq_flag
* @sequence: The sequence index of a vEVENT in the vEVENTQ, with a range of
*            [0, INT_MAX]; the index that follows INT_MAX wraps back to 0
*
* Each iommufd_vevent_header reports the sequence index of the vEVENT that
* follows it:
*
* +----------------------+-------+----------------------+-------+---+-------+
* | header0 {sequence=0} | data0 | header1 {sequence=1} | data1 |...| dataN |
* +----------------------+-------+----------------------+-------+---+-------+
*
* This sequence index is expected to increase monotonically from the sequence
* index of the previous vEVENT. If two adjacent sequence indexes have a delta
* larger than 1, it means that (delta - 1) vEVENTs have been lost, e.g. two
* lost vEVENTs:
*
* +-----+----------------------+-------+----------------------+-------+-----+
* | ... | header3 {sequence=3} | data3 | header6 {sequence=6} | data6 | ... |
* +-----+----------------------+-------+----------------------+-------+-----+
*
* If a vEVENT is lost at the tail of the vEVENTQ and there is no following
* vEVENT providing the next sequence index, an IOMMU_VEVENTQ_FLAG_LOST_EVENTS
* header is added to the tail, and no data follows this header:
*
* +--+----------------------+-------+-----------------------------------------+
* |..| header3 {sequence=3} | data3 | header4 {flags=LOST_EVENTS, sequence=4} |
* +--+----------------------+-------+-----------------------------------------+
*/
struct iommufd_vevent_header {
__u32 flags;
__u32 sequence;
};
/**
* enum iommu_veventq_type - Virtual Event Queue Type
* @IOMMU_VEVENTQ_TYPE_DEFAULT: Reserved for future use
* @IOMMU_VEVENTQ_TYPE_ARM_SMMUV3: ARM SMMUv3 Virtual Event Queue. Its event
*                                 data is struct iommu_vevent_arm_smmuv3.
*/
enum iommu_veventq_type {
IOMMU_VEVENTQ_TYPE_DEFAULT = 0,
IOMMU_VEVENTQ_TYPE_ARM_SMMUV3 = 1,
};
/**
* struct iommu_vevent_arm_smmuv3 - ARM SMMUv3 Virtual Event
* (IOMMU_VEVENTQ_TYPE_ARM_SMMUV3)
* @evt: 256-bit ARM SMMUv3 Event record, little-endian.
* Reported event records: (Refer to "7.3 Event records" in SMMUv3 HW Spec)
* - 0x04 C_BAD_STE
* - 0x06 F_STREAM_DISABLED
* - 0x08 C_BAD_SUBSTREAMID
* - 0x0a C_BAD_CD
* - 0x10 F_TRANSLATION
* - 0x11 F_ADDR_SIZE
* - 0x12 F_ACCESS
* - 0x13 F_PERMISSION
*
* The StreamID field reports a virtual device ID. To receive a virtual event
* for a device, a vDEVICE must be allocated via IOMMU_VDEVICE_ALLOC.
*/
struct iommu_vevent_arm_smmuv3 {
/* One 256-bit event record stored as four little-endian 64-bit words */
__aligned_le64 evt[4];
};
/**
* struct iommu_veventq_alloc - ioctl(IOMMU_VEVENTQ_ALLOC)
* @size: sizeof(struct iommu_veventq_alloc)
* @flags: Must be 0
* @viommu_id: virtual IOMMU ID to associate the vEVENTQ with
* @type: Type of the vEVENTQ. Must be defined in enum iommu_veventq_type
* @veventq_depth: Maximum number of events in the vEVENTQ
* @out_veventq_id: The ID of the new vEVENTQ
* @out_veventq_fd: The fd of the new vEVENTQ. User space must close the
*                  successfully returned fd after using it
* @__reserved: Must be 0
*
* Explicitly allocate a virtual event queue interface for a vIOMMU. A vIOMMU
* can have multiple FDs for different types, but is confined to one per @type.
* User space should read from the @out_veventq_fd to get vEVENTs out of a
* vEVENTQ, if there are vEVENTs available. A vEVENTQ will lose events due to
* overflow, if the number of the vEVENTs hits @veventq_depth.
*
* Each vEVENT in a vEVENTQ encloses a struct iommufd_vevent_header followed by
* a type-specific data structure, in a normal case:
*
* +-+---------+-------+---------+-------+-----+---------+-------+-+
* | | header0 | data0 | header1 | data1 | ... | headerN | dataN | |
* +-+---------+-------+---------+-------+-----+---------+-------+-+
*
* unless a trailing IOMMU_VEVENTQ_FLAG_LOST_EVENTS header is logged (refer to
* struct iommufd_vevent_header).
*/
struct iommu_veventq_alloc {
__u32 size;
__u32 flags;
__u32 viommu_id;
__u32 type;
__u32 veventq_depth;
__u32 out_veventq_id;
__u32 out_veventq_fd;
__u32 __reserved;
};
#define IOMMU_VEVENTQ_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VEVENTQ_ALLOC)
#endif

View File

@@ -932,29 +932,34 @@ struct vfio_device_bind_iommufd {
* VFIO_DEVICE_ATTACH_IOMMUFD_PT - _IOW(VFIO_TYPE, VFIO_BASE + 19,
* struct vfio_device_attach_iommufd_pt)
* @argsz: User filled size of this data.
* @flags: Must be 0.
* @flags: Flags for attach.
* @pt_id: Input the target id which can represent an ioas or a hwpt
* allocated via iommufd subsystem.
* Output the input ioas id or the attached hwpt id which could
* be the specified hwpt itself or a hwpt automatically created
* for the specified ioas by kernel during the attachment.
* @pasid: The pasid to be attached, only meaningful when
* VFIO_DEVICE_ATTACH_PASID is set in @flags
*
* Associate the device with an address space within the bound iommufd.
* Undo by VFIO_DEVICE_DETACH_IOMMUFD_PT or device fd close. This is only
* allowed on cdev fds.
*
* If a vfio device is currently attached to a valid hw_pagetable, without doing
* a VFIO_DEVICE_DETACH_IOMMUFD_PT, a second VFIO_DEVICE_ATTACH_IOMMUFD_PT ioctl
* passing in another hw_pagetable (hwpt) id is allowed. This action, also known
* as a hw_pagetable replacement, will replace the device's currently attached
* hw_pagetable with a new hw_pagetable corresponding to the given pt_id.
* If a vfio device or a pasid of this device is currently attached to a valid
* hw_pagetable (hwpt), without doing a VFIO_DEVICE_DETACH_IOMMUFD_PT, a second
* VFIO_DEVICE_ATTACH_IOMMUFD_PT ioctl passing in another hwpt id is allowed.
* This action, also known as a hw_pagetable replacement, will replace the
* currently attached hwpt of the device or the pasid of this device with a new
* hwpt corresponding to the given pt_id.
*
* Return: 0 on success, -errno on failure.
*/
/* Argument of VFIO_DEVICE_ATTACH_IOMMUFD_PT */
struct vfio_device_attach_iommufd_pt {
/* User filled size of this data */
__u32 argsz;
/* Flags for attach */
__u32 flags;
/* When set in flags, the pasid field below is meaningful */
#define VFIO_DEVICE_ATTACH_PASID (1 << 0)
/* In: target ioas or hwpt id. Out: the input ioas id or the attached hwpt id */
__u32 pt_id;
/* The pasid to be attached; only meaningful with VFIO_DEVICE_ATTACH_PASID */
__u32 pasid;
};
#define VFIO_DEVICE_ATTACH_IOMMUFD_PT _IO(VFIO_TYPE, VFIO_BASE + 19)
@@ -963,17 +968,21 @@ struct vfio_device_attach_iommufd_pt {
* VFIO_DEVICE_DETACH_IOMMUFD_PT - _IOW(VFIO_TYPE, VFIO_BASE + 20,
* struct vfio_device_detach_iommufd_pt)
* @argsz: User filled size of this data.
* @flags: Must be 0.
* @flags: Flags for detach.
* @pasid: The pasid to be detached, only meaningful when
* VFIO_DEVICE_DETACH_PASID is set in @flags
*
* Remove the association of the device and its current associated address
* space. After it, the device should be in a blocking DMA state. This is only
* allowed on cdev fds.
* Remove the association of the device or a pasid of the device and its current
* associated address space. After it, the device or the pasid should be in a
* blocking DMA state. This is only allowed on cdev fds.
*
* Return: 0 on success, -errno on failure.
*/
/* Argument of VFIO_DEVICE_DETACH_IOMMUFD_PT */
struct vfio_device_detach_iommufd_pt {
/* User filled size of this data */
__u32 argsz;
/* Flags for detach */
__u32 flags;
/* When set in flags, the pasid field below is meaningful */
#define VFIO_DEVICE_DETACH_PASID (1 << 0)
/* The pasid to be detached; only meaningful with VFIO_DEVICE_DETACH_PASID */
__u32 pasid;
};
#define VFIO_DEVICE_DETACH_IOMMUFD_PT _IO(VFIO_TYPE, VFIO_BASE + 20)