Diffstat (limited to 'hw/virtio')
-rw-r--r--  hw/virtio/Kconfig                      87
-rw-r--r--  hw/virtio/meson.build                  67
-rw-r--r--  hw/virtio/trace-events                151
-rw-r--r--  hw/virtio/trace.h                       1
-rw-r--r--  hw/virtio/vhost-backend.c             410
-rw-r--r--  hw/virtio/vhost-iova-tree.c           110
-rw-r--r--  hw/virtio/vhost-iova-tree.h            27
-rw-r--r--  hw/virtio/vhost-scsi-pci.c            104
-rw-r--r--  hw/virtio/vhost-shadow-virtqueue.c    769
-rw-r--r--  hw/virtio/vhost-shadow-virtqueue.h    139
-rw-r--r--  hw/virtio/vhost-stub.c                 17
-rw-r--r--  hw/virtio/vhost-user-blk-pci.c        109
-rw-r--r--  hw/virtio/vhost-user-fs-pci.c          88
-rw-r--r--  hw/virtio/vhost-user-fs.c             336
-rw-r--r--  hw/virtio/vhost-user-gpio-pci.c        69
-rw-r--r--  hw/virtio/vhost-user-gpio.c           418
-rw-r--r--  hw/virtio/vhost-user-i2c-pci.c         69
-rw-r--r--  hw/virtio/vhost-user-i2c.c            287
-rw-r--r--  hw/virtio/vhost-user-input-pci.c       50
-rw-r--r--  hw/virtio/vhost-user-rng-pci.c         79
-rw-r--r--  hw/virtio/vhost-user-rng.c            299
-rw-r--r--  hw/virtio/vhost-user-scsi-pci.c       110
-rw-r--r--  hw/virtio/vhost-user-vsock-pci.c       86
-rw-r--r--  hw/virtio/vhost-user-vsock.c          180
-rw-r--r--  hw/virtio/vhost-user.c               2800
-rw-r--r--  hw/virtio/vhost-vdpa.c               1313
-rw-r--r--  hw/virtio/vhost-vsock-common.c        300
-rw-r--r--  hw/virtio/vhost-vsock-pci.c            96
-rw-r--r--  hw/virtio/vhost-vsock.c               239
-rw-r--r--  hw/virtio/vhost.c                    1942
-rw-r--r--  hw/virtio/virtio-9p-pci.c              91
-rw-r--r--  hw/virtio/virtio-balloon-pci.c         88
-rw-r--r--  hw/virtio/virtio-balloon.c           1079
-rw-r--r--  hw/virtio/virtio-blk-pci.c            107
-rw-r--r--  hw/virtio/virtio-bus.c                372
-rw-r--r--  hw/virtio/virtio-crypto-pci.c          94
-rw-r--r--  hw/virtio/virtio-crypto.c            1250
-rw-r--r--  hw/virtio/virtio-input-host-pci.c      47
-rw-r--r--  hw/virtio/virtio-input-pci.c          155
-rw-r--r--  hw/virtio/virtio-iommu-pci.c          112
-rw-r--r--  hw/virtio/virtio-iommu.c             1425
-rw-r--r--  hw/virtio/virtio-mem-pci.c            161
-rw-r--r--  hw/virtio/virtio-mem-pci.h             35
-rw-r--r--  hw/virtio/virtio-mem.c               1386
-rw-r--r--  hw/virtio/virtio-mmio.c               862
-rw-r--r--  hw/virtio/virtio-net-pci.c            108
-rw-r--r--  hw/virtio/virtio-pci.c               2298
-rw-r--r--  hw/virtio/virtio-pmem-pci.c           127
-rw-r--r--  hw/virtio/virtio-pmem-pci.h            35
-rw-r--r--  hw/virtio/virtio-pmem.c               196
-rw-r--r--  hw/virtio/virtio-rng-pci.c             96
-rw-r--r--  hw/virtio/virtio-rng.c                289
-rw-r--r--  hw/virtio/virtio-scsi-pci.c           114
-rw-r--r--  hw/virtio/virtio-serial-pci.c         117
-rw-r--r--  hw/virtio/virtio-stub.c                42
-rw-r--r--  hw/virtio/virtio.c                   4991
56 files changed, 26429 insertions, 0 deletions
diff --git a/hw/virtio/Kconfig b/hw/virtio/Kconfig
new file mode 100644
index 00000000..cbfd8c71
--- /dev/null
+++ b/hw/virtio/Kconfig
@@ -0,0 +1,87 @@
+config VIRTIO
+ bool
+
+config VIRTIO_RNG
+ bool
+ default y
+ depends on VIRTIO
+
+config VIRTIO_IOMMU
+ bool
+ default y
+ depends on PCI && VIRTIO
+
+config VIRTIO_PCI
+ bool
+ default y if PCI_DEVICES
+ depends on PCI
+ select VIRTIO
+
+config VIRTIO_MMIO
+ bool
+ select VIRTIO
+
+config VIRTIO_CCW
+ bool
+ select VIRTIO
+
+config VIRTIO_BALLOON
+ bool
+ default y
+ depends on VIRTIO
+
+config VIRTIO_CRYPTO
+ bool
+ default y
+ depends on VIRTIO
+
+config VIRTIO_PMEM_SUPPORTED
+ bool
+
+config VIRTIO_PMEM
+ bool
+ default y
+ depends on VIRTIO
+ depends on VIRTIO_PMEM_SUPPORTED
+ select MEM_DEVICE
+
+config VIRTIO_MEM_SUPPORTED
+ bool
+
+config VIRTIO_MEM
+ bool
+ default y
+ depends on VIRTIO
+ depends on LINUX
+ depends on VIRTIO_MEM_SUPPORTED
+ select MEM_DEVICE
+
+config VHOST_VSOCK
+ bool
+ default y
+ depends on VIRTIO && VHOST_KERNEL
+
+config VHOST_USER_VSOCK
+ bool
+ default y
+ depends on VIRTIO && VHOST_USER
+
+config VHOST_USER_I2C
+ bool
+ default y
+ depends on VIRTIO && VHOST_USER
+
+config VHOST_USER_RNG
+ bool
+ default y
+ depends on VIRTIO && VHOST_USER
+
+config VHOST_USER_FS
+ bool
+ default y
+ depends on VIRTIO && VHOST_USER
+
+config VHOST_USER_GPIO
+ bool
+ default y
+ depends on VIRTIO && VHOST_USER
diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
new file mode 100644
index 00000000..dfed1e7a
--- /dev/null
+++ b/hw/virtio/meson.build
@@ -0,0 +1,67 @@
+softmmu_virtio_ss = ss.source_set()
+softmmu_virtio_ss.add(files('virtio-bus.c'))
+softmmu_virtio_ss.add(when: 'CONFIG_VIRTIO_PCI', if_true: files('virtio-pci.c'))
+softmmu_virtio_ss.add(when: 'CONFIG_VIRTIO_MMIO', if_true: files('virtio-mmio.c'))
+
+virtio_ss = ss.source_set()
+virtio_ss.add(files('virtio.c'))
+
+if have_vhost
+ virtio_ss.add(files('vhost.c', 'vhost-backend.c', 'vhost-iova-tree.c'))
+ if have_vhost_user
+ virtio_ss.add(files('vhost-user.c'))
+ endif
+ if have_vhost_vdpa
+ virtio_ss.add(files('vhost-vdpa.c', 'vhost-shadow-virtqueue.c'))
+ endif
+else
+ softmmu_virtio_ss.add(files('vhost-stub.c'))
+endif
+
+virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c'))
+virtio_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('virtio-crypto.c'))
+virtio_ss.add(when: 'CONFIG_VHOST_USER_FS', if_true: files('vhost-user-fs.c'))
+virtio_ss.add(when: 'CONFIG_VIRTIO_PMEM', if_true: files('virtio-pmem.c'))
+virtio_ss.add(when: 'CONFIG_VHOST_VSOCK', if_true: files('vhost-vsock.c', 'vhost-vsock-common.c'))
+virtio_ss.add(when: 'CONFIG_VHOST_USER_VSOCK', if_true: files('vhost-user-vsock.c', 'vhost-vsock-common.c'))
+virtio_ss.add(when: 'CONFIG_VIRTIO_RNG', if_true: files('virtio-rng.c'))
+virtio_ss.add(when: 'CONFIG_VIRTIO_IOMMU', if_true: files('virtio-iommu.c'))
+virtio_ss.add(when: 'CONFIG_VIRTIO_MEM', if_true: files('virtio-mem.c'))
+virtio_ss.add(when: 'CONFIG_VHOST_USER_I2C', if_true: files('vhost-user-i2c.c'))
+virtio_ss.add(when: 'CONFIG_VHOST_USER_RNG', if_true: files('vhost-user-rng.c'))
+virtio_ss.add(when: 'CONFIG_VHOST_USER_GPIO', if_true: files('vhost-user-gpio.c'))
+virtio_ss.add(when: ['CONFIG_VIRTIO_PCI', 'CONFIG_VHOST_USER_GPIO'], if_true: files('vhost-user-gpio-pci.c'))
+
+virtio_pci_ss = ss.source_set()
+virtio_pci_ss.add(when: 'CONFIG_VHOST_VSOCK', if_true: files('vhost-vsock-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_VSOCK', if_true: files('vhost-user-vsock-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_BLK', if_true: files('vhost-user-blk-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_I2C', if_true: files('vhost-user-i2c-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_INPUT', if_true: files('vhost-user-input-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_RNG', if_true: files('vhost-user-rng-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_SCSI', if_true: files('vhost-user-scsi-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VHOST_SCSI', if_true: files('vhost-scsi-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_FS', if_true: files('vhost-user-fs-pci.c'))
+
+virtio_pci_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('virtio-crypto-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VIRTIO_INPUT_HOST', if_true: files('virtio-input-host-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VIRTIO_INPUT', if_true: files('virtio-input-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VIRTIO_RNG', if_true: files('virtio-rng-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VIRTIO_9P', if_true: files('virtio-9p-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VIRTIO_SCSI', if_true: files('virtio-scsi-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VIRTIO_BLK', if_true: files('virtio-blk-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('virtio-net-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VIRTIO_SERIAL', if_true: files('virtio-serial-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VIRTIO_PMEM', if_true: files('virtio-pmem-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VIRTIO_IOMMU', if_true: files('virtio-iommu-pci.c'))
+virtio_pci_ss.add(when: 'CONFIG_VIRTIO_MEM', if_true: files('virtio-mem-pci.c'))
+
+virtio_ss.add_all(when: 'CONFIG_VIRTIO_PCI', if_true: virtio_pci_ss)
+
+specific_ss.add_all(when: 'CONFIG_VIRTIO', if_true: virtio_ss)
+softmmu_ss.add_all(when: 'CONFIG_VIRTIO', if_true: softmmu_virtio_ss)
+softmmu_ss.add(when: 'CONFIG_VIRTIO', if_false: files('vhost-stub.c'))
+softmmu_ss.add(when: 'CONFIG_VIRTIO', if_false: files('virtio-stub.c'))
+softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c'))
+softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('virtio-stub.c'))
diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
new file mode 100644
index 00000000..14fc5b9b
--- /dev/null
+++ b/hw/virtio/trace-events
@@ -0,0 +1,151 @@
+# See docs/devel/tracing.rst for syntax documentation.
+
+# vhost.c
+vhost_commit(bool started, bool changed) "Started: %d Changed: %d"
+vhost_region_add_section(const char *name, uint64_t gpa, uint64_t size, uint64_t host) "%s: 0x%"PRIx64"+0x%"PRIx64" @ 0x%"PRIx64
+vhost_region_add_section_merge(const char *name, uint64_t new_size, uint64_t gpa, uint64_t owr) "%s: size: 0x%"PRIx64 " gpa: 0x%"PRIx64 " owr: 0x%"PRIx64
+vhost_region_add_section_aligned(const char *name, uint64_t gpa, uint64_t size, uint64_t host) "%s: 0x%"PRIx64"+0x%"PRIx64" @ 0x%"PRIx64
+vhost_section(const char *name) "%s"
+vhost_reject_section(const char *name, int d) "%s:%d"
+vhost_iotlb_miss(void *dev, int step) "%p step %d"
+vhost_dev_cleanup(void *dev) "%p"
+vhost_dev_start(void *dev, const char *name, bool vrings) "%p:%s vrings:%d"
+vhost_dev_stop(void *dev, const char *name, bool vrings) "%p:%s vrings:%d"
+
+
+# vhost-user.c
+vhost_user_postcopy_end_entry(void) ""
+vhost_user_postcopy_end_exit(void) ""
+vhost_user_postcopy_fault_handler(const char *name, uint64_t fault_address, int nregions) "%s: @0x%"PRIx64" nregions:%d"
+vhost_user_postcopy_fault_handler_loop(int i, uint64_t client_base, uint64_t size) "%d: client 0x%"PRIx64" +0x%"PRIx64
+vhost_user_postcopy_fault_handler_found(int i, uint64_t region_offset, uint64_t rb_offset) "%d: region_offset: 0x%"PRIx64" rb_offset:0x%"PRIx64
+vhost_user_postcopy_listen(void) ""
+vhost_user_set_mem_table_postcopy(uint64_t client_addr, uint64_t qhva, int reply_i, int region_i) "client:0x%"PRIx64" for hva: 0x%"PRIx64" reply %d region %d"
+vhost_user_set_mem_table_withfd(int index, const char *name, uint64_t memory_size, uint64_t guest_phys_addr, uint64_t userspace_addr, uint64_t offset) "%d:%s: size:0x%"PRIx64" GPA:0x%"PRIx64" QVA/userspace:0x%"PRIx64" RB offset:0x%"PRIx64
+vhost_user_postcopy_waker(const char *rb, uint64_t rb_offset) "%s + 0x%"PRIx64
+vhost_user_postcopy_waker_found(uint64_t client_addr) "0x%"PRIx64
+vhost_user_postcopy_waker_nomatch(const char *rb, uint64_t rb_offset) "%s + 0x%"PRIx64
+vhost_user_read(uint32_t req, uint32_t flags) "req:%d flags:0x%"PRIx32""
+vhost_user_write(uint32_t req, uint32_t flags) "req:%d flags:0x%"PRIx32""
+vhost_user_create_notifier(int idx, void *n) "idx:%d n:%p"
+
+# vhost-vdpa.c
+vhost_vdpa_dma_map(void *vdpa, int fd, uint32_t msg_type, uint64_t iova, uint64_t size, uint64_t uaddr, uint8_t perm, uint8_t type) "vdpa:%p fd: %d msg_type: %"PRIu32" iova: 0x%"PRIx64" size: 0x%"PRIx64" uaddr: 0x%"PRIx64" perm: 0x%"PRIx8" type: %"PRIu8
+vhost_vdpa_dma_unmap(void *vdpa, int fd, uint32_t msg_type, uint64_t iova, uint64_t size, uint8_t type) "vdpa:%p fd: %d msg_type: %"PRIu32" iova: 0x%"PRIx64" size: 0x%"PRIx64" type: %"PRIu8
+vhost_vdpa_listener_begin_batch(void *v, int fd, uint32_t msg_type, uint8_t type) "vdpa:%p fd: %d msg_type: %"PRIu32" type: %"PRIu8
+vhost_vdpa_listener_commit(void *v, int fd, uint32_t msg_type, uint8_t type) "vdpa:%p fd: %d msg_type: %"PRIu32" type: %"PRIu8
+vhost_vdpa_listener_region_add(void *vdpa, uint64_t iova, uint64_t llend, void *vaddr, bool readonly) "vdpa: %p iova 0x%"PRIx64" llend 0x%"PRIx64" vaddr: %p read-only: %d"
+vhost_vdpa_listener_region_del(void *vdpa, uint64_t iova, uint64_t llend) "vdpa: %p iova 0x%"PRIx64" llend 0x%"PRIx64
+vhost_vdpa_add_status(void *dev, uint8_t status) "dev: %p status: 0x%"PRIx8
+vhost_vdpa_init(void *dev, void *vdpa) "dev: %p vdpa: %p"
+vhost_vdpa_cleanup(void *dev, void *vdpa) "dev: %p vdpa: %p"
+vhost_vdpa_memslots_limit(void *dev, int ret) "dev: %p = 0x%x"
+vhost_vdpa_set_mem_table(void *dev, uint32_t nregions, uint32_t padding) "dev: %p nregions: %"PRIu32" padding: 0x%"PRIx32
+vhost_vdpa_dump_regions(void *dev, int i, uint64_t guest_phys_addr, uint64_t memory_size, uint64_t userspace_addr, uint64_t flags_padding) "dev: %p %d: guest_phys_addr: 0x%"PRIx64" memory_size: 0x%"PRIx64" userspace_addr: 0x%"PRIx64" flags_padding: 0x%"PRIx64
+vhost_vdpa_set_features(void *dev, uint64_t features) "dev: %p features: 0x%"PRIx64
+vhost_vdpa_get_device_id(void *dev, uint32_t device_id) "dev: %p device_id %"PRIu32
+vhost_vdpa_reset_device(void *dev, uint8_t status) "dev: %p status: 0x%"PRIx8
+vhost_vdpa_get_vq_index(void *dev, int idx, int vq_idx) "dev: %p idx: %d vq idx: %d"
+vhost_vdpa_set_vring_ready(void *dev) "dev: %p"
+vhost_vdpa_dump_config(void *dev, const char *line) "dev: %p %s"
+vhost_vdpa_set_config(void *dev, uint32_t offset, uint32_t size, uint32_t flags) "dev: %p offset: %"PRIu32" size: %"PRIu32" flags: 0x%"PRIx32
+vhost_vdpa_get_config(void *dev, void *config, uint32_t config_len) "dev: %p config: %p config_len: %"PRIu32
+vhost_vdpa_dev_start(void *dev, bool started) "dev: %p started: %d"
+vhost_vdpa_set_log_base(void *dev, uint64_t base, unsigned long long size, int refcnt, int fd, void *log) "dev: %p base: 0x%"PRIx64" size: %llu refcnt: %d fd: %d log: %p"
+vhost_vdpa_set_vring_addr(void *dev, unsigned int index, unsigned int flags, uint64_t desc_user_addr, uint64_t used_user_addr, uint64_t avail_user_addr, uint64_t log_guest_addr) "dev: %p index: %u flags: 0x%x desc_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" log_guest_addr: 0x%"PRIx64
+vhost_vdpa_set_vring_num(void *dev, unsigned int index, unsigned int num) "dev: %p index: %u num: %u"
+vhost_vdpa_set_vring_base(void *dev, unsigned int index, unsigned int num) "dev: %p index: %u num: %u"
+vhost_vdpa_get_vring_base(void *dev, unsigned int index, unsigned int num) "dev: %p index: %u num: %u"
+vhost_vdpa_set_vring_kick(void *dev, unsigned int index, int fd) "dev: %p index: %u fd: %d"
+vhost_vdpa_set_vring_call(void *dev, unsigned int index, int fd) "dev: %p index: %u fd: %d"
+vhost_vdpa_get_features(void *dev, uint64_t features) "dev: %p features: 0x%"PRIx64
+vhost_vdpa_set_owner(void *dev) "dev: %p"
+vhost_vdpa_vq_get_addr(void *dev, void *vq, uint64_t desc_user_addr, uint64_t avail_user_addr, uint64_t used_user_addr) "dev: %p vq: %p desc_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64
+vhost_vdpa_get_iova_range(void *dev, uint64_t first, uint64_t last) "dev: %p first: 0x%"PRIx64" last: 0x%"PRIx64
+
+# virtio.c
+virtqueue_alloc_element(void *elem, size_t sz, unsigned in_num, unsigned out_num) "elem %p size %zd in_num %u out_num %u"
+virtqueue_fill(void *vq, const void *elem, unsigned int len, unsigned int idx) "vq %p elem %p len %u idx %u"
+virtqueue_flush(void *vq, unsigned int count) "vq %p count %u"
+virtqueue_pop(void *vq, void *elem, unsigned int in_num, unsigned int out_num) "vq %p elem %p in_num %u out_num %u"
+virtio_queue_notify(void *vdev, int n, void *vq) "vdev %p n %d vq %p"
+virtio_notify_irqfd(void *vdev, void *vq) "vdev %p vq %p"
+virtio_notify(void *vdev, void *vq) "vdev %p vq %p"
+virtio_set_status(void *vdev, uint8_t val) "vdev %p val %u"
+
+# virtio-rng.c
+virtio_rng_guest_not_ready(void *rng) "rng %p: guest not ready"
+virtio_rng_cpu_is_stopped(void *rng, int size) "rng %p: cpu is stopped, dropping %d bytes"
+virtio_rng_popped(void *rng) "rng %p: elem popped"
+virtio_rng_pushed(void *rng, size_t len) "rng %p: %zd bytes pushed"
+virtio_rng_request(void *rng, size_t size, unsigned quota) "rng %p: %zd bytes requested, %u bytes quota left"
+virtio_rng_vm_state_change(void *rng, int running, int state) "rng %p: state change to running %d state %d"
+
+# virtio-balloon.c
+#
+virtio_balloon_bad_addr(uint64_t gpa) "0x%"PRIx64
+virtio_balloon_handle_output(const char *name, uint64_t gpa) "section name: %s gpa: 0x%"PRIx64
+virtio_balloon_get_config(uint32_t num_pages, uint32_t actual) "num_pages: %d actual: %d"
+virtio_balloon_set_config(uint32_t actual, uint32_t oldactual) "actual: %d oldactual: %d"
+virtio_balloon_to_target(uint64_t target, uint32_t num_pages) "balloon target: 0x%"PRIx64" num_pages: %d"
+
+# virtio-mmio.c
+virtio_mmio_read(uint64_t offset) "virtio_mmio_read offset 0x%" PRIx64
+virtio_mmio_write_offset(uint64_t offset, uint64_t value) "virtio_mmio_write offset 0x%" PRIx64 " value 0x%" PRIx64
+virtio_mmio_guest_page(uint64_t size, int shift) "guest page size 0x%" PRIx64 " shift %d"
+virtio_mmio_queue_write(uint64_t value, int max_size) "mmio_queue write 0x%" PRIx64 " max %d"
+virtio_mmio_setting_irq(int level) "virtio_mmio setting IRQ %d"
+
+# virtio-pci.c
+virtio_pci_notify(uint16_t vector) "virtio_pci_notify vec 0x%x"
+virtio_pci_notify_write(uint64_t addr, uint64_t val, unsigned int size) "0x%" PRIx64" = 0x%" PRIx64 " (%d)"
+virtio_pci_notify_write_pio(uint64_t addr, uint64_t val, unsigned int size) "0x%" PRIx64" = 0x%" PRIx64 " (%d)"
+
+# hw/virtio/virtio-iommu.c
+virtio_iommu_device_reset(void) "reset!"
+virtio_iommu_system_reset(void) "system reset!"
+virtio_iommu_get_features(uint64_t features) "device supports features=0x%"PRIx64
+virtio_iommu_device_status(uint8_t status) "driver status = %d"
+virtio_iommu_get_config(uint64_t page_size_mask, uint64_t start, uint64_t end, uint32_t domain_start, uint32_t domain_end, uint32_t probe_size, uint8_t bypass) "page_size_mask=0x%"PRIx64" input range start=0x%"PRIx64" input range end=0x%"PRIx64" domain range start=%d domain range end=%d probe_size=0x%x bypass=0x%x"
+virtio_iommu_set_config(uint8_t bypass) "bypass=0x%x"
+virtio_iommu_attach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d"
+virtio_iommu_detach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d"
+virtio_iommu_map(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end, uint64_t phys_start, uint32_t flags) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64 " phys_start=0x%"PRIx64" flags=%d"
+virtio_iommu_unmap(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64
+virtio_iommu_unmap_done(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64
+virtio_iommu_translate(const char *name, uint32_t rid, uint64_t iova, int flag) "mr=%s rid=%d addr=0x%"PRIx64" flag=%d"
+virtio_iommu_init_iommu_mr(char *iommu_mr) "init %s"
+virtio_iommu_get_endpoint(uint32_t ep_id) "Alloc endpoint=%d"
+virtio_iommu_put_endpoint(uint32_t ep_id) "Free endpoint=%d"
+virtio_iommu_get_domain(uint32_t domain_id) "Alloc domain=%d"
+virtio_iommu_put_domain(uint32_t domain_id) "Free domain=%d"
+virtio_iommu_translate_out(uint64_t virt_addr, uint64_t phys_addr, uint32_t sid) "0x%"PRIx64" -> 0x%"PRIx64 " for sid=%d"
+virtio_iommu_report_fault(uint8_t reason, uint32_t flags, uint32_t endpoint, uint64_t addr) "FAULT reason=%d flags=%d endpoint=%d address=0x%"PRIx64
+virtio_iommu_fill_resv_property(uint32_t devid, uint8_t subtype, uint64_t start, uint64_t end) "dev=%d type=%d start=0x%"PRIx64" end=0x%"PRIx64
+virtio_iommu_notify_map(const char *name, uint64_t virt_start, uint64_t virt_end, uint64_t phys_start, uint32_t flags) "mr=%s virt_start=0x%"PRIx64" virt_end=0x%"PRIx64" phys_start=0x%"PRIx64" flags=%d"
+virtio_iommu_notify_unmap(const char *name, uint64_t virt_start, uint64_t virt_end) "mr=%s virt_start=0x%"PRIx64" virt_end=0x%"PRIx64
+virtio_iommu_remap(const char *name, uint64_t virt_start, uint64_t virt_end, uint64_t phys_start) "mr=%s virt_start=0x%"PRIx64" virt_end=0x%"PRIx64" phys_start=0x%"PRIx64
+virtio_iommu_set_page_size_mask(const char *name, uint64_t old, uint64_t new) "mr=%s old_mask=0x%"PRIx64" new_mask=0x%"PRIx64
+virtio_iommu_notify_flag_add(const char *name) "add notifier to mr %s"
+virtio_iommu_notify_flag_del(const char *name) "del notifier from mr %s"
+virtio_iommu_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
+
+# virtio-mem.c
+virtio_mem_send_response(uint16_t type) "type=%" PRIu16
+virtio_mem_plug_request(uint64_t addr, uint16_t nb_blocks) "addr=0x%" PRIx64 " nb_blocks=%" PRIu16
+virtio_mem_unplug_request(uint64_t addr, uint16_t nb_blocks) "addr=0x%" PRIx64 " nb_blocks=%" PRIu16
+virtio_mem_unplugged_all(void) ""
+virtio_mem_unplug_all_request(void) ""
+virtio_mem_resized_usable_region(uint64_t old_size, uint64_t new_size) "old_size=0x%" PRIx64 " new_size=0x%" PRIx64
+virtio_mem_state_request(uint64_t addr, uint16_t nb_blocks) "addr=0x%" PRIx64 " nb_blocks=%" PRIu16
+virtio_mem_state_response(uint16_t state) "state=%" PRIu16
+
+# virtio-pmem.c
+virtio_pmem_flush_request(void) "flush request"
+virtio_pmem_response(void) "flush response"
+virtio_pmem_flush_done(int type) "fsync return=%d"
+
+# virtio-gpio.c
+virtio_gpio_start(void) "start"
+virtio_gpio_stop(void) "stop"
+virtio_gpio_set_status(uint8_t status) "0x%x"
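
Each line above declares one trace point: tracetool generates a trace_<name>() helper with exactly the declared argument list, and the quoted format string controls how the arguments are rendered. A minimal sketch of emitting the virtio_notify event declared above (the wrapper function is illustrative, not actual virtio.c code; only the trace call itself is derived from this file):

#include "qemu/osdep.h"
#include "hw/virtio/virtio.h"
#include "trace.h"   /* pulls in the generated helpers, see trace.h below */

static void example_notify(VirtIODevice *vdev, VirtQueue *vq)
{
    /* Argument list matches the virtio_notify prototype in trace-events */
    trace_virtio_notify(vdev, vq);
}
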
diff --git a/hw/virtio/trace.h b/hw/virtio/trace.h
new file mode 100644
index 00000000..5d709706
--- /dev/null
+++ b/hw/virtio/trace.h
@@ -0,0 +1 @@
+#include "trace/trace-hw_virtio.h"
diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c
new file mode 100644
index 00000000..8e581575
--- /dev/null
+++ b/hw/virtio/vhost-backend.c
@@ -0,0 +1,410 @@
+/*
+ * vhost-backend
+ *
+ * Copyright (c) 2013 Virtual Open Systems Sarl.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "hw/virtio/vhost.h"
+#include "hw/virtio/vhost-backend.h"
+#include "qemu/error-report.h"
+#include "qemu/main-loop.h"
+#include "standard-headers/linux/vhost_types.h"
+
+#include "hw/virtio/vhost-vdpa.h"
+#ifdef CONFIG_VHOST_KERNEL
+#include <linux/vhost.h>
+#include <sys/ioctl.h>
+
+static int vhost_kernel_call(struct vhost_dev *dev, unsigned long int request,
+ void *arg)
+{
+ int fd = (uintptr_t) dev->opaque;
+ int ret;
+
+ assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_KERNEL);
+
+ ret = ioctl(fd, request, arg);
+ return ret < 0 ? -errno : ret;
+}
+
+static int vhost_kernel_init(struct vhost_dev *dev, void *opaque, Error **errp)
+{
+ assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_KERNEL);
+
+ dev->opaque = opaque;
+
+ return 0;
+}
+
+static int vhost_kernel_cleanup(struct vhost_dev *dev)
+{
+ int fd = (uintptr_t) dev->opaque;
+
+ assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_KERNEL);
+
+ return close(fd) < 0 ? -errno : 0;
+}
+
+static int vhost_kernel_memslots_limit(struct vhost_dev *dev)
+{
+ int limit = 64;
+ char *s;
+
+ if (g_file_get_contents("/sys/module/vhost/parameters/max_mem_regions",
+ &s, NULL, NULL)) {
+ uint64_t val = g_ascii_strtoull(s, NULL, 10);
+ if (val < INT_MAX && val > 0) {
+ g_free(s);
+ return val;
+ }
+ error_report("ignoring invalid max_mem_regions value in vhost module:"
+ " %s", s);
+ }
+ g_free(s);
+ return limit;
+}
+
+static int vhost_kernel_net_set_backend(struct vhost_dev *dev,
+ struct vhost_vring_file *file)
+{
+ return vhost_kernel_call(dev, VHOST_NET_SET_BACKEND, file);
+}
+
+static int vhost_kernel_scsi_set_endpoint(struct vhost_dev *dev,
+ struct vhost_scsi_target *target)
+{
+ return vhost_kernel_call(dev, VHOST_SCSI_SET_ENDPOINT, target);
+}
+
+static int vhost_kernel_scsi_clear_endpoint(struct vhost_dev *dev,
+ struct vhost_scsi_target *target)
+{
+ return vhost_kernel_call(dev, VHOST_SCSI_CLEAR_ENDPOINT, target);
+}
+
+static int vhost_kernel_scsi_get_abi_version(struct vhost_dev *dev, int *version)
+{
+ return vhost_kernel_call(dev, VHOST_SCSI_GET_ABI_VERSION, version);
+}
+
+static int vhost_kernel_set_log_base(struct vhost_dev *dev, uint64_t base,
+ struct vhost_log *log)
+{
+ return vhost_kernel_call(dev, VHOST_SET_LOG_BASE, &base);
+}
+
+static int vhost_kernel_set_mem_table(struct vhost_dev *dev,
+ struct vhost_memory *mem)
+{
+ return vhost_kernel_call(dev, VHOST_SET_MEM_TABLE, mem);
+}
+
+static int vhost_kernel_set_vring_addr(struct vhost_dev *dev,
+ struct vhost_vring_addr *addr)
+{
+ return vhost_kernel_call(dev, VHOST_SET_VRING_ADDR, addr);
+}
+
+static int vhost_kernel_set_vring_endian(struct vhost_dev *dev,
+ struct vhost_vring_state *ring)
+{
+ return vhost_kernel_call(dev, VHOST_SET_VRING_ENDIAN, ring);
+}
+
+static int vhost_kernel_set_vring_num(struct vhost_dev *dev,
+ struct vhost_vring_state *ring)
+{
+ return vhost_kernel_call(dev, VHOST_SET_VRING_NUM, ring);
+}
+
+static int vhost_kernel_set_vring_base(struct vhost_dev *dev,
+ struct vhost_vring_state *ring)
+{
+ return vhost_kernel_call(dev, VHOST_SET_VRING_BASE, ring);
+}
+
+static int vhost_kernel_get_vring_base(struct vhost_dev *dev,
+ struct vhost_vring_state *ring)
+{
+ return vhost_kernel_call(dev, VHOST_GET_VRING_BASE, ring);
+}
+
+static int vhost_kernel_set_vring_kick(struct vhost_dev *dev,
+ struct vhost_vring_file *file)
+{
+ return vhost_kernel_call(dev, VHOST_SET_VRING_KICK, file);
+}
+
+static int vhost_kernel_set_vring_call(struct vhost_dev *dev,
+ struct vhost_vring_file *file)
+{
+ return vhost_kernel_call(dev, VHOST_SET_VRING_CALL, file);
+}
+
+static int vhost_kernel_set_vring_err(struct vhost_dev *dev,
+ struct vhost_vring_file *file)
+{
+ return vhost_kernel_call(dev, VHOST_SET_VRING_ERR, file);
+}
+
+static int vhost_kernel_set_vring_busyloop_timeout(struct vhost_dev *dev,
+ struct vhost_vring_state *s)
+{
+ return vhost_kernel_call(dev, VHOST_SET_VRING_BUSYLOOP_TIMEOUT, s);
+}
+
+static int vhost_kernel_set_features(struct vhost_dev *dev,
+ uint64_t features)
+{
+ return vhost_kernel_call(dev, VHOST_SET_FEATURES, &features);
+}
+
+static int vhost_kernel_set_backend_cap(struct vhost_dev *dev)
+{
+ uint64_t features;
+ uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2;
+ int r;
+
+ if (vhost_kernel_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
+ return 0;
+ }
+
+ features &= f;
+ r = vhost_kernel_call(dev, VHOST_SET_BACKEND_FEATURES,
+ &features);
+ if (r) {
+ return 0;
+ }
+
+ dev->backend_cap = features;
+
+ return 0;
+}
+
+static int vhost_kernel_get_features(struct vhost_dev *dev,
+ uint64_t *features)
+{
+ return vhost_kernel_call(dev, VHOST_GET_FEATURES, features);
+}
+
+static int vhost_kernel_set_owner(struct vhost_dev *dev)
+{
+ return vhost_kernel_call(dev, VHOST_SET_OWNER, NULL);
+}
+
+static int vhost_kernel_reset_device(struct vhost_dev *dev)
+{
+ return vhost_kernel_call(dev, VHOST_RESET_OWNER, NULL);
+}
+
+static int vhost_kernel_get_vq_index(struct vhost_dev *dev, int idx)
+{
+ assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
+
+ return idx - dev->vq_index;
+}
+
+static int vhost_kernel_vsock_set_guest_cid(struct vhost_dev *dev,
+ uint64_t guest_cid)
+{
+ return vhost_kernel_call(dev, VHOST_VSOCK_SET_GUEST_CID, &guest_cid);
+}
+
+static int vhost_kernel_vsock_set_running(struct vhost_dev *dev, int start)
+{
+ return vhost_kernel_call(dev, VHOST_VSOCK_SET_RUNNING, &start);
+}
+
+static void vhost_kernel_iotlb_read(void *opaque)
+{
+ struct vhost_dev *dev = opaque;
+ ssize_t len;
+
+ if (dev->backend_cap &
+ (0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2)) {
+ struct vhost_msg_v2 msg;
+
+ while ((len = read((uintptr_t)dev->opaque, &msg, sizeof msg)) > 0) {
+ if (len < sizeof msg) {
+ error_report("Wrong vhost message len: %d", (int)len);
+ break;
+ }
+ if (msg.type != VHOST_IOTLB_MSG_V2) {
+ error_report("Unknown vhost iotlb message type");
+ break;
+ }
+
+ vhost_backend_handle_iotlb_msg(dev, &msg.iotlb);
+ }
+ } else {
+ struct vhost_msg msg;
+
+ while ((len = read((uintptr_t)dev->opaque, &msg, sizeof msg)) > 0) {
+ if (len < sizeof msg) {
+ error_report("Wrong vhost message len: %d", (int)len);
+ break;
+ }
+ if (msg.type != VHOST_IOTLB_MSG) {
+ error_report("Unknown vhost iotlb message type");
+ break;
+ }
+
+ vhost_backend_handle_iotlb_msg(dev, &msg.iotlb);
+ }
+ }
+}
+
+static int vhost_kernel_send_device_iotlb_msg(struct vhost_dev *dev,
+ struct vhost_iotlb_msg *imsg)
+{
+ if (dev->backend_cap & (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2)) {
+ struct vhost_msg_v2 msg = {};
+
+ msg.type = VHOST_IOTLB_MSG_V2;
+ msg.iotlb = *imsg;
+
+ if (write((uintptr_t)dev->opaque, &msg, sizeof msg) != sizeof msg) {
+ error_report("Fail to update device iotlb");
+ return -EFAULT;
+ }
+ } else {
+ struct vhost_msg msg = {};
+
+ msg.type = VHOST_IOTLB_MSG;
+ msg.iotlb = *imsg;
+
+ if (write((uintptr_t)dev->opaque, &msg, sizeof msg) != sizeof msg) {
+ error_report("Fail to update device iotlb");
+ return -EFAULT;
+ }
+ }
+
+ return 0;
+}
+
+static void vhost_kernel_set_iotlb_callback(struct vhost_dev *dev,
+ int enabled)
+{
+ if (enabled)
+ qemu_set_fd_handler((uintptr_t)dev->opaque,
+ vhost_kernel_iotlb_read, NULL, dev);
+ else
+ qemu_set_fd_handler((uintptr_t)dev->opaque, NULL, NULL, NULL);
+}
+
+const VhostOps kernel_ops = {
+ .backend_type = VHOST_BACKEND_TYPE_KERNEL,
+ .vhost_backend_init = vhost_kernel_init,
+ .vhost_backend_cleanup = vhost_kernel_cleanup,
+ .vhost_backend_memslots_limit = vhost_kernel_memslots_limit,
+ .vhost_net_set_backend = vhost_kernel_net_set_backend,
+ .vhost_scsi_set_endpoint = vhost_kernel_scsi_set_endpoint,
+ .vhost_scsi_clear_endpoint = vhost_kernel_scsi_clear_endpoint,
+ .vhost_scsi_get_abi_version = vhost_kernel_scsi_get_abi_version,
+ .vhost_set_log_base = vhost_kernel_set_log_base,
+ .vhost_set_mem_table = vhost_kernel_set_mem_table,
+ .vhost_set_vring_addr = vhost_kernel_set_vring_addr,
+ .vhost_set_vring_endian = vhost_kernel_set_vring_endian,
+ .vhost_set_vring_num = vhost_kernel_set_vring_num,
+ .vhost_set_vring_base = vhost_kernel_set_vring_base,
+ .vhost_get_vring_base = vhost_kernel_get_vring_base,
+ .vhost_set_vring_kick = vhost_kernel_set_vring_kick,
+ .vhost_set_vring_call = vhost_kernel_set_vring_call,
+ .vhost_set_vring_err = vhost_kernel_set_vring_err,
+ .vhost_set_vring_busyloop_timeout =
+ vhost_kernel_set_vring_busyloop_timeout,
+ .vhost_set_features = vhost_kernel_set_features,
+ .vhost_get_features = vhost_kernel_get_features,
+ .vhost_set_backend_cap = vhost_kernel_set_backend_cap,
+ .vhost_set_owner = vhost_kernel_set_owner,
+ .vhost_reset_device = vhost_kernel_reset_device,
+ .vhost_get_vq_index = vhost_kernel_get_vq_index,
+ .vhost_vsock_set_guest_cid = vhost_kernel_vsock_set_guest_cid,
+ .vhost_vsock_set_running = vhost_kernel_vsock_set_running,
+ .vhost_set_iotlb_callback = vhost_kernel_set_iotlb_callback,
+ .vhost_send_device_iotlb_msg = vhost_kernel_send_device_iotlb_msg,
+};
+#endif
+
+int vhost_backend_update_device_iotlb(struct vhost_dev *dev,
+ uint64_t iova, uint64_t uaddr,
+ uint64_t len,
+ IOMMUAccessFlags perm)
+{
+ struct vhost_iotlb_msg imsg;
+
+ imsg.iova = iova;
+ imsg.uaddr = uaddr;
+ imsg.size = len;
+ imsg.type = VHOST_IOTLB_UPDATE;
+
+ switch (perm) {
+ case IOMMU_RO:
+ imsg.perm = VHOST_ACCESS_RO;
+ break;
+ case IOMMU_WO:
+ imsg.perm = VHOST_ACCESS_WO;
+ break;
+ case IOMMU_RW:
+ imsg.perm = VHOST_ACCESS_RW;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (dev->vhost_ops && dev->vhost_ops->vhost_send_device_iotlb_msg)
+ return dev->vhost_ops->vhost_send_device_iotlb_msg(dev, &imsg);
+
+ return -ENODEV;
+}
+
+int vhost_backend_invalidate_device_iotlb(struct vhost_dev *dev,
+ uint64_t iova, uint64_t len)
+{
+ struct vhost_iotlb_msg imsg;
+
+ imsg.iova = iova;
+ imsg.size = len;
+ imsg.type = VHOST_IOTLB_INVALIDATE;
+
+ if (dev->vhost_ops && dev->vhost_ops->vhost_send_device_iotlb_msg)
+ return dev->vhost_ops->vhost_send_device_iotlb_msg(dev, &imsg);
+
+ return -ENODEV;
+}
+
+int vhost_backend_handle_iotlb_msg(struct vhost_dev *dev,
+ struct vhost_iotlb_msg *imsg)
+{
+ int ret = 0;
+
+ if (unlikely(!dev->vdev)) {
+ error_report("Unexpected IOTLB message when virtio device is stopped");
+ return -EINVAL;
+ }
+
+ switch (imsg->type) {
+ case VHOST_IOTLB_MISS:
+ ret = vhost_device_iotlb_miss(dev, imsg->iova,
+ imsg->perm != VHOST_ACCESS_RO);
+ break;
+ case VHOST_IOTLB_ACCESS_FAIL:
+ /* FIXME: report device iotlb error */
+ error_report("Access failure IOTLB message type not supported");
+ ret = -ENOTSUP;
+ break;
+ case VHOST_IOTLB_UPDATE:
+ case VHOST_IOTLB_INVALIDATE:
+ default:
+ error_report("Unexpected IOTLB message type");
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
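
The two exported helpers above hide the construction of struct vhost_iotlb_msg from callers. A hedged sketch of how an IOMMU notifier might combine them when a translation changes; the function name and surrounding plumbing are hypothetical, and only the two vhost_backend_* calls and IOMMU_RW come from this file:

static void example_iommu_remap(struct vhost_dev *dev, uint64_t iova,
                                uint64_t uaddr, uint64_t len)
{
    /* Drop any stale translation for the range first */
    int r = vhost_backend_invalidate_device_iotlb(dev, iova, len);
    if (r < 0) {
        error_report("iotlb invalidate failed: %d", r);
        return;
    }

    /* Install the new read/write translation */
    r = vhost_backend_update_device_iotlb(dev, iova, uaddr, len, IOMMU_RW);
    if (r < 0) {
        error_report("iotlb update failed: %d", r);
    }
}
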
diff --git a/hw/virtio/vhost-iova-tree.c b/hw/virtio/vhost-iova-tree.c
new file mode 100644
index 00000000..3d03395a
--- /dev/null
+++ b/hw/virtio/vhost-iova-tree.c
@@ -0,0 +1,110 @@
+/*
+ * vhost software live migration iova tree
+ *
+ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
+ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/iova-tree.h"
+#include "vhost-iova-tree.h"
+
+#define iova_min_addr qemu_real_host_page_size()
+
+/**
+ * VhostIOVATree, able to:
+ * - Translate iova address
+ * - Reverse translate iova address (from translated to iova)
+ * - Allocate IOVA regions for translated range (linear operation)
+ */
+struct VhostIOVATree {
+ /* First addressable iova address in the device */
+ uint64_t iova_first;
+
+ /* Last addressable iova address in the device */
+ uint64_t iova_last;
+
+ /* IOVA address to qemu memory maps. */
+ IOVATree *iova_taddr_map;
+};
+
+/**
+ * Create a new IOVA tree
+ *
+ * Returns the new IOVA tree
+ */
+VhostIOVATree *vhost_iova_tree_new(hwaddr iova_first, hwaddr iova_last)
+{
+ VhostIOVATree *tree = g_new(VhostIOVATree, 1);
+
+ /* Some devices do not like 0 addresses */
+ tree->iova_first = MAX(iova_first, iova_min_addr);
+ tree->iova_last = iova_last;
+
+ tree->iova_taddr_map = iova_tree_new();
+ return tree;
+}
+
+/**
+ * Delete an iova tree
+ */
+void vhost_iova_tree_delete(VhostIOVATree *iova_tree)
+{
+ iova_tree_destroy(iova_tree->iova_taddr_map);
+ g_free(iova_tree);
+}
+
+/**
+ * Find the IOVA mapping stored for a given memory address
+ *
+ * @tree: The iova tree
+ * @map: The map with the memory address
+ *
+ * Return the stored mapping, or NULL if not found.
+ */
+const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *tree,
+ const DMAMap *map)
+{
+ return iova_tree_find_iova(tree->iova_taddr_map, map);
+}
+
+/**
+ * Allocate a new mapping
+ *
+ * @tree: The iova tree
+ * @map: The iova map
+ *
+ * Returns:
+ * - IOVA_OK if the map fits in the container
+ * - IOVA_ERR_INVALID if the map does not make sense (like size overflow)
+ * - IOVA_ERR_NOMEM if tree cannot allocate more space.
+ *
+ * It returns the assigned iova in map->iova if the return value is IOVA_OK.
+ */
+int vhost_iova_tree_map_alloc(VhostIOVATree *tree, DMAMap *map)
+{
+ /* Some vhost devices do not like addr 0. Skip first page */
+ hwaddr iova_first = tree->iova_first ?: qemu_real_host_page_size();
+
+ if (map->translated_addr + map->size < map->translated_addr ||
+ map->perm == IOMMU_NONE) {
+ return IOVA_ERR_INVALID;
+ }
+
+ /* Allocate a node in IOVA address */
+ return iova_tree_alloc_map(tree->iova_taddr_map, map, iova_first,
+ tree->iova_last);
+}
+
+/**
+ * Remove existing mappings from iova tree
+ *
+ * @iova_tree: The vhost iova tree
+ * @map: The map to remove
+ */
+void vhost_iova_tree_remove(VhostIOVATree *iova_tree, DMAMap map)
+{
+ iova_tree_remove(iova_tree->iova_taddr_map, map);
+}
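
Together the functions above form an allocate/translate/remove round trip. A hedged usage sketch with illustrative values (the 1 << 36 IOVA limit and the g_assert are assumptions, not taken from this file; DMAMap field semantics follow qemu/iova-tree.h):

static void example_iova_round_trip(void *host_buf, size_t size)
{
    VhostIOVATree *tree = vhost_iova_tree_new(0, (1ULL << 36) - 1);
    DMAMap map = {
        .translated_addr = (hwaddr)(uintptr_t)host_buf,
        .size = size,
        .perm = IOMMU_RW,
    };

    if (vhost_iova_tree_map_alloc(tree, &map) == IOVA_OK) {
        /* map.iova now holds the device-visible address for host_buf */
        const DMAMap *found = vhost_iova_tree_find_iova(tree, &map);
        g_assert(found && found->iova == map.iova);
        vhost_iova_tree_remove(tree, map);
    }
    vhost_iova_tree_delete(tree);
}
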
diff --git a/hw/virtio/vhost-iova-tree.h b/hw/virtio/vhost-iova-tree.h
new file mode 100644
index 00000000..4adfd79f
--- /dev/null
+++ b/hw/virtio/vhost-iova-tree.h
@@ -0,0 +1,27 @@
+/*
+ * vhost software live migration iova tree
+ *
+ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
+ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HW_VIRTIO_VHOST_IOVA_TREE_H
+#define HW_VIRTIO_VHOST_IOVA_TREE_H
+
+#include "qemu/iova-tree.h"
+#include "exec/memory.h"
+
+typedef struct VhostIOVATree VhostIOVATree;
+
+VhostIOVATree *vhost_iova_tree_new(uint64_t iova_first, uint64_t iova_last);
+void vhost_iova_tree_delete(VhostIOVATree *iova_tree);
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostIOVATree, vhost_iova_tree_delete);
+
+const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *iova_tree,
+ const DMAMap *map);
+int vhost_iova_tree_map_alloc(VhostIOVATree *iova_tree, DMAMap *map);
+void vhost_iova_tree_remove(VhostIOVATree *iova_tree, DMAMap map);
+
+#endif
diff --git a/hw/virtio/vhost-scsi-pci.c b/hw/virtio/vhost-scsi-pci.c
new file mode 100644
index 00000000..08980bc2
--- /dev/null
+++ b/hw/virtio/vhost-scsi-pci.c
@@ -0,0 +1,104 @@
+/*
+ * Vhost scsi PCI bindings
+ *
+ * Copyright IBM, Corp. 2011
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ *
+ * Changes for QEMU mainline + tcm_vhost kernel upstream:
+ * Nicholas Bellinger <nab@risingtidesystems.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+
+#include "standard-headers/linux/virtio_pci.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/vhost-scsi.h"
+#include "qapi/error.h"
+#include "qemu/module.h"
+#include "hw/virtio/virtio-pci.h"
+#include "qom/object.h"
+
+typedef struct VHostSCSIPCI VHostSCSIPCI;
+
+/*
+ * vhost-scsi-pci: This extends VirtioPCIProxy.
+ */
+#define TYPE_VHOST_SCSI_PCI "vhost-scsi-pci-base"
+DECLARE_INSTANCE_CHECKER(VHostSCSIPCI, VHOST_SCSI_PCI,
+ TYPE_VHOST_SCSI_PCI)
+
+struct VHostSCSIPCI {
+ VirtIOPCIProxy parent_obj;
+ VHostSCSI vdev;
+};
+
+static Property vhost_scsi_pci_properties[] = {
+ DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors,
+ DEV_NVECTORS_UNSPECIFIED),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vhost_scsi_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+ VHostSCSIPCI *dev = VHOST_SCSI_PCI(vpci_dev);
+ DeviceState *vdev = DEVICE(&dev->vdev);
+ VirtIOSCSIConf *conf = &dev->vdev.parent_obj.parent_obj.conf;
+
+ if (conf->num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) {
+ conf->num_queues =
+ virtio_pci_optimal_num_queues(VIRTIO_SCSI_VQ_NUM_FIXED);
+ }
+
+ if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) {
+ vpci_dev->nvectors = conf->num_queues + VIRTIO_SCSI_VQ_NUM_FIXED + 1;
+ }
+
+ qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
+}
+
+static void vhost_scsi_pci_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+ PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+ k->realize = vhost_scsi_pci_realize;
+ set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+ device_class_set_props(dc, vhost_scsi_pci_properties);
+ pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+ pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_SCSI;
+ pcidev_k->revision = 0x00;
+ pcidev_k->class_id = PCI_CLASS_STORAGE_SCSI;
+}
+
+static void vhost_scsi_pci_instance_init(Object *obj)
+{
+ VHostSCSIPCI *dev = VHOST_SCSI_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VHOST_SCSI);
+ object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev),
+ "bootindex");
+}
+
+static const VirtioPCIDeviceTypeInfo vhost_scsi_pci_info = {
+ .base_name = TYPE_VHOST_SCSI_PCI,
+ .generic_name = "vhost-scsi-pci",
+ .transitional_name = "vhost-scsi-pci-transitional",
+ .non_transitional_name = "vhost-scsi-pci-non-transitional",
+ .instance_size = sizeof(VHostSCSIPCI),
+ .instance_init = vhost_scsi_pci_instance_init,
+ .class_init = vhost_scsi_pci_class_init,
+};
+
+static void vhost_scsi_pci_register(void)
+{
+ virtio_pci_types_register(&vhost_scsi_pci_info);
+}
+
+type_init(vhost_scsi_pci_register)
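
The realize() hook above derives the MSI-X vector count from the queue count when the user leaves the "vectors" property unspecified. A worked example of that arithmetic, assuming VIRTIO_SCSI_VQ_NUM_FIXED is 2 (the fixed control and event virtqueues):

/*
 * With num_queues = 4 request queues:
 *
 *   nvectors = num_queues + VIRTIO_SCSI_VQ_NUM_FIXED + 1
 *            = 4 + 2 + 1 = 7
 *
 * i.e. one vector per virtqueue plus one for config-change interrupts.
 */
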
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
new file mode 100644
index 00000000..5bd14cad
--- /dev/null
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -0,0 +1,769 @@
+/*
+ * vhost shadow virtqueue
+ *
+ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
+ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "hw/virtio/vhost-shadow-virtqueue.h"
+
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "qemu/main-loop.h"
+#include "qemu/log.h"
+#include "qemu/memalign.h"
+#include "linux-headers/linux/vhost.h"
+
+/**
+ * Validate the transport device features that both the guest can use with the
+ * SVQ and the SVQ can use with the device.
+ *
+ * @features: The features
+ * @errp: Error pointer
+ */
+bool vhost_svq_valid_features(uint64_t features, Error **errp)
+{
+ bool ok = true;
+ uint64_t svq_features = features;
+
+ for (uint64_t b = VIRTIO_TRANSPORT_F_START; b <= VIRTIO_TRANSPORT_F_END;
+ ++b) {
+ switch (b) {
+ case VIRTIO_F_ANY_LAYOUT:
+ case VIRTIO_RING_F_EVENT_IDX:
+ continue;
+
+ case VIRTIO_F_ACCESS_PLATFORM:
+ /* SVQ trusts the host's IOMMU to translate addresses */
+ case VIRTIO_F_VERSION_1:
+ /* SVQ trusts that the guest vring is little endian */
+ if (!(svq_features & BIT_ULL(b))) {
+ svq_features |= BIT_ULL(b);
+ ok = false;
+ }
+ continue;
+
+ default:
+ if (svq_features & BIT_ULL(b)) {
+ svq_features &= ~BIT_ULL(b);
+ ok = false;
+ }
+ }
+ }
+
+ if (!ok) {
+ error_setg(errp, "SVQ Invalid device feature flags, offer: 0x%"PRIx64
+ ", ok: 0x%"PRIx64, features, svq_features);
+ }
+ return ok;
+}
+
+/**
+ * Number of descriptors that the SVQ can make available from the guest.
+ *
+ * @svq: The svq
+ */
+static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
+{
+ return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx);
+}
+
+/**
+ * Translate addresses between QEMU's virtual address space and SVQ IOVA space
+ *
+ * @svq: Shadow VirtQueue
+ * @addrs: Destination array of translated IOVA addresses
+ * @iovec: Source array of QEMU virtual addresses
+ * @num: Length of iovec and minimum length of addrs
+ */
+static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
+ hwaddr *addrs, const struct iovec *iovec,
+ size_t num)
+{
+ if (num == 0) {
+ return true;
+ }
+
+ for (size_t i = 0; i < num; ++i) {
+ DMAMap needle = {
+ .translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base,
+ .size = iovec[i].iov_len,
+ };
+ Int128 needle_last, map_last;
+ size_t off;
+
+ const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
+ /*
+ * Map cannot be NULL since iova map contains all guest space and
+ * qemu already has a physical address mapped
+ */
+ if (unlikely(!map)) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "Invalid address 0x%"HWADDR_PRIx" given by guest",
+ needle.translated_addr);
+ return false;
+ }
+
+ off = needle.translated_addr - map->translated_addr;
+ addrs[i] = map->iova + off;
+
+ needle_last = int128_add(int128_make64(needle.translated_addr),
+ int128_make64(iovec[i].iov_len));
+ map_last = int128_make64(map->translated_addr + map->size);
+ if (unlikely(int128_gt(needle_last, map_last))) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "Guest buffer expands over iova range");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/**
+ * Write descriptors to SVQ vring
+ *
+ * @svq: The shadow virtqueue
+ * @sg: Cache for hwaddr
+ * @iovec: The iovec from the guest
+ * @num: iovec length
+ * @more_descs: True if more descriptors come in the chain
+ * @write: True if they are writable descriptors
+ *
+ * Return true on success, false otherwise (logging the error).
+ */
+static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
+ const struct iovec *iovec, size_t num,
+ bool more_descs, bool write)
+{
+ uint16_t i = svq->free_head, last = svq->free_head;
+ unsigned n;
+ uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
+ vring_desc_t *descs = svq->vring.desc;
+ bool ok;
+
+ if (num == 0) {
+ return true;
+ }
+
+ ok = vhost_svq_translate_addr(svq, sg, iovec, num);
+ if (unlikely(!ok)) {
+ return false;
+ }
+
+ for (n = 0; n < num; n++) {
+ if (more_descs || (n + 1 < num)) {
+ descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
+ descs[i].next = cpu_to_le16(svq->desc_next[i]);
+ } else {
+ descs[i].flags = flags;
+ }
+ descs[i].addr = cpu_to_le64(sg[n]);
+ descs[i].len = cpu_to_le32(iovec[n].iov_len);
+
+ last = i;
+ i = cpu_to_le16(svq->desc_next[i]);
+ }
+
+ svq->free_head = le16_to_cpu(svq->desc_next[last]);
+ return true;
+}
+
+static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
+ const struct iovec *out_sg, size_t out_num,
+ const struct iovec *in_sg, size_t in_num,
+ unsigned *head)
+{
+ unsigned avail_idx;
+ vring_avail_t *avail = svq->vring.avail;
+ bool ok;
+ g_autofree hwaddr *sgs = g_new(hwaddr, MAX(out_num, in_num));
+
+ *head = svq->free_head;
+
+ /* We need some descriptors here */
+ if (unlikely(!out_num && !in_num)) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "Guest provided element with no descriptors");
+ return false;
+ }
+
+ ok = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, in_num > 0,
+ false);
+ if (unlikely(!ok)) {
+ return false;
+ }
+
+ ok = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, false, true);
+ if (unlikely(!ok)) {
+ return false;
+ }
+
+ /*
+ * Put the entry in the available array (but don't update avail->idx
+ * until the write barrier below).
+ */
+ avail_idx = svq->shadow_avail_idx & (svq->vring.num - 1);
+ avail->ring[avail_idx] = cpu_to_le16(*head);
+ svq->shadow_avail_idx++;
+
+ /* Update the avail index after writing the descriptors */
+ smp_wmb();
+ avail->idx = cpu_to_le16(svq->shadow_avail_idx);
+
+ return true;
+}
+
+static void vhost_svq_kick(VhostShadowVirtqueue *svq)
+{
+ bool needs_kick;
+
+ /*
+ * We need to expose the available array entries before checking the used
+ * flags
+ */
+ smp_mb();
+
+ if (virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_EVENT_IDX)) {
+ uint16_t avail_event = *(uint16_t *)(&svq->vring.used->ring[svq->vring.num]);
+ needs_kick = vring_need_event(avail_event, svq->shadow_avail_idx, svq->shadow_avail_idx - 1);
+ } else {
+ needs_kick = !(svq->vring.used->flags & VRING_USED_F_NO_NOTIFY);
+ }
+
+ if (!needs_kick) {
+ return;
+ }
+
+ event_notifier_set(&svq->hdev_kick);
+}
+
+/**
+ * Add an element to an SVQ.
+ *
+ * Return -EINVAL if the element is invalid, -ENOSPC if the device queue is full
+ */
+int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
+ size_t out_num, const struct iovec *in_sg, size_t in_num,
+ VirtQueueElement *elem)
+{
+ unsigned qemu_head;
+ unsigned ndescs = in_num + out_num;
+ bool ok;
+
+ if (unlikely(ndescs > vhost_svq_available_slots(svq))) {
+ return -ENOSPC;
+ }
+
+ ok = vhost_svq_add_split(svq, out_sg, out_num, in_sg, in_num, &qemu_head);
+ if (unlikely(!ok)) {
+ return -EINVAL;
+ }
+
+ svq->desc_state[qemu_head].elem = elem;
+ svq->desc_state[qemu_head].ndescs = ndescs;
+ vhost_svq_kick(svq);
+ return 0;
+}
+
+/* Convenience wrapper to add a guest's element to SVQ */
+static int vhost_svq_add_element(VhostShadowVirtqueue *svq,
+ VirtQueueElement *elem)
+{
+ return vhost_svq_add(svq, elem->out_sg, elem->out_num, elem->in_sg,
+ elem->in_num, elem);
+}
+
+/**
+ * Forward available buffers.
+ *
+ * @svq: Shadow VirtQueue
+ *
+ * Note that this function does not guarantee that all of the guest's
+ * available buffers are made available to the device in the SVQ avail ring.
+ * The guest may have exposed a GPA/GIOVA-contiguous buffer that is not
+ * contiguous in QEMU's virtual address space.
+ *
+ * If that happens, guest's kick notifications will be disabled until the
+ * device uses some buffers.
+ */
+static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
+{
+ /* Clear event notifier */
+ event_notifier_test_and_clear(&svq->svq_kick);
+
+ /* Forward to the device as many available buffers as possible */
+ do {
+ virtio_queue_set_notification(svq->vq, false);
+
+ while (true) {
+ g_autofree VirtQueueElement *elem = NULL;
+ int r;
+
+ if (svq->next_guest_avail_elem) {
+ elem = g_steal_pointer(&svq->next_guest_avail_elem);
+ } else {
+ elem = virtqueue_pop(svq->vq, sizeof(*elem));
+ }
+
+ if (!elem) {
+ break;
+ }
+
+ if (svq->ops) {
+ r = svq->ops->avail_handler(svq, elem, svq->ops_opaque);
+ } else {
+ r = vhost_svq_add_element(svq, elem);
+ }
+ if (unlikely(r != 0)) {
+ if (r == -ENOSPC) {
+ /*
+ * This condition is possible since a contiguous buffer in
+ * GPA does not imply a contiguous buffer in qemu's VA
+ * scatter-gather segments. If that happens, the buffer
+ * exposed to the device needs to be a chain of descriptors
+ * at this moment.
+ *
+ * SVQ cannot hold more available buffers if we are here:
+ * queue the current guest descriptor and ignore kicks
+ * until some elements are used.
+ */
+ svq->next_guest_avail_elem = g_steal_pointer(&elem);
+ }
+
+ /* VQ is full or broken, just return and ignore kicks */
+ return;
+ }
+ /* elem belongs to SVQ or external caller now */
+ elem = NULL;
+ }
+
+ virtio_queue_set_notification(svq->vq, true);
+ } while (!virtio_queue_empty(svq->vq));
+}
+
+/**
+ * Handle guest's kick.
+ *
+ * @n: guest kick event notifier, the one that guest set to notify svq.
+ */
+static void vhost_handle_guest_kick_notifier(EventNotifier *n)
+{
+ VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, svq_kick);
+ event_notifier_test_and_clear(n);
+ vhost_handle_guest_kick(svq);
+}
+
+static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
+{
+ uint16_t *used_idx = &svq->vring.used->idx;
+ if (svq->last_used_idx != svq->shadow_used_idx) {
+ return true;
+ }
+
+ svq->shadow_used_idx = cpu_to_le16(*(volatile uint16_t *)used_idx);
+
+ return svq->last_used_idx != svq->shadow_used_idx;
+}
+
+/**
+ * Enable vhost device calls after having disabled them.
+ *
+ * @svq: The svq
+ *
+ * It returns false if there are pending used buffers from the vhost device,
+ * avoiding the possible races between SVQ checking for more work and enabling
+ * callbacks. True if SVQ used vring has no more pending buffers.
+ */
+static bool vhost_svq_enable_notification(VhostShadowVirtqueue *svq)
+{
+ if (virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_EVENT_IDX)) {
+ uint16_t *used_event = (uint16_t *)&svq->vring.avail->ring[svq->vring.num];
+ *used_event = svq->shadow_used_idx;
+ } else {
+ svq->vring.avail->flags &= ~cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
+ }
+
+ /* Make sure the event is enabled before the read of used_idx */
+ smp_mb();
+ return !vhost_svq_more_used(svq);
+}
+
+static void vhost_svq_disable_notification(VhostShadowVirtqueue *svq)
+{
+ /*
+ * No need to disable notification in the event idx case, since the used
+ * event index is already an index too far away to trigger a notification.
+ */
+ if (!virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_EVENT_IDX)) {
+ svq->vring.avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
+ }
+}
+
+static uint16_t vhost_svq_last_desc_of_chain(const VhostShadowVirtqueue *svq,
+ uint16_t num, uint16_t i)
+{
+ for (uint16_t j = 0; j < (num - 1); ++j) {
+ i = le16_to_cpu(svq->desc_next[i]);
+ }
+
+ return i;
+}
+
+static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
+ uint32_t *len)
+{
+ const vring_used_t *used = svq->vring.used;
+ vring_used_elem_t used_elem;
+ uint16_t last_used, last_used_chain, num;
+
+ if (!vhost_svq_more_used(svq)) {
+ return NULL;
+ }
+
+ /* Only get used array entries after they have been exposed by dev */
+ smp_rmb();
+ last_used = svq->last_used_idx & (svq->vring.num - 1);
+ used_elem.id = le32_to_cpu(used->ring[last_used].id);
+ used_elem.len = le32_to_cpu(used->ring[last_used].len);
+
+ svq->last_used_idx++;
+ if (unlikely(used_elem.id >= svq->vring.num)) {
+ qemu_log_mask(LOG_GUEST_ERROR, "Device %s says index %u is used",
+ svq->vdev->name, used_elem.id);
+ return NULL;
+ }
+
+ if (unlikely(!svq->desc_state[used_elem.id].ndescs)) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "Device %s says index %u is used, but it was not available",
+ svq->vdev->name, used_elem.id);
+ return NULL;
+ }
+
+ num = svq->desc_state[used_elem.id].ndescs;
+ svq->desc_state[used_elem.id].ndescs = 0;
+ last_used_chain = vhost_svq_last_desc_of_chain(svq, num, used_elem.id);
+ svq->desc_next[last_used_chain] = svq->free_head;
+ svq->free_head = used_elem.id;
+
+ *len = used_elem.len;
+ return g_steal_pointer(&svq->desc_state[used_elem.id].elem);
+}
+
+/**
+ * Push an element to SVQ, returning it to the guest.
+ */
+void vhost_svq_push_elem(VhostShadowVirtqueue *svq,
+ const VirtQueueElement *elem, uint32_t len)
+{
+ virtqueue_push(svq->vq, elem, len);
+ if (svq->next_guest_avail_elem) {
+ /*
+ * Avail ring was full when vhost_svq_flush was called, so it's a
+ * good moment to make more descriptors available if possible.
+ */
+ vhost_handle_guest_kick(svq);
+ }
+}
+
+static void vhost_svq_flush(VhostShadowVirtqueue *svq,
+ bool check_for_avail_queue)
+{
+ VirtQueue *vq = svq->vq;
+
+ /* Forward as many used buffers as possible. */
+ do {
+ unsigned i = 0;
+
+ vhost_svq_disable_notification(svq);
+ while (true) {
+ uint32_t len;
+ g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq, &len);
+ if (!elem) {
+ break;
+ }
+
+ if (unlikely(i >= svq->vring.num)) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "More than %u used buffers obtained in a %u size SVQ",
+ i, svq->vring.num);
+ virtqueue_fill(vq, elem, len, i);
+ virtqueue_flush(vq, i);
+ return;
+ }
+ virtqueue_fill(vq, elem, len, i++);
+ }
+
+ virtqueue_flush(vq, i);
+ event_notifier_set(&svq->svq_call);
+
+ if (check_for_avail_queue && svq->next_guest_avail_elem) {
+ /*
+ * Avail ring was full when vhost_svq_flush was called, so it's a
+ * good moment to make more descriptors available if possible.
+ */
+ vhost_handle_guest_kick(svq);
+ }
+ } while (!vhost_svq_enable_notification(svq));
+}
+
+/**
+ * Poll the SVQ for one device used buffer.
+ *
+ * This function races with the main event loop SVQ polling, so extra
+ * synchronization is needed.
+ *
+ * Return the length written by the device.
+ */
+size_t vhost_svq_poll(VhostShadowVirtqueue *svq)
+{
+ int64_t start_us = g_get_monotonic_time();
+ uint32_t len;
+
+ do {
+ if (vhost_svq_more_used(svq)) {
+ break;
+ }
+
+ if (unlikely(g_get_monotonic_time() - start_us > 10e6)) {
+ return 0;
+ }
+ } while (true);
+
+ vhost_svq_get_buf(svq, &len);
+ return len;
+}
+
+/**
+ * Forward used buffers.
+ *
+ * @n: hdev call event notifier, the one that device set to notify svq.
+ *
+ * Note that we are not making any buffers available in the loop, so there is
+ * no way that it runs more than virtqueue-size times.
+ */
+static void vhost_svq_handle_call(EventNotifier *n)
+{
+ VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
+ hdev_call);
+ event_notifier_test_and_clear(n);
+ vhost_svq_flush(svq, true);
+}
+
+/**
+ * Set the call notifier for the SVQ to call the guest
+ *
+ * @svq: Shadow virtqueue
+ * @call_fd: call notifier
+ *
+ * Called on BQL context.
+ */
+void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
+{
+ if (call_fd == VHOST_FILE_UNBIND) {
+ /*
+ * Make event_notifier_set fail if called while handling a device call.
+ *
+ * SVQ still needs device notifications, since it needs to keep
+ * forwarding used buffers even with the unbind.
+ */
+ memset(&svq->svq_call, 0, sizeof(svq->svq_call));
+ } else {
+ event_notifier_init_fd(&svq->svq_call, call_fd);
+ }
+}
+
+/**
+ * Get the shadow vq vring address.
+ * @svq: Shadow virtqueue
+ * @addr: Destination to store address
+ */
+void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
+ struct vhost_vring_addr *addr)
+{
+ addr->desc_user_addr = (uint64_t)(uintptr_t)svq->vring.desc;
+ addr->avail_user_addr = (uint64_t)(uintptr_t)svq->vring.avail;
+ addr->used_user_addr = (uint64_t)(uintptr_t)svq->vring.used;
+}
+
+size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq)
+{
+ size_t desc_size = sizeof(vring_desc_t) * svq->vring.num;
+ size_t avail_size = offsetof(vring_avail_t, ring[svq->vring.num]) +
+ sizeof(uint16_t);
+
+ return ROUND_UP(desc_size + avail_size, qemu_real_host_page_size());
+}
+
+size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq)
+{
+ size_t used_size = offsetof(vring_used_t, ring[svq->vring.num]) +
+ sizeof(uint16_t);
+ return ROUND_UP(used_size, qemu_real_host_page_size());
+}
+
+/**
+ * Set a new file descriptor for the guest to kick the SVQ and notify for avail
+ *
+ * @svq: The svq
+ * @svq_kick_fd: The svq kick fd
+ *
+ * Note that the SVQ will never close the old file descriptor.
+ */
+void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
+{
+ EventNotifier *svq_kick = &svq->svq_kick;
+ bool poll_stop = VHOST_FILE_UNBIND != event_notifier_get_fd(svq_kick);
+ bool poll_start = svq_kick_fd != VHOST_FILE_UNBIND;
+
+ if (poll_stop) {
+ event_notifier_set_handler(svq_kick, NULL);
+ }
+
+ event_notifier_init_fd(svq_kick, svq_kick_fd);
+ /*
+ * event_notifier_set_handler already checks for guest notifications that
+ * arrive at the new file descriptor during the switch, so there is no
+ * need to check for them explicitly.
+ */
+ if (poll_start) {
+ event_notifier_set(svq_kick);
+ event_notifier_set_handler(svq_kick, vhost_handle_guest_kick_notifier);
+ }
+}
+
+/**
+ * Start the shadow virtqueue operation.
+ *
+ * @svq: Shadow Virtqueue
+ * @vdev: VirtIO device
+ * @vq: Virtqueue to shadow
+ */
+void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
+ VirtQueue *vq)
+{
+ size_t desc_size, driver_size, device_size;
+
+ svq->next_guest_avail_elem = NULL;
+ svq->shadow_avail_idx = 0;
+ svq->shadow_used_idx = 0;
+ svq->last_used_idx = 0;
+ svq->vdev = vdev;
+ svq->vq = vq;
+
+ svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq));
+ driver_size = vhost_svq_driver_area_size(svq);
+ device_size = vhost_svq_device_area_size(svq);
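+ /*
+ * The driver area (descriptor table + avail ring) and the device area
+ * (used ring) are page-aligned, separate allocations, so each can be
+ * mapped to the device with different permissions.
+ */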
+ svq->vring.desc = qemu_memalign(qemu_real_host_page_size(), driver_size);
+ desc_size = sizeof(vring_desc_t) * svq->vring.num;
+ svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
+ memset(svq->vring.desc, 0, driver_size);
+ svq->vring.used = qemu_memalign(qemu_real_host_page_size(), device_size);
+ memset(svq->vring.used, 0, device_size);
+ svq->desc_state = g_new0(SVQDescState, svq->vring.num);
+ svq->desc_next = g_new0(uint16_t, svq->vring.num);
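+ /* Chain all descriptors into a single free list. */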
+ for (unsigned i = 0; i < svq->vring.num - 1; i++) {
+ svq->desc_next[i] = cpu_to_le16(i + 1);
+ }
+}
+
+/**
+ * Stop the shadow virtqueue operation.
+ * @svq: Shadow Virtqueue
+ */
+void vhost_svq_stop(VhostShadowVirtqueue *svq)
+{
+ vhost_svq_set_svq_kick_fd(svq, VHOST_FILE_UNBIND);
+ g_autofree VirtQueueElement *next_avail_elem = NULL;
+
+ if (!svq->vq) {
+ return;
+ }
+
+ /* Send all pending used descriptors to guest */
+ vhost_svq_flush(svq, false);
+
+ for (unsigned i = 0; i < svq->vring.num; ++i) {
+ g_autofree VirtQueueElement *elem = NULL;
+ elem = g_steal_pointer(&svq->desc_state[i].elem);
+ if (elem) {
+ virtqueue_detach_element(svq->vq, elem, 0);
+ }
+ }
+
+ next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
+ if (next_avail_elem) {
+ virtqueue_detach_element(svq->vq, next_avail_elem, 0);
+ }
+ svq->vq = NULL;
+ g_free(svq->desc_next);
+ g_free(svq->desc_state);
+ qemu_vfree(svq->vring.desc);
+ qemu_vfree(svq->vring.used);
+}
+
+/**
+ * Creates vhost shadow virtqueue, and instructs the vhost device to use the
+ * shadow methods and file descriptors.
+ *
+ * @iova_tree: Tree to perform descriptors translations
+ * @ops: SVQ owner callbacks
+ * @ops_opaque: ops opaque pointer
+ *
+ * Returns the new virtqueue or NULL.
+ *
+ * In case of error, the reason is reported through error_report.
+ */
+VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree,
+ const VhostShadowVirtqueueOps *ops,
+ void *ops_opaque)
+{
+ g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
+ int r;
+
+ r = event_notifier_init(&svq->hdev_kick, 0);
+ if (r != 0) {
+ error_report("Couldn't create kick event notifier: %s (%d)",
+ g_strerror(errno), errno);
+ goto err_init_hdev_kick;
+ }
+
+ r = event_notifier_init(&svq->hdev_call, 0);
+ if (r != 0) {
+ error_report("Couldn't create call event notifier: %s (%d)",
+ g_strerror(errno), errno);
+ goto err_init_hdev_call;
+ }
+
+ event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
+ event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
+ svq->iova_tree = iova_tree;
+ svq->ops = ops;
+ svq->ops_opaque = ops_opaque;
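+ /* Transfer ownership to the caller; g_autofree no longer frees svq. */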
+ return g_steal_pointer(&svq);
+
+err_init_hdev_call:
+ event_notifier_cleanup(&svq->hdev_kick);
+
+err_init_hdev_kick:
+ return NULL;
+}
+
+/**
+ * Free the resources of the shadow virtqueue.
+ *
+ * @pvq: gpointer to SVQ so it can be used by autofree functions.
+ */
+void vhost_svq_free(gpointer pvq)
+{
+ VhostShadowVirtqueue *vq = pvq;
+ vhost_svq_stop(vq);
+ event_notifier_cleanup(&vq->hdev_kick);
+ event_notifier_set_handler(&vq->hdev_call, NULL);
+ event_notifier_cleanup(&vq->hdev_call);
+ g_free(vq);
+}
diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
new file mode 100644
index 00000000..d04c34a5
--- /dev/null
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -0,0 +1,139 @@
+/*
+ * vhost shadow virtqueue
+ *
+ * SPDX-FileCopyrightText: Red Hat, Inc. 2021
+ * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef VHOST_SHADOW_VIRTQUEUE_H
+#define VHOST_SHADOW_VIRTQUEUE_H
+
+#include "qemu/event_notifier.h"
+#include "hw/virtio/virtio.h"
+#include "standard-headers/linux/vhost_types.h"
+#include "hw/virtio/vhost-iova-tree.h"
+
+typedef struct SVQDescState {
+ VirtQueueElement *elem;
+
+ /*
+ * Number of descriptors exposed to the device. May or may not match
+ * the guest's.
+ */
+ unsigned int ndescs;
+} SVQDescState;
+
+typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
+
+/**
+ * Callback to handle an avail buffer.
+ *
+ * @svq: Shadow virtqueue
+ * @elem: Element placed in the queue by the guest
+ * @vq_callback_opaque: Opaque
+ *
+ * Returns 0 if the vq is running as expected.
+ *
+ * Note that ownership of elem is transferred to the callback.
+ */
+typedef int (*VirtQueueAvailCallback)(VhostShadowVirtqueue *svq,
+ VirtQueueElement *elem,
+ void *vq_callback_opaque);
+
+typedef struct VhostShadowVirtqueueOps {
+ VirtQueueAvailCallback avail_handler;
+} VhostShadowVirtqueueOps;
+
+/* Shadow virtqueue to relay notifications */
+typedef struct VhostShadowVirtqueue {
+ /* Shadow vring */
+ struct vring vring;
+
+ /* Shadow kick notifier, sent to vhost */
+ EventNotifier hdev_kick;
+ /* Shadow call notifier, sent to vhost */
+ EventNotifier hdev_call;
+
+ /*
+ * Borrowed virtqueue's guest-to-host notifier. Borrowing it into this
+ * event notifier makes it easy to recover the VhostShadowVirtqueue from
+ * the event loop; with the VirtQueue's own notifier there would be no
+ * easy way to retrieve the VhostShadowVirtqueue.
+ *
+ * The shadow virtqueue must therefore not clean it up, or the VirtQueue's
+ * notifier would be lost.
+ */
+ EventNotifier svq_kick;
+
+ /* Guest's call notifier, where the SVQ calls guest. */
+ EventNotifier svq_call;
+
+ /* Virtio queue shadowing */
+ VirtQueue *vq;
+
+ /* Virtio device */
+ VirtIODevice *vdev;
+
+ /* IOVA mapping */
+ VhostIOVATree *iova_tree;
+
+ /* SVQ vring descriptors state */
+ SVQDescState *desc_state;
+
+ /* Next VirtQueue element that guest made available */
+ VirtQueueElement *next_guest_avail_elem;
+
+ /*
+ * Backup of each descriptor's next field, so chains can be recovered
+ * without trusting what the device writes.
+ */
+ uint16_t *desc_next;
+
+ /* Caller callbacks */
+ const VhostShadowVirtqueueOps *ops;
+
+ /* Caller callbacks opaque */
+ void *ops_opaque;
+
+ /* Next head to expose to the device */
+ uint16_t shadow_avail_idx;
+
+ /* Next free descriptor */
+ uint16_t free_head;
+
+ /* Last seen used idx */
+ uint16_t shadow_used_idx;
+
+ /* Next head to consume from the device */
+ uint16_t last_used_idx;
+} VhostShadowVirtqueue;
+
+bool vhost_svq_valid_features(uint64_t features, Error **errp);
+
+void vhost_svq_push_elem(VhostShadowVirtqueue *svq,
+ const VirtQueueElement *elem, uint32_t len);
+int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
+ size_t out_num, const struct iovec *in_sg, size_t in_num,
+ VirtQueueElement *elem);
+size_t vhost_svq_poll(VhostShadowVirtqueue *svq);
+
+void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd);
+void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd);
+void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
+ struct vhost_vring_addr *addr);
+size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq);
+size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
+
+void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
+ VirtQueue *vq);
+void vhost_svq_stop(VhostShadowVirtqueue *svq);
+
+VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree,
+ const VhostShadowVirtqueueOps *ops,
+ void *ops_opaque);
+
+void vhost_svq_free(gpointer vq);
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free);
+
+#endif
diff --git a/hw/virtio/vhost-stub.c b/hw/virtio/vhost-stub.c
new file mode 100644
index 00000000..c175148f
--- /dev/null
+++ b/hw/virtio/vhost-stub.c
@@ -0,0 +1,17 @@
+#include "qemu/osdep.h"
+#include "hw/virtio/vhost.h"
+#include "hw/virtio/vhost-user.h"
+
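+/* Stubs linked in when vhost support is not compiled in. */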
+bool vhost_has_free_slot(void)
+{
+ return true;
+}
+
+bool vhost_user_init(VhostUserState *user, CharBackend *chr, Error **errp)
+{
+ return false;
+}
+
+void vhost_user_cleanup(VhostUserState *user)
+{
+}
diff --git a/hw/virtio/vhost-user-blk-pci.c b/hw/virtio/vhost-user-blk-pci.c
new file mode 100644
index 00000000..eef8641a
--- /dev/null
+++ b/hw/virtio/vhost-user-blk-pci.c
@@ -0,0 +1,109 @@
+/*
+ * Vhost user blk PCI Bindings
+ *
+ * Copyright(C) 2017 Intel Corporation.
+ *
+ * Authors:
+ * Changpeng Liu <changpeng.liu@intel.com>
+ *
+ * Largely based on the "vhost-user-scsi.c" and "vhost-scsi.c" implemented by:
+ * Felipe Franciosi <felipe@nutanix.com>
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ * Nicholas Bellinger <nab@risingtidesystems.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+
+#include "standard-headers/linux/virtio_pci.h"
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/vhost-user-blk.h"
+#include "hw/pci/pci.h"
+#include "hw/qdev-properties.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/module.h"
+#include "hw/virtio/virtio-pci.h"
+#include "qom/object.h"
+
+typedef struct VHostUserBlkPCI VHostUserBlkPCI;
+
+/*
+ * vhost-user-blk-pci: This extends VirtioPCIProxy.
+ */
+#define TYPE_VHOST_USER_BLK_PCI "vhost-user-blk-pci-base"
+DECLARE_INSTANCE_CHECKER(VHostUserBlkPCI, VHOST_USER_BLK_PCI,
+ TYPE_VHOST_USER_BLK_PCI)
+
+struct VHostUserBlkPCI {
+ VirtIOPCIProxy parent_obj;
+ VHostUserBlk vdev;
+};
+
+static Property vhost_user_blk_pci_properties[] = {
+ DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0),
+ DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors,
+ DEV_NVECTORS_UNSPECIFIED),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vhost_user_blk_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+ VHostUserBlkPCI *dev = VHOST_USER_BLK_PCI(vpci_dev);
+ DeviceState *vdev = DEVICE(&dev->vdev);
+
+ if (dev->vdev.num_queues == VHOST_USER_BLK_AUTO_NUM_QUEUES) {
+ dev->vdev.num_queues = virtio_pci_optimal_num_queues(0);
+ }
+
+ if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) {
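+ /* One vector per virtqueue, plus one for config changes. */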
+ vpci_dev->nvectors = dev->vdev.num_queues + 1;
+ }
+
+ qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
+}
+
+static void vhost_user_blk_pci_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+ PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+
+ set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+ device_class_set_props(dc, vhost_user_blk_pci_properties);
+ k->realize = vhost_user_blk_pci_realize;
+ pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+ pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_BLOCK;
+ pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
+ pcidev_k->class_id = PCI_CLASS_STORAGE_SCSI;
+}
+
+static void vhost_user_blk_pci_instance_init(Object *obj)
+{
+ VHostUserBlkPCI *dev = VHOST_USER_BLK_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VHOST_USER_BLK);
+ object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev),
+ "bootindex");
+}
+
+static const VirtioPCIDeviceTypeInfo vhost_user_blk_pci_info = {
+ .base_name = TYPE_VHOST_USER_BLK_PCI,
+ .generic_name = "vhost-user-blk-pci",
+ .transitional_name = "vhost-user-blk-pci-transitional",
+ .non_transitional_name = "vhost-user-blk-pci-non-transitional",
+ .instance_size = sizeof(VHostUserBlkPCI),
+ .instance_init = vhost_user_blk_pci_instance_init,
+ .class_init = vhost_user_blk_pci_class_init,
+};
+
+static void vhost_user_blk_pci_register(void)
+{
+ virtio_pci_types_register(&vhost_user_blk_pci_info);
+}
+
+type_init(vhost_user_blk_pci_register)
diff --git a/hw/virtio/vhost-user-fs-pci.c b/hw/virtio/vhost-user-fs-pci.c
new file mode 100644
index 00000000..6829b8b7
--- /dev/null
+++ b/hw/virtio/vhost-user-fs-pci.c
@@ -0,0 +1,88 @@
+/*
+ * Vhost-user filesystem virtio device PCI glue
+ *
+ * Copyright 2018-2019 Red Hat, Inc.
+ *
+ * Authors:
+ * Dr. David Alan Gilbert <dgilbert@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version. See the COPYING file in the
+ * top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/vhost-user-fs.h"
+#include "hw/virtio/virtio-pci.h"
+#include "qom/object.h"
+
+struct VHostUserFSPCI {
+ VirtIOPCIProxy parent_obj;
+ VHostUserFS vdev;
+};
+
+typedef struct VHostUserFSPCI VHostUserFSPCI;
+
+#define TYPE_VHOST_USER_FS_PCI "vhost-user-fs-pci-base"
+
+DECLARE_INSTANCE_CHECKER(VHostUserFSPCI, VHOST_USER_FS_PCI,
+ TYPE_VHOST_USER_FS_PCI)
+
+static Property vhost_user_fs_pci_properties[] = {
+ DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors,
+ DEV_NVECTORS_UNSPECIFIED),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vhost_user_fs_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+ VHostUserFSPCI *dev = VHOST_USER_FS_PCI(vpci_dev);
+ DeviceState *vdev = DEVICE(&dev->vdev);
+
+ if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) {
+ /* Also reserve config change and hiprio queue vectors */
+ vpci_dev->nvectors = dev->vdev.conf.num_request_queues + 2;
+ }
+
+ qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
+}
+
+static void vhost_user_fs_pci_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+ PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+ k->realize = vhost_user_fs_pci_realize;
+ set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+ device_class_set_props(dc, vhost_user_fs_pci_properties);
+ pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+ pcidev_k->device_id = 0; /* Set by virtio-pci based on virtio id */
+ pcidev_k->revision = 0x00;
+ pcidev_k->class_id = PCI_CLASS_STORAGE_OTHER;
+}
+
+static void vhost_user_fs_pci_instance_init(Object *obj)
+{
+ VHostUserFSPCI *dev = VHOST_USER_FS_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VHOST_USER_FS);
+ object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev),
+ "bootindex");
+}
+
+static const VirtioPCIDeviceTypeInfo vhost_user_fs_pci_info = {
+ .base_name = TYPE_VHOST_USER_FS_PCI,
+ .non_transitional_name = "vhost-user-fs-pci",
+ .instance_size = sizeof(VHostUserFSPCI),
+ .instance_init = vhost_user_fs_pci_instance_init,
+ .class_init = vhost_user_fs_pci_class_init,
+};
+
+static void vhost_user_fs_pci_register(void)
+{
+ virtio_pci_types_register(&vhost_user_fs_pci_info);
+}
+
+type_init(vhost_user_fs_pci_register);
diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c
new file mode 100644
index 00000000..d97b179e
--- /dev/null
+++ b/hw/virtio/vhost-user-fs.c
@@ -0,0 +1,336 @@
+/*
+ * Vhost-user filesystem virtio device
+ *
+ * Copyright 2018-2019 Red Hat, Inc.
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version. See the COPYING file in the
+ * top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include <sys/ioctl.h>
+#include "standard-headers/linux/virtio_fs.h"
+#include "qapi/error.h"
+#include "hw/qdev-properties.h"
+#include "hw/qdev-properties-system.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/virtio-access.h"
+#include "qemu/error-report.h"
+#include "hw/virtio/vhost.h"
+#include "hw/virtio/vhost-user-fs.h"
+#include "monitor/monitor.h"
+#include "sysemu/sysemu.h"
+
+static const int user_feature_bits[] = {
+ VIRTIO_F_VERSION_1,
+ VIRTIO_RING_F_INDIRECT_DESC,
+ VIRTIO_RING_F_EVENT_IDX,
+ VIRTIO_F_NOTIFY_ON_EMPTY,
+ VIRTIO_F_RING_PACKED,
+ VIRTIO_F_IOMMU_PLATFORM,
+ VIRTIO_F_RING_RESET,
+
+ VHOST_INVALID_FEATURE_BIT
+};
+
+static void vuf_get_config(VirtIODevice *vdev, uint8_t *config)
+{
+ VHostUserFS *fs = VHOST_USER_FS(vdev);
+ struct virtio_fs_config fscfg = {};
+
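+ /* The tag is not NUL-terminated when it fills the whole field. */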
+ memcpy((char *)fscfg.tag, fs->conf.tag,
+ MIN(strlen(fs->conf.tag) + 1, sizeof(fscfg.tag)));
+
+ virtio_stl_p(vdev, &fscfg.num_request_queues, fs->conf.num_request_queues);
+
+ memcpy(config, &fscfg, sizeof(fscfg));
+}
+
+static void vuf_start(VirtIODevice *vdev)
+{
+ VHostUserFS *fs = VHOST_USER_FS(vdev);
+ BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+ int ret;
+ int i;
+
+ if (!k->set_guest_notifiers) {
+ error_report("binding does not support guest notifiers");
+ return;
+ }
+
+ ret = vhost_dev_enable_notifiers(&fs->vhost_dev, vdev);
+ if (ret < 0) {
+ error_report("Error enabling host notifiers: %d", -ret);
+ return;
+ }
+
+ ret = k->set_guest_notifiers(qbus->parent, fs->vhost_dev.nvqs, true);
+ if (ret < 0) {
+ error_report("Error binding guest notifier: %d", -ret);
+ goto err_host_notifiers;
+ }
+
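+ /* Hand the guest-negotiated features to the vhost-user backend. */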
+ fs->vhost_dev.acked_features = vdev->guest_features;
+ ret = vhost_dev_start(&fs->vhost_dev, vdev, true);
+ if (ret < 0) {
+ error_report("Error starting vhost: %d", -ret);
+ goto err_guest_notifiers;
+ }
+
+ /*
+ * guest_notifier_mask/pending not used yet, so just unmask
+ * everything here. virtio-pci will do the right thing by
+ * enabling/disabling irqfd.
+ */
+ for (i = 0; i < fs->vhost_dev.nvqs; i++) {
+ vhost_virtqueue_mask(&fs->vhost_dev, vdev, i, false);
+ }
+
+ return;
+
+err_guest_notifiers:
+ k->set_guest_notifiers(qbus->parent, fs->vhost_dev.nvqs, false);
+err_host_notifiers:
+ vhost_dev_disable_notifiers(&fs->vhost_dev, vdev);
+}
+
+static void vuf_stop(VirtIODevice *vdev)
+{
+ VHostUserFS *fs = VHOST_USER_FS(vdev);
+ BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+ int ret;
+
+ if (!k->set_guest_notifiers) {
+ return;
+ }
+
+ vhost_dev_stop(&fs->vhost_dev, vdev, true);
+
+ ret = k->set_guest_notifiers(qbus->parent, fs->vhost_dev.nvqs, false);
+ if (ret < 0) {
+ error_report("vhost guest notifier cleanup failed: %d", ret);
+ return;
+ }
+
+ vhost_dev_disable_notifiers(&fs->vhost_dev, vdev);
+}
+
+static void vuf_set_status(VirtIODevice *vdev, uint8_t status)
+{
+ VHostUserFS *fs = VHOST_USER_FS(vdev);
+ bool should_start = virtio_device_should_start(vdev, status);
+
+ if (vhost_dev_is_started(&fs->vhost_dev) == should_start) {
+ return;
+ }
+
+ if (should_start) {
+ vuf_start(vdev);
+ } else {
+ vuf_stop(vdev);
+ }
+}
+
+static uint64_t vuf_get_features(VirtIODevice *vdev,
+ uint64_t features,
+ Error **errp)
+{
+ VHostUserFS *fs = VHOST_USER_FS(vdev);
+
+ return vhost_get_features(&fs->vhost_dev, user_feature_bits, features);
+}
+
+static void vuf_handle_output(VirtIODevice *vdev, VirtQueue *vq)
+{
+ /*
+ * Not normally called; the vhost-user daemon handles the queue. However,
+ * virtio's cleanup path can call this.
+ */
+}
+
+static void vuf_guest_notifier_mask(VirtIODevice *vdev, int idx,
+ bool mask)
+{
+ VHostUserFS *fs = VHOST_USER_FS(vdev);
+
+ vhost_virtqueue_mask(&fs->vhost_dev, vdev, idx, mask);
+}
+
+static bool vuf_guest_notifier_pending(VirtIODevice *vdev, int idx)
+{
+ VHostUserFS *fs = VHOST_USER_FS(vdev);
+
+ return vhost_virtqueue_pending(&fs->vhost_dev, idx);
+}
+
+static void vuf_device_realize(DeviceState *dev, Error **errp)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VHostUserFS *fs = VHOST_USER_FS(dev);
+ unsigned int i;
+ size_t len;
+ int ret;
+
+ if (!fs->conf.chardev.chr) {
+ error_setg(errp, "missing chardev");
+ return;
+ }
+
+ if (!fs->conf.tag) {
+ error_setg(errp, "missing tag property");
+ return;
+ }
+ len = strlen(fs->conf.tag);
+ if (len == 0) {
+ error_setg(errp, "tag property cannot be empty");
+ return;
+ }
+ if (len > sizeof_field(struct virtio_fs_config, tag)) {
+ error_setg(errp, "tag property must be %zu bytes or less",
+ sizeof_field(struct virtio_fs_config, tag));
+ return;
+ }
+
+ if (fs->conf.num_request_queues == 0) {
+ error_setg(errp, "num-request-queues property must be larger than 0");
+ return;
+ }
+
+ if (!is_power_of_2(fs->conf.queue_size)) {
+ error_setg(errp, "queue-size property must be a power of 2");
+ return;
+ }
+
+ if (fs->conf.queue_size > VIRTQUEUE_MAX_SIZE) {
+ error_setg(errp, "queue-size property must be %u or smaller",
+ VIRTQUEUE_MAX_SIZE);
+ return;
+ }
+
+ if (!vhost_user_init(&fs->vhost_user, &fs->conf.chardev, errp)) {
+ return;
+ }
+
+ virtio_init(vdev, VIRTIO_ID_FS, sizeof(struct virtio_fs_config));
+
+ /* Hiprio queue */
+ fs->hiprio_vq = virtio_add_queue(vdev, fs->conf.queue_size, vuf_handle_output);
+
+ /* Request queues */
+ fs->req_vqs = g_new(VirtQueue *, fs->conf.num_request_queues);
+ for (i = 0; i < fs->conf.num_request_queues; i++) {
+ fs->req_vqs[i] = virtio_add_queue(vdev, fs->conf.queue_size, vuf_handle_output);
+ }
+
+ /* 1 high prio queue, plus the number configured */
+ fs->vhost_dev.nvqs = 1 + fs->conf.num_request_queues;
+ fs->vhost_dev.vqs = g_new0(struct vhost_virtqueue, fs->vhost_dev.nvqs);
+ ret = vhost_dev_init(&fs->vhost_dev, &fs->vhost_user,
+ VHOST_BACKEND_TYPE_USER, 0, errp);
+ if (ret < 0) {
+ goto err_virtio;
+ }
+
+ return;
+
+err_virtio:
+ vhost_user_cleanup(&fs->vhost_user);
+ virtio_delete_queue(fs->hiprio_vq);
+ for (i = 0; i < fs->conf.num_request_queues; i++) {
+ virtio_delete_queue(fs->req_vqs[i]);
+ }
+ g_free(fs->req_vqs);
+ virtio_cleanup(vdev);
+ g_free(fs->vhost_dev.vqs);
+ return;
+}
+
+static void vuf_device_unrealize(DeviceState *dev)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VHostUserFS *fs = VHOST_USER_FS(dev);
+ int i;
+
+ /* This will stop vhost backend if appropriate. */
+ vuf_set_status(vdev, 0);
+
+ vhost_dev_cleanup(&fs->vhost_dev);
+
+ vhost_user_cleanup(&fs->vhost_user);
+
+ virtio_delete_queue(fs->hiprio_vq);
+ for (i = 0; i < fs->conf.num_request_queues; i++) {
+ virtio_delete_queue(fs->req_vqs[i]);
+ }
+ g_free(fs->req_vqs);
+ virtio_cleanup(vdev);
+ g_free(fs->vhost_dev.vqs);
+ fs->vhost_dev.vqs = NULL;
+}
+
+static struct vhost_dev *vuf_get_vhost(VirtIODevice *vdev)
+{
+ VHostUserFS *fs = VHOST_USER_FS(vdev);
+ return &fs->vhost_dev;
+}
+
+static const VMStateDescription vuf_vmstate = {
+ .name = "vhost-user-fs",
+ .unmigratable = 1,
+};
+
+static Property vuf_properties[] = {
+ DEFINE_PROP_CHR("chardev", VHostUserFS, conf.chardev),
+ DEFINE_PROP_STRING("tag", VHostUserFS, conf.tag),
+ DEFINE_PROP_UINT16("num-request-queues", VHostUserFS,
+ conf.num_request_queues, 1),
+ DEFINE_PROP_UINT16("queue-size", VHostUserFS, conf.queue_size, 128),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vuf_instance_init(Object *obj)
+{
+ VHostUserFS *fs = VHOST_USER_FS(obj);
+
+ device_add_bootindex_property(obj, &fs->bootindex, "bootindex",
+ "/filesystem@0", DEVICE(obj));
+}
+
+static void vuf_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+
+ device_class_set_props(dc, vuf_properties);
+ dc->vmsd = &vuf_vmstate;
+ set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+ vdc->realize = vuf_device_realize;
+ vdc->unrealize = vuf_device_unrealize;
+ vdc->get_features = vuf_get_features;
+ vdc->get_config = vuf_get_config;
+ vdc->set_status = vuf_set_status;
+ vdc->guest_notifier_mask = vuf_guest_notifier_mask;
+ vdc->guest_notifier_pending = vuf_guest_notifier_pending;
+ vdc->get_vhost = vuf_get_vhost;
+}
+
+static const TypeInfo vuf_info = {
+ .name = TYPE_VHOST_USER_FS,
+ .parent = TYPE_VIRTIO_DEVICE,
+ .instance_size = sizeof(VHostUserFS),
+ .instance_init = vuf_instance_init,
+ .class_init = vuf_class_init,
+};
+
+static void vuf_register_types(void)
+{
+ type_register_static(&vuf_info);
+}
+
+type_init(vuf_register_types)
diff --git a/hw/virtio/vhost-user-gpio-pci.c b/hw/virtio/vhost-user-gpio-pci.c
new file mode 100644
index 00000000..b3028a24
--- /dev/null
+++ b/hw/virtio/vhost-user-gpio-pci.c
@@ -0,0 +1,69 @@
+/*
+ * Vhost-user gpio virtio device PCI glue
+ *
+ * Copyright (c) 2022 Viresh Kumar <viresh.kumar@linaro.org>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/vhost-user-gpio.h"
+#include "hw/virtio/virtio-pci.h"
+
+struct VHostUserGPIOPCI {
+ VirtIOPCIProxy parent_obj;
+ VHostUserGPIO vdev;
+};
+
+typedef struct VHostUserGPIOPCI VHostUserGPIOPCI;
+
+#define TYPE_VHOST_USER_GPIO_PCI "vhost-user-gpio-pci-base"
+
+DECLARE_INSTANCE_CHECKER(VHostUserGPIOPCI, VHOST_USER_GPIO_PCI,
+ TYPE_VHOST_USER_GPIO_PCI)
+
+static void vhost_user_gpio_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+ VHostUserGPIOPCI *dev = VHOST_USER_GPIO_PCI(vpci_dev);
+ DeviceState *vdev = DEVICE(&dev->vdev);
+
+ vpci_dev->nvectors = 1;
+ qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
+}
+
+static void vhost_user_gpio_pci_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+ PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+ k->realize = vhost_user_gpio_pci_realize;
+ set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
+ pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+ pcidev_k->device_id = 0; /* Set by virtio-pci based on virtio id */
+ pcidev_k->revision = 0x00;
+ pcidev_k->class_id = PCI_CLASS_COMMUNICATION_OTHER;
+}
+
+static void vhost_user_gpio_pci_instance_init(Object *obj)
+{
+ VHostUserGPIOPCI *dev = VHOST_USER_GPIO_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VHOST_USER_GPIO);
+}
+
+static const VirtioPCIDeviceTypeInfo vhost_user_gpio_pci_info = {
+ .base_name = TYPE_VHOST_USER_GPIO_PCI,
+ .non_transitional_name = "vhost-user-gpio-pci",
+ .instance_size = sizeof(VHostUserGPIOPCI),
+ .instance_init = vhost_user_gpio_pci_instance_init,
+ .class_init = vhost_user_gpio_pci_class_init,
+};
+
+static void vhost_user_gpio_pci_register(void)
+{
+ virtio_pci_types_register(&vhost_user_gpio_pci_info);
+}
+
+type_init(vhost_user_gpio_pci_register);
diff --git a/hw/virtio/vhost-user-gpio.c b/hw/virtio/vhost-user-gpio.c
new file mode 100644
index 00000000..b7b82a10
--- /dev/null
+++ b/hw/virtio/vhost-user-gpio.c
@@ -0,0 +1,418 @@
+/*
+ * Vhost-user GPIO virtio device
+ *
+ * Copyright (c) 2022 Viresh Kumar <viresh.kumar@linaro.org>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/vhost-user-gpio.h"
+#include "qemu/error-report.h"
+#include "standard-headers/linux/virtio_ids.h"
+#include "trace.h"
+
+#define REALIZE_CONNECTION_RETRIES 3
+
+/* Features required from VirtIO */
+static const int feature_bits[] = {
+ VIRTIO_F_VERSION_1,
+ VIRTIO_F_NOTIFY_ON_EMPTY,
+ VIRTIO_RING_F_INDIRECT_DESC,
+ VIRTIO_RING_F_EVENT_IDX,
+ VIRTIO_GPIO_F_IRQ,
+ VIRTIO_F_RING_RESET,
+ VHOST_INVALID_FEATURE_BIT
+};
+
+static void vu_gpio_get_config(VirtIODevice *vdev, uint8_t *config)
+{
+ VHostUserGPIO *gpio = VHOST_USER_GPIO(vdev);
+
+ memcpy(config, &gpio->config, sizeof(gpio->config));
+}
+
+static int vu_gpio_config_notifier(struct vhost_dev *dev)
+{
+ VHostUserGPIO *gpio = VHOST_USER_GPIO(dev->vdev);
+
+ memcpy(dev->vdev->config, &gpio->config, sizeof(gpio->config));
+ virtio_notify_config(dev->vdev);
+
+ return 0;
+}
+
+const VhostDevConfigOps gpio_ops = {
+ .vhost_dev_config_notifier = vu_gpio_config_notifier,
+};
+
+static int vu_gpio_start(VirtIODevice *vdev)
+{
+ BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+ VHostUserGPIO *gpio = VHOST_USER_GPIO(vdev);
+ struct vhost_dev *vhost_dev = &gpio->vhost_dev;
+ int ret, i;
+
+ if (!k->set_guest_notifiers) {
+ error_report("binding does not support guest notifiers");
+ return -ENOSYS;
+ }
+
+ ret = vhost_dev_enable_notifiers(vhost_dev, vdev);
+ if (ret < 0) {
+ error_report("Error enabling host notifiers: %d", ret);
+ return ret;
+ }
+
+ ret = k->set_guest_notifiers(qbus->parent, vhost_dev->nvqs, true);
+ if (ret < 0) {
+ error_report("Error binding guest notifier: %d", ret);
+ goto err_host_notifiers;
+ }
+
+ /*
+ * Before we start up we need to ensure we have the final feature
+ * set needed for the vhost configuration. The backend may also
+ * apply backend_features when the feature set is sent.
+ */
+ vhost_ack_features(&gpio->vhost_dev, feature_bits, vdev->guest_features);
+
+ ret = vhost_dev_start(&gpio->vhost_dev, vdev, false);
+ if (ret < 0) {
+ error_report("Error starting vhost-user-gpio: %d", ret);
+ goto err_guest_notifiers;
+ }
+ gpio->started_vu = true;
+
+ /*
+ * guest_notifier_mask/pending not used yet, so just unmask
+ * everything here. virtio-pci will do the right thing by
+ * enabling/disabling irqfd.
+ */
+ for (i = 0; i < gpio->vhost_dev.nvqs; i++) {
+ vhost_virtqueue_mask(&gpio->vhost_dev, vdev, i, false);
+ }
+
+ /*
+ * As we must have VHOST_USER_F_PROTOCOL_FEATURES (because
+ * VHOST_USER_GET_CONFIG requires it), we need to explicitly enable
+ * the vrings.
+ */
+ g_assert(vhost_dev->vhost_ops &&
+ vhost_dev->vhost_ops->vhost_set_vring_enable);
+ ret = vhost_dev->vhost_ops->vhost_set_vring_enable(vhost_dev, true);
+ if (ret == 0) {
+ return 0;
+ }
+
+ error_report("Failed to start vrings for vhost-user-gpio: %d", ret);
+
+err_guest_notifiers:
+ k->set_guest_notifiers(qbus->parent, gpio->vhost_dev.nvqs, false);
+err_host_notifiers:
+ vhost_dev_disable_notifiers(&gpio->vhost_dev, vdev);
+
+ return ret;
+}
+
+static void vu_gpio_stop(VirtIODevice *vdev)
+{
+ VHostUserGPIO *gpio = VHOST_USER_GPIO(vdev);
+ BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+ struct vhost_dev *vhost_dev = &gpio->vhost_dev;
+ int ret;
+
+ if (!gpio->started_vu) {
+ return;
+ }
+ gpio->started_vu = false;
+
+ if (!k->set_guest_notifiers) {
+ return;
+ }
+
+ vhost_dev_stop(vhost_dev, vdev, false);
+
+ ret = k->set_guest_notifiers(qbus->parent, vhost_dev->nvqs, false);
+ if (ret < 0) {
+ error_report("vhost guest notifier cleanup failed: %d", ret);
+ return;
+ }
+
+ vhost_dev_disable_notifiers(vhost_dev, vdev);
+}
+
+static void vu_gpio_set_status(VirtIODevice *vdev, uint8_t status)
+{
+ VHostUserGPIO *gpio = VHOST_USER_GPIO(vdev);
+ bool should_start = virtio_device_should_start(vdev, status);
+
+ trace_virtio_gpio_set_status(status);
+
+ if (!gpio->connected) {
+ return;
+ }
+
+ if (vhost_dev_is_started(&gpio->vhost_dev) == should_start) {
+ return;
+ }
+
+ if (should_start) {
+ if (vu_gpio_start(vdev)) {
+ qemu_chr_fe_disconnect(&gpio->chardev);
+ }
+ } else {
+ vu_gpio_stop(vdev);
+ }
+}
+
+static uint64_t vu_gpio_get_features(VirtIODevice *vdev, uint64_t features,
+ Error **errp)
+{
+ VHostUserGPIO *gpio = VHOST_USER_GPIO(vdev);
+
+ return vhost_get_features(&gpio->vhost_dev, feature_bits, features);
+}
+
+static void vu_gpio_handle_output(VirtIODevice *vdev, VirtQueue *vq)
+{
+ /*
+ * Not normally called; the vhost-user daemon handles the queue. However,
+ * virtio's cleanup path can call this.
+ */
+}
+
+static void vu_gpio_guest_notifier_mask(VirtIODevice *vdev, int idx, bool mask)
+{
+ VHostUserGPIO *gpio = VHOST_USER_GPIO(vdev);
+
+ vhost_virtqueue_mask(&gpio->vhost_dev, vdev, idx, mask);
+}
+
+static void do_vhost_user_cleanup(VirtIODevice *vdev, VHostUserGPIO *gpio)
+{
+ virtio_delete_queue(gpio->command_vq);
+ virtio_delete_queue(gpio->interrupt_vq);
+ g_free(gpio->vhost_dev.vqs);
+ gpio->vhost_dev.vqs = NULL;
+ virtio_cleanup(vdev);
+ vhost_user_cleanup(&gpio->vhost_user);
+}
+
+static int vu_gpio_connect(DeviceState *dev, Error **errp)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VHostUserGPIO *gpio = VHOST_USER_GPIO(vdev);
+ struct vhost_dev *vhost_dev = &gpio->vhost_dev;
+ int ret;
+
+ if (gpio->connected) {
+ return 0;
+ }
+ gpio->connected = true;
+
+ vhost_dev_set_config_notifier(vhost_dev, &gpio_ops);
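+ /* The backend provides the config space via VHOST_USER_GET_CONFIG. */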
+ gpio->vhost_user.supports_config = true;
+
+ ret = vhost_dev_init(vhost_dev, &gpio->vhost_user,
+ VHOST_BACKEND_TYPE_USER, 0, errp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* restore vhost state */
+ if (virtio_device_started(vdev, vdev->status)) {
+ vu_gpio_start(vdev);
+ }
+
+ return 0;
+}
+
+static void vu_gpio_event(void *opaque, QEMUChrEvent event);
+
+static void vu_gpio_disconnect(DeviceState *dev)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VHostUserGPIO *gpio = VHOST_USER_GPIO(vdev);
+
+ if (!gpio->connected) {
+ return;
+ }
+ gpio->connected = false;
+
+ vu_gpio_stop(vdev);
+ vhost_dev_cleanup(&gpio->vhost_dev);
+
+ /* Re-instate the event handler for new connections */
+ qemu_chr_fe_set_handlers(&gpio->chardev,
+ NULL, NULL, vu_gpio_event,
+ NULL, dev, NULL, true);
+}
+
+static void vu_gpio_event(void *opaque, QEMUChrEvent event)
+{
+ DeviceState *dev = opaque;
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VHostUserGPIO *gpio = VHOST_USER_GPIO(vdev);
+ Error *local_err = NULL;
+
+ switch (event) {
+ case CHR_EVENT_OPENED:
+ if (vu_gpio_connect(dev, &local_err) < 0) {
+ qemu_chr_fe_disconnect(&gpio->chardev);
+ return;
+ }
+ break;
+ case CHR_EVENT_CLOSED:
+ /* defer close until later to avoid circular close */
+ vhost_user_async_close(dev, &gpio->chardev, &gpio->vhost_dev,
+ vu_gpio_disconnect);
+ break;
+ case CHR_EVENT_BREAK:
+ case CHR_EVENT_MUX_IN:
+ case CHR_EVENT_MUX_OUT:
+ /* Ignore */
+ break;
+ }
+}
+
+static int vu_gpio_realize_connect(VHostUserGPIO *gpio, Error **errp)
+{
+ VirtIODevice *vdev = &gpio->parent_obj;
+ DeviceState *dev = &vdev->parent_obj;
+ struct vhost_dev *vhost_dev = &gpio->vhost_dev;
+ int ret;
+
+ ret = qemu_chr_fe_wait_connected(&gpio->chardev, errp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /*
+ * vu_gpio_connect() may have already connected (via the event
+ * callback) in which case it will just report success.
+ */
+ ret = vu_gpio_connect(dev, errp);
+ if (ret < 0) {
+ qemu_chr_fe_disconnect(&gpio->chardev);
+ return ret;
+ }
+ g_assert(gpio->connected);
+
+ ret = vhost_dev_get_config(vhost_dev, (uint8_t *)&gpio->config,
+ sizeof(gpio->config), errp);
+
+ if (ret < 0) {
+ error_report("vhost-user-gpio: get config failed");
+
+ qemu_chr_fe_disconnect(&gpio->chardev);
+ vhost_dev_cleanup(vhost_dev);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void vu_gpio_device_realize(DeviceState *dev, Error **errp)
+{
+ ERRP_GUARD();
+
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VHostUserGPIO *gpio = VHOST_USER_GPIO(dev);
+ int retries, ret;
+
+ if (!gpio->chardev.chr) {
+ error_setg(errp, "vhost-user-gpio: chardev is mandatory");
+ return;
+ }
+
+ if (!vhost_user_init(&gpio->vhost_user, &gpio->chardev, errp)) {
+ return;
+ }
+
+ virtio_init(vdev, VIRTIO_ID_GPIO, sizeof(gpio->config));
+
+ gpio->vhost_dev.nvqs = 2;
+ gpio->command_vq = virtio_add_queue(vdev, 256, vu_gpio_handle_output);
+ gpio->interrupt_vq = virtio_add_queue(vdev, 256, vu_gpio_handle_output);
+ gpio->vhost_dev.vqs = g_new0(struct vhost_virtqueue, gpio->vhost_dev.nvqs);
+
+ gpio->connected = false;
+
+ qemu_chr_fe_set_handlers(&gpio->chardev, NULL, NULL, vu_gpio_event, NULL,
+ dev, NULL, true);
+
+ retries = REALIZE_CONNECTION_RETRIES;
+ g_assert(!*errp);
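+ /*
+ * Retry the initial connection a few times; each failure is reported
+ * and *errp cleared before the next attempt.
+ */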
+ do {
+ if (*errp) {
+ error_prepend(errp, "Reconnecting after error: ");
+ error_report_err(*errp);
+ *errp = NULL;
+ }
+ ret = vu_gpio_realize_connect(gpio, errp);
+ } while (ret < 0 && retries--);
+
+ if (ret < 0) {
+ do_vhost_user_cleanup(vdev, gpio);
+ }
+}
+
+static void vu_gpio_device_unrealize(DeviceState *dev)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VHostUserGPIO *gpio = VHOST_USER_GPIO(dev);
+
+ vu_gpio_set_status(vdev, 0);
+ qemu_chr_fe_set_handlers(&gpio->chardev, NULL, NULL, NULL, NULL, NULL, NULL,
+ false);
+ vhost_dev_cleanup(&gpio->vhost_dev);
+ do_vhost_user_cleanup(vdev, gpio);
+}
+
+static const VMStateDescription vu_gpio_vmstate = {
+ .name = "vhost-user-gpio",
+ .unmigratable = 1,
+};
+
+static Property vu_gpio_properties[] = {
+ DEFINE_PROP_CHR("chardev", VHostUserGPIO, chardev),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vu_gpio_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+
+ device_class_set_props(dc, vu_gpio_properties);
+ dc->vmsd = &vu_gpio_vmstate;
+ set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
+ vdc->realize = vu_gpio_device_realize;
+ vdc->unrealize = vu_gpio_device_unrealize;
+ vdc->get_features = vu_gpio_get_features;
+ vdc->get_config = vu_gpio_get_config;
+ vdc->set_status = vu_gpio_set_status;
+ vdc->guest_notifier_mask = vu_gpio_guest_notifier_mask;
+}
+
+static const TypeInfo vu_gpio_info = {
+ .name = TYPE_VHOST_USER_GPIO,
+ .parent = TYPE_VIRTIO_DEVICE,
+ .instance_size = sizeof(VHostUserGPIO),
+ .class_init = vu_gpio_class_init,
+};
+
+static void vu_gpio_register_types(void)
+{
+ type_register_static(&vu_gpio_info);
+}
+
+type_init(vu_gpio_register_types)
diff --git a/hw/virtio/vhost-user-i2c-pci.c b/hw/virtio/vhost-user-i2c-pci.c
new file mode 100644
index 00000000..00ac1094
--- /dev/null
+++ b/hw/virtio/vhost-user-i2c-pci.c
@@ -0,0 +1,69 @@
+/*
+ * Vhost-user i2c virtio device PCI glue
+ *
+ * Copyright (c) 2021 Viresh Kumar <viresh.kumar@linaro.org>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/vhost-user-i2c.h"
+#include "hw/virtio/virtio-pci.h"
+
+struct VHostUserI2CPCI {
+ VirtIOPCIProxy parent_obj;
+ VHostUserI2C vdev;
+};
+
+typedef struct VHostUserI2CPCI VHostUserI2CPCI;
+
+#define TYPE_VHOST_USER_I2C_PCI "vhost-user-i2c-pci-base"
+
+DECLARE_INSTANCE_CHECKER(VHostUserI2CPCI, VHOST_USER_I2C_PCI,
+ TYPE_VHOST_USER_I2C_PCI)
+
+static void vhost_user_i2c_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+ VHostUserI2CPCI *dev = VHOST_USER_I2C_PCI(vpci_dev);
+ DeviceState *vdev = DEVICE(&dev->vdev);
+
+ vpci_dev->nvectors = 1;
+ qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
+}
+
+static void vhost_user_i2c_pci_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+ PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+ k->realize = vhost_user_i2c_pci_realize;
+ set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
+ pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+ pcidev_k->device_id = 0; /* Set by virtio-pci based on virtio id */
+ pcidev_k->revision = 0x00;
+ pcidev_k->class_id = PCI_CLASS_COMMUNICATION_OTHER;
+}
+
+static void vhost_user_i2c_pci_instance_init(Object *obj)
+{
+ VHostUserI2CPCI *dev = VHOST_USER_I2C_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VHOST_USER_I2C);
+}
+
+static const VirtioPCIDeviceTypeInfo vhost_user_i2c_pci_info = {
+ .base_name = TYPE_VHOST_USER_I2C_PCI,
+ .non_transitional_name = "vhost-user-i2c-pci",
+ .instance_size = sizeof(VHostUserI2CPCI),
+ .instance_init = vhost_user_i2c_pci_instance_init,
+ .class_init = vhost_user_i2c_pci_class_init,
+};
+
+static void vhost_user_i2c_pci_register(void)
+{
+ virtio_pci_types_register(&vhost_user_i2c_pci_info);
+}
+
+type_init(vhost_user_i2c_pci_register);
diff --git a/hw/virtio/vhost-user-i2c.c b/hw/virtio/vhost-user-i2c.c
new file mode 100644
index 00000000..dc5c828b
--- /dev/null
+++ b/hw/virtio/vhost-user-i2c.c
@@ -0,0 +1,287 @@
+/*
+ * Vhost-user i2c virtio device
+ *
+ * Copyright (c) 2021 Viresh Kumar <viresh.kumar@linaro.org>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/vhost-user-i2c.h"
+#include "qemu/error-report.h"
+#include "standard-headers/linux/virtio_ids.h"
+
+static const int feature_bits[] = {
+ VIRTIO_I2C_F_ZERO_LENGTH_REQUEST,
+ VIRTIO_F_RING_RESET,
+ VHOST_INVALID_FEATURE_BIT
+};
+
+static void vu_i2c_start(VirtIODevice *vdev)
+{
+ BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+ VHostUserI2C *i2c = VHOST_USER_I2C(vdev);
+ int ret, i;
+
+ if (!k->set_guest_notifiers) {
+ error_report("binding does not support guest notifiers");
+ return;
+ }
+
+ ret = vhost_dev_enable_notifiers(&i2c->vhost_dev, vdev);
+ if (ret < 0) {
+ error_report("Error enabling host notifiers: %d", -ret);
+ return;
+ }
+
+ ret = k->set_guest_notifiers(qbus->parent, i2c->vhost_dev.nvqs, true);
+ if (ret < 0) {
+ error_report("Error binding guest notifier: %d", -ret);
+ goto err_host_notifiers;
+ }
+
+ i2c->vhost_dev.acked_features = vdev->guest_features;
+
+ ret = vhost_dev_start(&i2c->vhost_dev, vdev, true);
+ if (ret < 0) {
+ error_report("Error starting vhost-user-i2c: %d", -ret);
+ goto err_guest_notifiers;
+ }
+
+ /*
+ * guest_notifier_mask/pending not used yet, so just unmask
+ * everything here. virtio-pci will do the right thing by
+ * enabling/disabling irqfd.
+ */
+ for (i = 0; i < i2c->vhost_dev.nvqs; i++) {
+ vhost_virtqueue_mask(&i2c->vhost_dev, vdev, i, false);
+ }
+
+ return;
+
+err_guest_notifiers:
+ k->set_guest_notifiers(qbus->parent, i2c->vhost_dev.nvqs, false);
+err_host_notifiers:
+ vhost_dev_disable_notifiers(&i2c->vhost_dev, vdev);
+}
+
+static void vu_i2c_stop(VirtIODevice *vdev)
+{
+ VHostUserI2C *i2c = VHOST_USER_I2C(vdev);
+ BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+ int ret;
+
+ if (!k->set_guest_notifiers) {
+ return;
+ }
+
+ vhost_dev_stop(&i2c->vhost_dev, vdev, true);
+
+ ret = k->set_guest_notifiers(qbus->parent, i2c->vhost_dev.nvqs, false);
+ if (ret < 0) {
+ error_report("vhost guest notifier cleanup failed: %d", ret);
+ return;
+ }
+
+ vhost_dev_disable_notifiers(&i2c->vhost_dev, vdev);
+}
+
+static void vu_i2c_set_status(VirtIODevice *vdev, uint8_t status)
+{
+ VHostUserI2C *i2c = VHOST_USER_I2C(vdev);
+ bool should_start = virtio_device_should_start(vdev, status);
+
+ if (vhost_dev_is_started(&i2c->vhost_dev) == should_start) {
+ return;
+ }
+
+ if (should_start) {
+ vu_i2c_start(vdev);
+ } else {
+ vu_i2c_stop(vdev);
+ }
+}
+
+static uint64_t vu_i2c_get_features(VirtIODevice *vdev,
+ uint64_t requested_features, Error **errp)
+{
+ VHostUserI2C *i2c = VHOST_USER_I2C(vdev);
+
+ virtio_add_feature(&requested_features, VIRTIO_I2C_F_ZERO_LENGTH_REQUEST);
+ return vhost_get_features(&i2c->vhost_dev, feature_bits, requested_features);
+}
+
+static void vu_i2c_handle_output(VirtIODevice *vdev, VirtQueue *vq)
+{
+ /*
+ * Not normally called; the vhost-user daemon handles the queue. However,
+ * virtio's cleanup path can call this.
+ */
+}
+
+static void vu_i2c_guest_notifier_mask(VirtIODevice *vdev, int idx, bool mask)
+{
+ VHostUserI2C *i2c = VHOST_USER_I2C(vdev);
+
+ vhost_virtqueue_mask(&i2c->vhost_dev, vdev, idx, mask);
+}
+
+static bool vu_i2c_guest_notifier_pending(VirtIODevice *vdev, int idx)
+{
+ VHostUserI2C *i2c = VHOST_USER_I2C(vdev);
+
+ return vhost_virtqueue_pending(&i2c->vhost_dev, idx);
+}
+
+static void do_vhost_user_cleanup(VirtIODevice *vdev, VHostUserI2C *i2c)
+{
+ vhost_user_cleanup(&i2c->vhost_user);
+ virtio_delete_queue(i2c->vq);
+ virtio_cleanup(vdev);
+ g_free(i2c->vhost_dev.vqs);
+ i2c->vhost_dev.vqs = NULL;
+}
+
+static int vu_i2c_connect(DeviceState *dev)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VHostUserI2C *i2c = VHOST_USER_I2C(vdev);
+
+ if (i2c->connected) {
+ return 0;
+ }
+ i2c->connected = true;
+
+ /* restore vhost state */
+ if (virtio_device_started(vdev, vdev->status)) {
+ vu_i2c_start(vdev);
+ }
+
+ return 0;
+}
+
+static void vu_i2c_disconnect(DeviceState *dev)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VHostUserI2C *i2c = VHOST_USER_I2C(vdev);
+
+ if (!i2c->connected) {
+ return;
+ }
+ i2c->connected = false;
+
+ if (vhost_dev_is_started(&i2c->vhost_dev)) {
+ vu_i2c_stop(vdev);
+ }
+}
+
+static void vu_i2c_event(void *opaque, QEMUChrEvent event)
+{
+ DeviceState *dev = opaque;
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VHostUserI2C *i2c = VHOST_USER_I2C(vdev);
+
+ switch (event) {
+ case CHR_EVENT_OPENED:
+ if (vu_i2c_connect(dev) < 0) {
+ qemu_chr_fe_disconnect(&i2c->chardev);
+ return;
+ }
+ break;
+ case CHR_EVENT_CLOSED:
+ vu_i2c_disconnect(dev);
+ break;
+ case CHR_EVENT_BREAK:
+ case CHR_EVENT_MUX_IN:
+ case CHR_EVENT_MUX_OUT:
+ /* Ignore */
+ break;
+ }
+}
+
+static void vu_i2c_device_realize(DeviceState *dev, Error **errp)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VHostUserI2C *i2c = VHOST_USER_I2C(dev);
+ int ret;
+
+ if (!i2c->chardev.chr) {
+ error_setg(errp, "vhost-user-i2c: missing chardev");
+ return;
+ }
+
+ if (!vhost_user_init(&i2c->vhost_user, &i2c->chardev, errp)) {
+ return;
+ }
+
+ virtio_init(vdev, VIRTIO_ID_I2C_ADAPTER, 0);
+
+ i2c->vhost_dev.nvqs = 1;
+ i2c->vq = virtio_add_queue(vdev, 4, vu_i2c_handle_output);
+ i2c->vhost_dev.vqs = g_new0(struct vhost_virtqueue, i2c->vhost_dev.nvqs);
+
+ ret = vhost_dev_init(&i2c->vhost_dev, &i2c->vhost_user,
+ VHOST_BACKEND_TYPE_USER, 0, errp);
+ if (ret < 0) {
+ do_vhost_user_cleanup(vdev, i2c);
+ return;
+ }
+
+ qemu_chr_fe_set_handlers(&i2c->chardev, NULL, NULL, vu_i2c_event, NULL,
+ dev, NULL, true);
+}
+
+static void vu_i2c_device_unrealize(DeviceState *dev)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VHostUserI2C *i2c = VHOST_USER_I2C(dev);
+
+ /* This will stop vhost backend if appropriate. */
+ vu_i2c_set_status(vdev, 0);
+ vhost_dev_cleanup(&i2c->vhost_dev);
+ do_vhost_user_cleanup(vdev, i2c);
+}
+
+static const VMStateDescription vu_i2c_vmstate = {
+ .name = "vhost-user-i2c",
+ .unmigratable = 1,
+};
+
+static Property vu_i2c_properties[] = {
+ DEFINE_PROP_CHR("chardev", VHostUserI2C, chardev),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vu_i2c_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+
+ device_class_set_props(dc, vu_i2c_properties);
+ dc->vmsd = &vu_i2c_vmstate;
+ set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
+ vdc->realize = vu_i2c_device_realize;
+ vdc->unrealize = vu_i2c_device_unrealize;
+ vdc->get_features = vu_i2c_get_features;
+ vdc->set_status = vu_i2c_set_status;
+ vdc->guest_notifier_mask = vu_i2c_guest_notifier_mask;
+ vdc->guest_notifier_pending = vu_i2c_guest_notifier_pending;
+}
+
+static const TypeInfo vu_i2c_info = {
+ .name = TYPE_VHOST_USER_I2C,
+ .parent = TYPE_VIRTIO_DEVICE,
+ .instance_size = sizeof(VHostUserI2C),
+ .class_init = vu_i2c_class_init,
+};
+
+static void vu_i2c_register_types(void)
+{
+ type_register_static(&vu_i2c_info);
+}
+
+type_init(vu_i2c_register_types)
diff --git a/hw/virtio/vhost-user-input-pci.c b/hw/virtio/vhost-user-input-pci.c
new file mode 100644
index 00000000..b858898a
--- /dev/null
+++ b/hw/virtio/vhost-user-input-pci.c
@@ -0,0 +1,50 @@
+/*
+ * This work is licensed under the terms of the GNU LGPL, version 2 or
+ * later. See the COPYING.LIB file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/virtio-input.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "hw/virtio/virtio-pci.h"
+#include "qom/object.h"
+
+typedef struct VHostUserInputPCI VHostUserInputPCI;
+
+#define TYPE_VHOST_USER_INPUT_PCI "vhost-user-input-pci"
+
+DECLARE_INSTANCE_CHECKER(VHostUserInputPCI, VHOST_USER_INPUT_PCI,
+ TYPE_VHOST_USER_INPUT_PCI)
+
+struct VHostUserInputPCI {
+ VirtIOPCIProxy parent_obj;
+ VHostUserInput vhi;
+};
+
+static void vhost_user_input_pci_instance_init(Object *obj)
+{
+ VHostUserInputPCI *dev = VHOST_USER_INPUT_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vhi, sizeof(dev->vhi),
+ TYPE_VHOST_USER_INPUT);
+
+ object_property_add_alias(obj, "chardev",
+ OBJECT(&dev->vhi), "chardev");
+}
+
+static const VirtioPCIDeviceTypeInfo vhost_user_input_pci_info = {
+ .generic_name = TYPE_VHOST_USER_INPUT_PCI,
+ .parent = TYPE_VIRTIO_INPUT_PCI,
+ .instance_size = sizeof(VHostUserInputPCI),
+ .instance_init = vhost_user_input_pci_instance_init,
+};
+
+static void vhost_user_input_pci_register(void)
+{
+ virtio_pci_types_register(&vhost_user_input_pci_info);
+}
+
+type_init(vhost_user_input_pci_register)
diff --git a/hw/virtio/vhost-user-rng-pci.c b/hw/virtio/vhost-user-rng-pci.c
new file mode 100644
index 00000000..f6493545
--- /dev/null
+++ b/hw/virtio/vhost-user-rng-pci.c
@@ -0,0 +1,79 @@
+/*
+ * Vhost-user RNG virtio device PCI glue
+ *
+ * Copyright (c) 2021 Mathieu Poirier <mathieu.poirier@linaro.org>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/vhost-user-rng.h"
+#include "hw/virtio/virtio-pci.h"
+
+struct VHostUserRNGPCI {
+ VirtIOPCIProxy parent_obj;
+ VHostUserRNG vdev;
+};
+
+typedef struct VHostUserRNGPCI VHostUserRNGPCI;
+
+#define TYPE_VHOST_USER_RNG_PCI "vhost-user-rng-pci-base"
+
+DECLARE_INSTANCE_CHECKER(VHostUserRNGPCI, VHOST_USER_RNG_PCI,
+ TYPE_VHOST_USER_RNG_PCI)
+
+static Property vhost_user_rng_pci_properties[] = {
+ DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors,
+ DEV_NVECTORS_UNSPECIFIED),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vhost_user_rng_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+ VHostUserRNGPCI *dev = VHOST_USER_RNG_PCI(vpci_dev);
+ DeviceState *vdev = DEVICE(&dev->vdev);
+
+ if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) {
+ vpci_dev->nvectors = 1;
+ }
+
+ qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
+}
+
+static void vhost_user_rng_pci_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+ PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+ k->realize = vhost_user_rng_pci_realize;
+ set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
+ device_class_set_props(dc, vhost_user_rng_pci_properties);
+ pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+ pcidev_k->device_id = 0; /* Set by virtio-pci based on virtio id */
+ pcidev_k->revision = 0x00;
+ pcidev_k->class_id = PCI_CLASS_OTHERS;
+}
+
+static void vhost_user_rng_pci_instance_init(Object *obj)
+{
+ VHostUserRNGPCI *dev = VHOST_USER_RNG_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VHOST_USER_RNG);
+}
+
+static const VirtioPCIDeviceTypeInfo vhost_user_rng_pci_info = {
+ .base_name = TYPE_VHOST_USER_RNG_PCI,
+ .non_transitional_name = "vhost-user-rng-pci",
+ .instance_size = sizeof(VHostUserRNGPCI),
+ .instance_init = vhost_user_rng_pci_instance_init,
+ .class_init = vhost_user_rng_pci_class_init,
+};
+
+static void vhost_user_rng_pci_register(void)
+{
+ virtio_pci_types_register(&vhost_user_rng_pci_info);
+}
+
+type_init(vhost_user_rng_pci_register);
diff --git a/hw/virtio/vhost-user-rng.c b/hw/virtio/vhost-user-rng.c
new file mode 100644
index 00000000..201a39e2
--- /dev/null
+++ b/hw/virtio/vhost-user-rng.c
@@ -0,0 +1,299 @@
+/*
+ * Vhost-user RNG virtio device
+ *
+ * Copyright (c) 2021 Mathieu Poirier <mathieu.poirier@linaro.org>
+ *
+ * Implementation closely modeled on vhost-user-i2c.c
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/vhost-user-rng.h"
+#include "qemu/error-report.h"
+#include "standard-headers/linux/virtio_ids.h"
+
+static const int feature_bits[] = {
+ VIRTIO_F_RING_RESET,
+ VHOST_INVALID_FEATURE_BIT
+};
+
+static void vu_rng_start(VirtIODevice *vdev)
+{
+ VHostUserRNG *rng = VHOST_USER_RNG(vdev);
+ BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+ int ret;
+ int i;
+
+ if (!k->set_guest_notifiers) {
+ error_report("binding does not support guest notifiers");
+ return;
+ }
+
+ ret = vhost_dev_enable_notifiers(&rng->vhost_dev, vdev);
+ if (ret < 0) {
+ error_report("Error enabling host notifiers: %d", -ret);
+ return;
+ }
+
+ ret = k->set_guest_notifiers(qbus->parent, rng->vhost_dev.nvqs, true);
+ if (ret < 0) {
+ error_report("Error binding guest notifier: %d", -ret);
+ goto err_host_notifiers;
+ }
+
+ rng->vhost_dev.acked_features = vdev->guest_features;
+ ret = vhost_dev_start(&rng->vhost_dev, vdev, true);
+ if (ret < 0) {
+ error_report("Error starting vhost-user-rng: %d", -ret);
+ goto err_guest_notifiers;
+ }
+
+ /*
+ * guest_notifier_mask/pending not used yet, so just unmask
+ * everything here. virtio-pci will do the right thing by
+ * enabling/disabling irqfd.
+ */
+ for (i = 0; i < rng->vhost_dev.nvqs; i++) {
+ vhost_virtqueue_mask(&rng->vhost_dev, vdev, i, false);
+ }
+
+ return;
+
+err_guest_notifiers:
+ k->set_guest_notifiers(qbus->parent, rng->vhost_dev.nvqs, false);
+err_host_notifiers:
+ vhost_dev_disable_notifiers(&rng->vhost_dev, vdev);
+}
+
+static void vu_rng_stop(VirtIODevice *vdev)
+{
+ VHostUserRNG *rng = VHOST_USER_RNG(vdev);
+ BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+ int ret;
+
+ if (!k->set_guest_notifiers) {
+ return;
+ }
+
+ vhost_dev_stop(&rng->vhost_dev, vdev, true);
+
+ ret = k->set_guest_notifiers(qbus->parent, rng->vhost_dev.nvqs, false);
+ if (ret < 0) {
+ error_report("vhost guest notifier cleanup failed: %d", ret);
+ return;
+ }
+
+ vhost_dev_disable_notifiers(&rng->vhost_dev, vdev);
+}
+
+static void vu_rng_set_status(VirtIODevice *vdev, uint8_t status)
+{
+ VHostUserRNG *rng = VHOST_USER_RNG(vdev);
+ bool should_start = virtio_device_should_start(vdev, status);
+
+ if (vhost_dev_is_started(&rng->vhost_dev) == should_start) {
+ return;
+ }
+
+ if (should_start) {
+ vu_rng_start(vdev);
+ } else {
+ vu_rng_stop(vdev);
+ }
+}
+
+static uint64_t vu_rng_get_features(VirtIODevice *vdev,
+ uint64_t requested_features, Error **errp)
+{
+ VHostUserRNG *rng = VHOST_USER_RNG(vdev);
+
+ return vhost_get_features(&rng->vhost_dev, feature_bits,
+ requested_features);
+}
+
+static void vu_rng_handle_output(VirtIODevice *vdev, VirtQueue *vq)
+{
+ /*
+ * Not normally called; the vhost-user daemon handles the queue. However,
+ * virtio's cleanup path can call this.
+ */
+}
+
+static void vu_rng_guest_notifier_mask(VirtIODevice *vdev, int idx, bool mask)
+{
+ VHostUserRNG *rng = VHOST_USER_RNG(vdev);
+
+ vhost_virtqueue_mask(&rng->vhost_dev, vdev, idx, mask);
+}
+
+static bool vu_rng_guest_notifier_pending(VirtIODevice *vdev, int idx)
+{
+ VHostUserRNG *rng = VHOST_USER_RNG(vdev);
+
+ return vhost_virtqueue_pending(&rng->vhost_dev, idx);
+}
+
+static void vu_rng_connect(DeviceState *dev)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VHostUserRNG *rng = VHOST_USER_RNG(vdev);
+
+ if (rng->connected) {
+ return;
+ }
+
+ rng->connected = true;
+
+ /* restore vhost state */
+ if (virtio_device_started(vdev, vdev->status)) {
+ vu_rng_start(vdev);
+ }
+}
+
+static void vu_rng_disconnect(DeviceState *dev)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VHostUserRNG *rng = VHOST_USER_RNG(vdev);
+
+ if (!rng->connected) {
+ return;
+ }
+
+ rng->connected = false;
+
+ if (vhost_dev_is_started(&rng->vhost_dev)) {
+ vu_rng_stop(vdev);
+ }
+}
+
+static void vu_rng_event(void *opaque, QEMUChrEvent event)
+{
+ DeviceState *dev = opaque;
+
+ switch (event) {
+ case CHR_EVENT_OPENED:
+ vu_rng_connect(dev);
+ break;
+ case CHR_EVENT_CLOSED:
+ vu_rng_disconnect(dev);
+ break;
+ case CHR_EVENT_BREAK:
+ case CHR_EVENT_MUX_IN:
+ case CHR_EVENT_MUX_OUT:
+ /* Ignore */
+ break;
+ }
+}
+
+static void vu_rng_device_realize(DeviceState *dev, Error **errp)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VHostUserRNG *rng = VHOST_USER_RNG(dev);
+ int ret;
+
+ if (!rng->chardev.chr) {
+ error_setg(errp, "missing chardev");
+ return;
+ }
+
+ if (!vhost_user_init(&rng->vhost_user, &rng->chardev, errp)) {
+ return;
+ }
+
+ virtio_init(vdev, VIRTIO_ID_RNG, 0);
+
+ rng->req_vq = virtio_add_queue(vdev, 4, vu_rng_handle_output);
+ if (!rng->req_vq) {
+ error_setg_errno(errp, -1, "virtio_add_queue() failed");
+ goto virtio_add_queue_failed;
+ }
+
+ rng->vhost_dev.nvqs = 1;
+ rng->vhost_dev.vqs = g_new0(struct vhost_virtqueue, rng->vhost_dev.nvqs);
+ ret = vhost_dev_init(&rng->vhost_dev, &rng->vhost_user,
+ VHOST_BACKEND_TYPE_USER, 0, errp);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "vhost_dev_init() failed");
+ goto vhost_dev_init_failed;
+ }
+
+ qemu_chr_fe_set_handlers(&rng->chardev, NULL, NULL, vu_rng_event, NULL,
+ dev, NULL, true);
+
+ return;
+
+vhost_dev_init_failed:
+ virtio_delete_queue(rng->req_vq);
+virtio_add_queue_failed:
+ virtio_cleanup(vdev);
+ vhost_user_cleanup(&rng->vhost_user);
+}
+
+static void vu_rng_device_unrealize(DeviceState *dev)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VHostUserRNG *rng = VHOST_USER_RNG(dev);
+
+ vu_rng_set_status(vdev, 0);
+
+ vhost_dev_cleanup(&rng->vhost_dev);
+ g_free(rng->vhost_dev.vqs);
+ rng->vhost_dev.vqs = NULL;
+ virtio_delete_queue(rng->req_vq);
+ virtio_cleanup(vdev);
+ vhost_user_cleanup(&rng->vhost_user);
+}
+
+static struct vhost_dev *vu_rng_get_vhost(VirtIODevice *vdev)
+{
+ VHostUserRNG *rng = VHOST_USER_RNG(vdev);
+ return &rng->vhost_dev;
+}
+
+static const VMStateDescription vu_rng_vmstate = {
+ .name = "vhost-user-rng",
+ .unmigratable = 1,
+};
+
+static Property vu_rng_properties[] = {
+ DEFINE_PROP_CHR("chardev", VHostUserRNG, chardev),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vu_rng_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+
+ device_class_set_props(dc, vu_rng_properties);
+ dc->vmsd = &vu_rng_vmstate;
+ set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
+
+ vdc->realize = vu_rng_device_realize;
+ vdc->unrealize = vu_rng_device_unrealize;
+ vdc->get_features = vu_rng_get_features;
+ vdc->set_status = vu_rng_set_status;
+ vdc->guest_notifier_mask = vu_rng_guest_notifier_mask;
+ vdc->guest_notifier_pending = vu_rng_guest_notifier_pending;
+ vdc->get_vhost = vu_rng_get_vhost;
+}
+
+static const TypeInfo vu_rng_info = {
+ .name = TYPE_VHOST_USER_RNG,
+ .parent = TYPE_VIRTIO_DEVICE,
+ .instance_size = sizeof(VHostUserRNG),
+ .class_init = vu_rng_class_init,
+};
+
+static void vu_rng_register_types(void)
+{
+ type_register_static(&vu_rng_info);
+}
+
+type_init(vu_rng_register_types)
diff --git a/hw/virtio/vhost-user-scsi-pci.c b/hw/virtio/vhost-user-scsi-pci.c
new file mode 100644
index 00000000..75882e3c
--- /dev/null
+++ b/hw/virtio/vhost-user-scsi-pci.c
@@ -0,0 +1,110 @@
+/*
+ * Vhost user scsi PCI Bindings
+ *
+ * Copyright (c) 2016 Nutanix Inc. All rights reserved.
+ *
+ * Author:
+ * Felipe Franciosi <felipe@nutanix.com>
+ *
+ * This work is largely based on the "vhost-scsi" implementation by:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ * Nicholas Bellinger <nab@risingtidesystems.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+
+#include "standard-headers/linux/virtio_pci.h"
+#include "hw/virtio/vhost-user-scsi.h"
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/virtio-scsi.h"
+#include "hw/pci/pci.h"
+#include "hw/qdev-properties.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/module.h"
+#include "hw/pci/msi.h"
+#include "hw/pci/msix.h"
+#include "hw/loader.h"
+#include "sysemu/kvm.h"
+#include "hw/virtio/virtio-pci.h"
+#include "qom/object.h"
+
+typedef struct VHostUserSCSIPCI VHostUserSCSIPCI;
+
+#define TYPE_VHOST_USER_SCSI_PCI "vhost-user-scsi-pci-base"
+DECLARE_INSTANCE_CHECKER(VHostUserSCSIPCI, VHOST_USER_SCSI_PCI,
+ TYPE_VHOST_USER_SCSI_PCI)
+
+struct VHostUserSCSIPCI {
+ VirtIOPCIProxy parent_obj;
+ VHostUserSCSI vdev;
+};
+
+static Property vhost_user_scsi_pci_properties[] = {
+ DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors,
+ DEV_NVECTORS_UNSPECIFIED),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vhost_user_scsi_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+ VHostUserSCSIPCI *dev = VHOST_USER_SCSI_PCI(vpci_dev);
+ DeviceState *vdev = DEVICE(&dev->vdev);
+ VirtIOSCSIConf *conf = &dev->vdev.parent_obj.parent_obj.conf;
+
+ if (conf->num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) {
+ conf->num_queues =
+ virtio_pci_optimal_num_queues(VIRTIO_SCSI_VQ_NUM_FIXED);
+ }
+
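+ /*
+ * Default to one MSI-X vector per virtqueue (the fixed queues plus
+ * the request queues) and one extra for config changes.
+ */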
+ if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) {
+ vpci_dev->nvectors = conf->num_queues + VIRTIO_SCSI_VQ_NUM_FIXED + 1;
+ }
+
+ qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
+}
+
+static void vhost_user_scsi_pci_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+ PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+ k->realize = vhost_user_scsi_pci_realize;
+ set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+ device_class_set_props(dc, vhost_user_scsi_pci_properties);
+ pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+ pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_SCSI;
+ pcidev_k->revision = 0x00;
+ pcidev_k->class_id = PCI_CLASS_STORAGE_SCSI;
+}
+
+static void vhost_user_scsi_pci_instance_init(Object *obj)
+{
+ VHostUserSCSIPCI *dev = VHOST_USER_SCSI_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VHOST_USER_SCSI);
+ object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev),
+ "bootindex");
+}
+
+static const VirtioPCIDeviceTypeInfo vhost_user_scsi_pci_info = {
+ .base_name = TYPE_VHOST_USER_SCSI_PCI,
+ .generic_name = "vhost-user-scsi-pci",
+ .transitional_name = "vhost-user-scsi-pci-transitional",
+ .non_transitional_name = "vhost-user-scsi-pci-non-transitional",
+ .instance_size = sizeof(VHostUserSCSIPCI),
+ .instance_init = vhost_user_scsi_pci_instance_init,
+ .class_init = vhost_user_scsi_pci_class_init,
+};
+
+static void vhost_user_scsi_pci_register(void)
+{
+ virtio_pci_types_register(&vhost_user_scsi_pci_info);
+}
+
+type_init(vhost_user_scsi_pci_register)
diff --git a/hw/virtio/vhost-user-vsock-pci.c b/hw/virtio/vhost-user-vsock-pci.c
new file mode 100644
index 00000000..e5a86e80
--- /dev/null
+++ b/hw/virtio/vhost-user-vsock-pci.c
@@ -0,0 +1,86 @@
+/*
+ * Vhost-user vsock PCI Bindings
+ *
+ * Copyright 2020 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version. See the COPYING file in the
+ * top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/virtio/virtio-pci.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/vhost-user-vsock.h"
+#include "qom/object.h"
+
+typedef struct VHostUserVSockPCI VHostUserVSockPCI;
+
+/*
+ * vhost-user-vsock-pci: This extends VirtioPCIProxy.
+ */
+#define TYPE_VHOST_USER_VSOCK_PCI "vhost-user-vsock-pci-base"
+DECLARE_INSTANCE_CHECKER(VHostUserVSockPCI, VHOST_USER_VSOCK_PCI,
+ TYPE_VHOST_USER_VSOCK_PCI)
+
+struct VHostUserVSockPCI {
+ VirtIOPCIProxy parent_obj;
+ VHostUserVSock vdev;
+};
+
+/* vhost-user-vsock-pci */
+
+static Property vhost_user_vsock_pci_properties[] = {
+ DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 3),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vhost_user_vsock_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+ VHostUserVSockPCI *dev = VHOST_USER_VSOCK_PCI(vpci_dev);
+ DeviceState *vdev = DEVICE(&dev->vdev);
+
+ /* unlike vhost-vsock, we do not need to care about pre-5.1 compat */
+ virtio_pci_force_virtio_1(vpci_dev);
+
+ qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
+}
+
+static void vhost_user_vsock_pci_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+ PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+ k->realize = vhost_user_vsock_pci_realize;
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+ device_class_set_props(dc, vhost_user_vsock_pci_properties);
+ pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+ pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_VSOCK;
+ pcidev_k->revision = 0x00;
+ pcidev_k->class_id = PCI_CLASS_COMMUNICATION_OTHER;
+}
+
+static void vhost_user_vsock_pci_instance_init(Object *obj)
+{
+ VHostUserVSockPCI *dev = VHOST_USER_VSOCK_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VHOST_USER_VSOCK);
+}
+
+static const VirtioPCIDeviceTypeInfo vhost_user_vsock_pci_info = {
+ .base_name = TYPE_VHOST_USER_VSOCK_PCI,
+ .generic_name = "vhost-user-vsock-pci",
+ .non_transitional_name = "vhost-user-vsock-pci-non-transitional",
+ .instance_size = sizeof(VHostUserVSockPCI),
+ .instance_init = vhost_user_vsock_pci_instance_init,
+ .class_init = vhost_user_vsock_pci_class_init,
+};
+
+static void virtio_pci_vhost_register(void)
+{
+ virtio_pci_types_register(&vhost_user_vsock_pci_info);
+}
+
+type_init(virtio_pci_vhost_register)
diff --git a/hw/virtio/vhost-user-vsock.c b/hw/virtio/vhost-user-vsock.c
new file mode 100644
index 00000000..9431b979
--- /dev/null
+++ b/hw/virtio/vhost-user-vsock.c
@@ -0,0 +1,180 @@
+/*
+ * Vhost-user vsock virtio device
+ *
+ * Copyright 2020 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version. See the COPYING file in the
+ * top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "hw/qdev-properties.h"
+#include "hw/qdev-properties-system.h"
+#include "hw/virtio/vhost-user-vsock.h"
+
+static const int user_feature_bits[] = {
+ VIRTIO_F_VERSION_1,
+ VIRTIO_RING_F_INDIRECT_DESC,
+ VIRTIO_RING_F_EVENT_IDX,
+ VIRTIO_F_NOTIFY_ON_EMPTY,
+ VHOST_INVALID_FEATURE_BIT
+};
+
+static void vuv_get_config(VirtIODevice *vdev, uint8_t *config)
+{
+ VHostUserVSock *vsock = VHOST_USER_VSOCK(vdev);
+
+ memcpy(config, &vsock->vsockcfg, sizeof(struct virtio_vsock_config));
+}
+
+static int vuv_handle_config_change(struct vhost_dev *dev)
+{
+ VHostUserVSock *vsock = VHOST_USER_VSOCK(dev->vdev);
+ Error *local_err = NULL;
+ int ret = vhost_dev_get_config(dev, (uint8_t *)&vsock->vsockcfg,
+ sizeof(struct virtio_vsock_config),
+ &local_err);
+ if (ret < 0) {
+ error_report_err(local_err);
+ return -1;
+ }
+
+ virtio_notify_config(dev->vdev);
+
+ return 0;
+}
+
+const VhostDevConfigOps vsock_ops = {
+ .vhost_dev_config_notifier = vuv_handle_config_change,
+};
+
+static void vuv_set_status(VirtIODevice *vdev, uint8_t status)
+{
+ VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
+ bool should_start = virtio_device_should_start(vdev, status);
+
+ if (vhost_dev_is_started(&vvc->vhost_dev) == should_start) {
+ return;
+ }
+
+ if (should_start) {
+ int ret = vhost_vsock_common_start(vdev);
+ if (ret < 0) {
+ return;
+ }
+ } else {
+ vhost_vsock_common_stop(vdev);
+ }
+}
+
+static uint64_t vuv_get_features(VirtIODevice *vdev,
+ uint64_t features,
+ Error **errp)
+{
+ VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
+
+ features = vhost_get_features(&vvc->vhost_dev, user_feature_bits, features);
+
+ return vhost_vsock_common_get_features(vdev, features, errp);
+}
+
+static const VMStateDescription vuv_vmstate = {
+ .name = "vhost-user-vsock",
+ .unmigratable = 1,
+};
+
+static void vuv_device_realize(DeviceState *dev, Error **errp)
+{
+ VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(dev);
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VHostUserVSock *vsock = VHOST_USER_VSOCK(dev);
+ int ret;
+
+ if (!vsock->conf.chardev.chr) {
+ error_setg(errp, "missing chardev");
+ return;
+ }
+
+ if (!vhost_user_init(&vsock->vhost_user, &vsock->conf.chardev, errp)) {
+ return;
+ }
+
+ vhost_vsock_common_realize(vdev);
+
+ vhost_dev_set_config_notifier(&vvc->vhost_dev, &vsock_ops);
+
+ ret = vhost_dev_init(&vvc->vhost_dev, &vsock->vhost_user,
+ VHOST_BACKEND_TYPE_USER, 0, errp);
+ if (ret < 0) {
+ goto err_virtio;
+ }
+
+ ret = vhost_dev_get_config(&vvc->vhost_dev, (uint8_t *)&vsock->vsockcfg,
+ sizeof(struct virtio_vsock_config), errp);
+ if (ret < 0) {
+ goto err_vhost_dev;
+ }
+
+ return;
+
+err_vhost_dev:
+ vhost_dev_cleanup(&vvc->vhost_dev);
+err_virtio:
+ vhost_vsock_common_unrealize(vdev);
+ vhost_user_cleanup(&vsock->vhost_user);
+}
+
+static void vuv_device_unrealize(DeviceState *dev)
+{
+ VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(dev);
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VHostUserVSock *vsock = VHOST_USER_VSOCK(dev);
+
+ /* This will stop vhost backend if appropriate. */
+ vuv_set_status(vdev, 0);
+
+ vhost_dev_cleanup(&vvc->vhost_dev);
+
+ vhost_vsock_common_unrealize(vdev);
+
+ vhost_user_cleanup(&vsock->vhost_user);
+}
+
+static Property vuv_properties[] = {
+ DEFINE_PROP_CHR("chardev", VHostUserVSock, conf.chardev),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vuv_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+
+ device_class_set_props(dc, vuv_properties);
+ dc->vmsd = &vuv_vmstate;
+ vdc->realize = vuv_device_realize;
+ vdc->unrealize = vuv_device_unrealize;
+ vdc->get_features = vuv_get_features;
+ vdc->get_config = vuv_get_config;
+ vdc->set_status = vuv_set_status;
+}
+
+static const TypeInfo vuv_info = {
+ .name = TYPE_VHOST_USER_VSOCK,
+ .parent = TYPE_VHOST_VSOCK_COMMON,
+ .instance_size = sizeof(VHostUserVSock),
+ .class_init = vuv_class_init,
+};
+
+static void vuv_register_types(void)
+{
+ type_register_static(&vuv_info);
+}
+
+type_init(vuv_register_types)
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
new file mode 100644
index 00000000..8f635844
--- /dev/null
+++ b/hw/virtio/vhost-user.c
@@ -0,0 +1,2800 @@
+/*
+ * vhost-user
+ *
+ * Copyright (c) 2013 Virtual Open Systems Sarl.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "hw/virtio/vhost.h"
+#include "hw/virtio/vhost-user.h"
+#include "hw/virtio/vhost-backend.h"
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/virtio-net.h"
+#include "chardev/char-fe.h"
+#include "io/channel-socket.h"
+#include "sysemu/kvm.h"
+#include "qemu/error-report.h"
+#include "qemu/main-loop.h"
+#include "qemu/sockets.h"
+#include "sysemu/runstate.h"
+#include "sysemu/cryptodev.h"
+#include "migration/migration.h"
+#include "migration/postcopy-ram.h"
+#include "trace.h"
+#include "exec/ramblock.h"
+
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "standard-headers/linux/vhost_types.h"
+
+#ifdef CONFIG_LINUX
+#include <linux/userfaultfd.h>
+#endif
+
+#define VHOST_MEMORY_BASELINE_NREGIONS 8
+#define VHOST_USER_F_PROTOCOL_FEATURES 30
+#define VHOST_USER_SLAVE_MAX_FDS 8
+
+/*
+ * Set the maximum number of RAM slots supported to the maximum
+ * number supported by the target hardware platform.
+ */
+#if defined(TARGET_X86) || defined(TARGET_X86_64) || \
+ defined(TARGET_ARM) || defined(TARGET_ARM_64)
+#include "hw/acpi/acpi.h"
+#define VHOST_USER_MAX_RAM_SLOTS ACPI_MAX_RAM_SLOTS
+
+#elif defined(TARGET_PPC) || defined(TARGET_PPC64)
+#include "hw/ppc/spapr.h"
+#define VHOST_USER_MAX_RAM_SLOTS SPAPR_MAX_RAM_SLOTS
+
+#else
+#define VHOST_USER_MAX_RAM_SLOTS 512
+#endif
+
+/*
+ * Maximum size of virtio device config space
+ */
+#define VHOST_USER_MAX_CONFIG_SIZE 256
+
+enum VhostUserProtocolFeature {
+ VHOST_USER_PROTOCOL_F_MQ = 0,
+ VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1,
+ VHOST_USER_PROTOCOL_F_RARP = 2,
+ VHOST_USER_PROTOCOL_F_REPLY_ACK = 3,
+ VHOST_USER_PROTOCOL_F_NET_MTU = 4,
+ VHOST_USER_PROTOCOL_F_SLAVE_REQ = 5,
+ VHOST_USER_PROTOCOL_F_CROSS_ENDIAN = 6,
+ VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7,
+ VHOST_USER_PROTOCOL_F_PAGEFAULT = 8,
+ VHOST_USER_PROTOCOL_F_CONFIG = 9,
+ VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
+ VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
+ VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12,
+ VHOST_USER_PROTOCOL_F_RESET_DEVICE = 13,
+ /* Feature 14 reserved for VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS. */
+ VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15,
+ VHOST_USER_PROTOCOL_F_STATUS = 16,
+ VHOST_USER_PROTOCOL_F_MAX
+};
+
+#define VHOST_USER_PROTOCOL_FEATURE_MASK ((1 << VHOST_USER_PROTOCOL_F_MAX) - 1)
+
+typedef enum VhostUserRequest {
+ VHOST_USER_NONE = 0,
+ VHOST_USER_GET_FEATURES = 1,
+ VHOST_USER_SET_FEATURES = 2,
+ VHOST_USER_SET_OWNER = 3,
+ VHOST_USER_RESET_OWNER = 4,
+ VHOST_USER_SET_MEM_TABLE = 5,
+ VHOST_USER_SET_LOG_BASE = 6,
+ VHOST_USER_SET_LOG_FD = 7,
+ VHOST_USER_SET_VRING_NUM = 8,
+ VHOST_USER_SET_VRING_ADDR = 9,
+ VHOST_USER_SET_VRING_BASE = 10,
+ VHOST_USER_GET_VRING_BASE = 11,
+ VHOST_USER_SET_VRING_KICK = 12,
+ VHOST_USER_SET_VRING_CALL = 13,
+ VHOST_USER_SET_VRING_ERR = 14,
+ VHOST_USER_GET_PROTOCOL_FEATURES = 15,
+ VHOST_USER_SET_PROTOCOL_FEATURES = 16,
+ VHOST_USER_GET_QUEUE_NUM = 17,
+ VHOST_USER_SET_VRING_ENABLE = 18,
+ VHOST_USER_SEND_RARP = 19,
+ VHOST_USER_NET_SET_MTU = 20,
+ VHOST_USER_SET_SLAVE_REQ_FD = 21,
+ VHOST_USER_IOTLB_MSG = 22,
+ VHOST_USER_SET_VRING_ENDIAN = 23,
+ VHOST_USER_GET_CONFIG = 24,
+ VHOST_USER_SET_CONFIG = 25,
+ VHOST_USER_CREATE_CRYPTO_SESSION = 26,
+ VHOST_USER_CLOSE_CRYPTO_SESSION = 27,
+ VHOST_USER_POSTCOPY_ADVISE = 28,
+ VHOST_USER_POSTCOPY_LISTEN = 29,
+ VHOST_USER_POSTCOPY_END = 30,
+ VHOST_USER_GET_INFLIGHT_FD = 31,
+ VHOST_USER_SET_INFLIGHT_FD = 32,
+ VHOST_USER_GPU_SET_SOCKET = 33,
+ VHOST_USER_RESET_DEVICE = 34,
+ /* Message number 35 reserved for VHOST_USER_VRING_KICK. */
+ VHOST_USER_GET_MAX_MEM_SLOTS = 36,
+ VHOST_USER_ADD_MEM_REG = 37,
+ VHOST_USER_REM_MEM_REG = 38,
+ VHOST_USER_SET_STATUS = 39,
+ VHOST_USER_GET_STATUS = 40,
+ VHOST_USER_MAX
+} VhostUserRequest;
+
+typedef enum VhostUserSlaveRequest {
+ VHOST_USER_SLAVE_NONE = 0,
+ VHOST_USER_SLAVE_IOTLB_MSG = 1,
+ VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,
+ VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3,
+ VHOST_USER_SLAVE_MAX
+} VhostUserSlaveRequest;
+
+typedef struct VhostUserMemoryRegion {
+ uint64_t guest_phys_addr;
+ uint64_t memory_size;
+ uint64_t userspace_addr;
+ uint64_t mmap_offset;
+} VhostUserMemoryRegion;
+
+typedef struct VhostUserMemory {
+ uint32_t nregions;
+ uint32_t padding;
+ VhostUserMemoryRegion regions[VHOST_MEMORY_BASELINE_NREGIONS];
+} VhostUserMemory;
+
+typedef struct VhostUserMemRegMsg {
+ uint64_t padding;
+ VhostUserMemoryRegion region;
+} VhostUserMemRegMsg;
+
+typedef struct VhostUserLog {
+ uint64_t mmap_size;
+ uint64_t mmap_offset;
+} VhostUserLog;
+
+typedef struct VhostUserConfig {
+ uint32_t offset;
+ uint32_t size;
+ uint32_t flags;
+ uint8_t region[VHOST_USER_MAX_CONFIG_SIZE];
+} VhostUserConfig;
+
+#define VHOST_CRYPTO_SYM_HMAC_MAX_KEY_LEN 512
+#define VHOST_CRYPTO_SYM_CIPHER_MAX_KEY_LEN 64
+
+typedef struct VhostUserCryptoSession {
+ /* session id for success, -1 on errors */
+ int64_t session_id;
+ CryptoDevBackendSymSessionInfo session_setup_data;
+ uint8_t key[VHOST_CRYPTO_SYM_CIPHER_MAX_KEY_LEN];
+ uint8_t auth_key[VHOST_CRYPTO_SYM_HMAC_MAX_KEY_LEN];
+} VhostUserCryptoSession;
+
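+ /* Dummy instance, used only to compute the config header size below. */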
+static VhostUserConfig c __attribute__ ((unused));
+#define VHOST_USER_CONFIG_HDR_SIZE (sizeof(c.offset) \
+ + sizeof(c.size) \
+ + sizeof(c.flags))
+
+typedef struct VhostUserVringArea {
+ uint64_t u64;
+ uint64_t size;
+ uint64_t offset;
+} VhostUserVringArea;
+
+typedef struct VhostUserInflight {
+ uint64_t mmap_size;
+ uint64_t mmap_offset;
+ uint16_t num_queues;
+ uint16_t queue_size;
+} VhostUserInflight;
+
+typedef struct {
+ VhostUserRequest request;
+
+#define VHOST_USER_VERSION_MASK (0x3)
+#define VHOST_USER_REPLY_MASK (0x1 << 2)
+#define VHOST_USER_NEED_REPLY_MASK (0x1 << 3)
+ uint32_t flags;
+ uint32_t size; /* the following payload size */
+} QEMU_PACKED VhostUserHeader;
+
+typedef union {
+#define VHOST_USER_VRING_IDX_MASK (0xff)
+#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8)
+ uint64_t u64;
+ struct vhost_vring_state state;
+ struct vhost_vring_addr addr;
+ VhostUserMemory memory;
+ VhostUserMemRegMsg mem_reg;
+ VhostUserLog log;
+ struct vhost_iotlb_msg iotlb;
+ VhostUserConfig config;
+ VhostUserCryptoSession session;
+ VhostUserVringArea area;
+ VhostUserInflight inflight;
+} VhostUserPayload;
+
+typedef struct VhostUserMsg {
+ VhostUserHeader hdr;
+ VhostUserPayload payload;
+} QEMU_PACKED VhostUserMsg;
+
+static VhostUserMsg m __attribute__ ((unused));
+#define VHOST_USER_HDR_SIZE (sizeof(VhostUserHeader))
+
+#define VHOST_USER_PAYLOAD_SIZE (sizeof(VhostUserPayload))
+
+/* The version of the protocol we support */
+#define VHOST_USER_VERSION (0x1)
+
+struct vhost_user {
+ struct vhost_dev *dev;
+ /* Shared between vhost devs of the same virtio device */
+ VhostUserState *user;
+ QIOChannel *slave_ioc;
+ GSource *slave_src;
+ NotifierWithReturn postcopy_notifier;
+ struct PostCopyFD postcopy_fd;
+ uint64_t postcopy_client_bases[VHOST_USER_MAX_RAM_SLOTS];
+ /* Length of the region_rb and region_rb_offset arrays */
+ size_t region_rb_len;
+ /* RAMBlock associated with a given region */
+ RAMBlock **region_rb;
+ /*
+ * The offset from the start of the RAMBlock to the start of the
+ * vhost region.
+ */
+ ram_addr_t *region_rb_offset;
+
+ /* True once we've entered postcopy_listen */
+ bool postcopy_listen;
+
+ /* Our current regions */
+ int num_shadow_regions;
+ struct vhost_memory_region shadow_regions[VHOST_USER_MAX_RAM_SLOTS];
+};
+
+struct scrub_regions {
+ struct vhost_memory_region *region;
+ int reg_idx;
+ int fd_idx;
+};
+
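+ /*
+ * Vring kick/call fds can be handed to the backend unless we run
+ * under KVM without eventfd support.
+ */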
+static bool ioeventfd_enabled(void)
+{
+ return !kvm_enabled() || kvm_eventfds_enabled();
+}
+
+static int vhost_user_read_header(struct vhost_dev *dev, VhostUserMsg *msg)
+{
+ struct vhost_user *u = dev->opaque;
+ CharBackend *chr = u->user->chr;
+ uint8_t *p = (uint8_t *) msg;
+ int r, size = VHOST_USER_HDR_SIZE;
+
+ r = qemu_chr_fe_read_all(chr, p, size);
+ if (r != size) {
+ int saved_errno = errno;
+ error_report("Failed to read msg header. Read %d instead of %d."
+ " Original request %d.", r, size, msg->hdr.request);
+ return r < 0 ? -saved_errno : -EIO;
+ }
+
+ /* validate received flags */
+ if (msg->hdr.flags != (VHOST_USER_REPLY_MASK | VHOST_USER_VERSION)) {
+ error_report("Failed to read msg header."
+ " Flags 0x%x instead of 0x%x.", msg->hdr.flags,
+ VHOST_USER_REPLY_MASK | VHOST_USER_VERSION);
+ return -EPROTO;
+ }
+
+ trace_vhost_user_read(msg->hdr.request, msg->hdr.flags);
+
+ return 0;
+}
+
+struct vhost_user_read_cb_data {
+ struct vhost_dev *dev;
+ VhostUserMsg *msg;
+ GMainLoop *loop;
+ int ret;
+};
+
+static gboolean vhost_user_read_cb(void *do_not_use, GIOCondition condition,
+ gpointer opaque)
+{
+ struct vhost_user_read_cb_data *data = opaque;
+ struct vhost_dev *dev = data->dev;
+ VhostUserMsg *msg = data->msg;
+ struct vhost_user *u = dev->opaque;
+ CharBackend *chr = u->user->chr;
+ uint8_t *p = (uint8_t *) msg;
+ int r, size;
+
+ r = vhost_user_read_header(dev, msg);
+ if (r < 0) {
+ data->ret = r;
+ goto end;
+ }
+
+ /* validate message size is sane */
+ if (msg->hdr.size > VHOST_USER_PAYLOAD_SIZE) {
+ error_report("Failed to read msg header."
+ " Size %d exceeds the maximum %zu.", msg->hdr.size,
+ VHOST_USER_PAYLOAD_SIZE);
+ data->ret = -EPROTO;
+ goto end;
+ }
+
+ if (msg->hdr.size) {
+ p += VHOST_USER_HDR_SIZE;
+ size = msg->hdr.size;
+ r = qemu_chr_fe_read_all(chr, p, size);
+ if (r != size) {
+ int saved_errno = errno;
+ error_report("Failed to read msg payload."
+ " Read %d instead of %d.", r, msg->hdr.size);
+ data->ret = r < 0 ? -saved_errno : -EIO;
+ goto end;
+ }
+ }
+
+end:
+ g_main_loop_quit(data->loop);
+ return G_SOURCE_REMOVE;
+}
+
+static gboolean slave_read(QIOChannel *ioc, GIOCondition condition,
+ gpointer opaque);
+
+/*
+ * This updates the read handler to use a new event loop context.
+ * Event sources are removed from the previous context : this ensures
+ * that events detected in the previous context are purged. They will
+ * be re-detected and processed in the new context.
+ */
+static void slave_update_read_handler(struct vhost_dev *dev,
+ GMainContext *ctxt)
+{
+ struct vhost_user *u = dev->opaque;
+
+ if (!u->slave_ioc) {
+ return;
+ }
+
+ if (u->slave_src) {
+ g_source_destroy(u->slave_src);
+ g_source_unref(u->slave_src);
+ }
+
+ u->slave_src = qio_channel_add_watch_source(u->slave_ioc,
+ G_IO_IN | G_IO_HUP,
+ slave_read, dev, NULL,
+ ctxt);
+}
+
+static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg)
+{
+ struct vhost_user *u = dev->opaque;
+ CharBackend *chr = u->user->chr;
+ GMainContext *prev_ctxt = chr->chr->gcontext;
+ GMainContext *ctxt = g_main_context_new();
+ GMainLoop *loop = g_main_loop_new(ctxt, FALSE);
+ struct vhost_user_read_cb_data data = {
+ .dev = dev,
+ .loop = loop,
+ .msg = msg,
+ .ret = 0
+ };
+
+ /*
+ * We want to be able to monitor the slave channel fd while waiting
+ * for chr I/O. This requires an event loop, but we can't nest the
+ * one to which chr is currently attached: its fd handlers might not
+ * be prepared for re-entrancy. So we create a new one and switch chr
+ * to use it.
+ */
+ slave_update_read_handler(dev, ctxt);
+ qemu_chr_be_update_read_handlers(chr->chr, ctxt);
+ qemu_chr_fe_add_watch(chr, G_IO_IN | G_IO_HUP, vhost_user_read_cb, &data);
+
+ g_main_loop_run(loop);
+
+ /*
+ * Restore the previous event loop context. This also destroys/recreates
+ * event sources: this guarantees that all pending events in the original
+ * context that have been processed by the nested loop are purged.
+ */
+ qemu_chr_be_update_read_handlers(chr->chr, prev_ctxt);
+ slave_update_read_handler(dev, NULL);
+
+ g_main_loop_unref(loop);
+ g_main_context_unref(ctxt);
+
+ return data.ret;
+}
+
+static int process_message_reply(struct vhost_dev *dev,
+ const VhostUserMsg *msg)
+{
+ int ret;
+ VhostUserMsg msg_reply;
+
+ if ((msg->hdr.flags & VHOST_USER_NEED_REPLY_MASK) == 0) {
+ return 0;
+ }
+
+ ret = vhost_user_read(dev, &msg_reply);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (msg_reply.hdr.request != msg->hdr.request) {
+ error_report("Received unexpected msg type. "
+ "Expected %d received %d",
+ msg->hdr.request, msg_reply.hdr.request);
+ return -EPROTO;
+ }
+
+ return msg_reply.payload.u64 ? -EIO : 0;
+}
+
+static bool vhost_user_one_time_request(VhostUserRequest request)
+{
+ switch (request) {
+ case VHOST_USER_SET_OWNER:
+ case VHOST_USER_RESET_OWNER:
+ case VHOST_USER_SET_MEM_TABLE:
+ case VHOST_USER_GET_QUEUE_NUM:
+ case VHOST_USER_NET_SET_MTU:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/* most non-init callers ignore the error */
+static int vhost_user_write(struct vhost_dev *dev, VhostUserMsg *msg,
+ int *fds, int fd_num)
+{
+ struct vhost_user *u = dev->opaque;
+ CharBackend *chr = u->user->chr;
+ int ret, size = VHOST_USER_HDR_SIZE + msg->hdr.size;
+
+ /*
+ * Non-vring specific requests, like VHOST_USER_SET_MEM_TABLE, only
+ * need to be sent once; any later occurrence of such a request is
+ * simply ignored.
+ */
+ if (vhost_user_one_time_request(msg->hdr.request) && dev->vq_index != 0) {
+ msg->hdr.flags &= ~VHOST_USER_NEED_REPLY_MASK;
+ return 0;
+ }
+
+ if (qemu_chr_fe_set_msgfds(chr, fds, fd_num) < 0) {
+ error_report("Failed to set msg fds.");
+ return -EINVAL;
+ }
+
+ ret = qemu_chr_fe_write_all(chr, (const uint8_t *) msg, size);
+ if (ret != size) {
+ int saved_errno = errno;
+ error_report("Failed to write msg."
+ " Wrote %d instead of %d.", ret, size);
+ return ret < 0 ? -saved_errno : -EIO;
+ }
+
+ trace_vhost_user_write(msg->hdr.request, msg->hdr.flags);
+
+ return 0;
+}
+
+int vhost_user_gpu_set_socket(struct vhost_dev *dev, int fd)
+{
+ VhostUserMsg msg = {
+ .hdr.request = VHOST_USER_GPU_SET_SOCKET,
+ .hdr.flags = VHOST_USER_VERSION,
+ };
+
+ return vhost_user_write(dev, &msg, &fd, 1);
+}
+
+static int vhost_user_set_log_base(struct vhost_dev *dev, uint64_t base,
+ struct vhost_log *log)
+{
+ int fds[VHOST_USER_MAX_RAM_SLOTS];
+ size_t fd_num = 0;
+ bool shmfd = virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_LOG_SHMFD);
+ int ret;
+ VhostUserMsg msg = {
+ .hdr.request = VHOST_USER_SET_LOG_BASE,
+ .hdr.flags = VHOST_USER_VERSION,
+ .payload.log.mmap_size = log->size * sizeof(*(log->log)),
+ .payload.log.mmap_offset = 0,
+ .hdr.size = sizeof(msg.payload.log),
+ };
+
+ if (shmfd && log->fd != -1) {
+ fds[fd_num++] = log->fd;
+ }
+
+ ret = vhost_user_write(dev, &msg, fds, fd_num);
+ if (ret < 0) {
+ return ret;
+ }
+
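+ /*
+ * With LOG_SHMFD the backend replies once it has mapped the new log
+ * region, so wait for that acknowledgement before proceeding.
+ */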
+ if (shmfd) {
+ msg.hdr.size = 0;
+ ret = vhost_user_read(dev, &msg);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (msg.hdr.request != VHOST_USER_SET_LOG_BASE) {
+ error_report("Received unexpected msg type. "
+ "Expected %d received %d",
+ VHOST_USER_SET_LOG_BASE, msg.hdr.request);
+ return -EPROTO;
+ }
+ }
+
+ return 0;
+}
+
+static MemoryRegion *vhost_user_get_mr_data(uint64_t addr, ram_addr_t *offset,
+ int *fd)
+{
+ MemoryRegion *mr;
+
+ assert((uintptr_t)addr == addr);
+ mr = memory_region_from_host((void *)(uintptr_t)addr, offset);
+ *fd = memory_region_get_fd(mr);
+
+ return mr;
+}
+
+static void vhost_user_fill_msg_region(VhostUserMemoryRegion *dst,
+ struct vhost_memory_region *src,
+ uint64_t mmap_offset)
+{
+ assert(src != NULL && dst != NULL);
+ dst->userspace_addr = src->userspace_addr;
+ dst->memory_size = src->memory_size;
+ dst->guest_phys_addr = src->guest_phys_addr;
+ dst->mmap_offset = mmap_offset;
+}
+
+static int vhost_user_fill_set_mem_table_msg(struct vhost_user *u,
+ struct vhost_dev *dev,
+ VhostUserMsg *msg,
+ int *fds, size_t *fd_num,
+ bool track_ramblocks)
+{
+ int i, fd;
+ ram_addr_t offset;
+ MemoryRegion *mr;
+ struct vhost_memory_region *reg;
+ VhostUserMemoryRegion region_buffer;
+
+ msg->hdr.request = VHOST_USER_SET_MEM_TABLE;
+
+ for (i = 0; i < dev->mem->nregions; ++i) {
+ reg = dev->mem->regions + i;
+
+ mr = vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd);
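+ /* Only fd-backed regions can be shared with the vhost-user backend. */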
+ if (fd > 0) {
+ if (track_ramblocks) {
+ assert(*fd_num < VHOST_MEMORY_BASELINE_NREGIONS);
+ trace_vhost_user_set_mem_table_withfd(*fd_num, mr->name,
+ reg->memory_size,
+ reg->guest_phys_addr,
+ reg->userspace_addr,
+ offset);
+ u->region_rb_offset[i] = offset;
+ u->region_rb[i] = mr->ram_block;
+ } else if (*fd_num == VHOST_MEMORY_BASELINE_NREGIONS) {
+ error_report("Failed preparing vhost-user memory table msg");
+ return -ENOBUFS;
+ }
+ vhost_user_fill_msg_region(&region_buffer, reg, offset);
+ msg->payload.memory.regions[*fd_num] = region_buffer;
+ fds[(*fd_num)++] = fd;
+ } else if (track_ramblocks) {
+ u->region_rb_offset[i] = 0;
+ u->region_rb[i] = NULL;
+ }
+ }
+
+ msg->payload.memory.nregions = *fd_num;
+
+ if (!*fd_num) {
+ error_report("Failed initializing vhost-user memory map, "
+ "consider using -object memory-backend-file share=on");
+ return -EINVAL;
+ }
+
+ msg->hdr.size = sizeof(msg->payload.memory.nregions);
+ msg->hdr.size += sizeof(msg->payload.memory.padding);
+ msg->hdr.size += *fd_num * sizeof(VhostUserMemoryRegion);
+
+ return 0;
+}
+
+static inline bool reg_equal(struct vhost_memory_region *shadow_reg,
+ struct vhost_memory_region *vdev_reg)
+{
+ return shadow_reg->guest_phys_addr == vdev_reg->guest_phys_addr &&
+ shadow_reg->userspace_addr == vdev_reg->userspace_addr &&
+ shadow_reg->memory_size == vdev_reg->memory_size;
+}
+
+static void scrub_shadow_regions(struct vhost_dev *dev,
+ struct scrub_regions *add_reg,
+ int *nr_add_reg,
+ struct scrub_regions *rem_reg,
+ int *nr_rem_reg, uint64_t *shadow_pcb,
+ bool track_ramblocks)
+{
+ struct vhost_user *u = dev->opaque;
+ bool found[VHOST_USER_MAX_RAM_SLOTS] = {};
+ struct vhost_memory_region *reg, *shadow_reg;
+ int i, j, fd, add_idx = 0, rm_idx = 0, fd_num = 0;
+ ram_addr_t offset;
+ MemoryRegion *mr;
+ bool matching;
+
+ /*
+ * Find memory regions present in our shadow state which are not in
+ * the device's current memory state.
+ *
+ * Mark regions in both the shadow and device state as "found".
+ */
+ for (i = 0; i < u->num_shadow_regions; i++) {
+ shadow_reg = &u->shadow_regions[i];
+ matching = false;
+
+ for (j = 0; j < dev->mem->nregions; j++) {
+ reg = &dev->mem->regions[j];
+
+ mr = vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd);
+
+ if (reg_equal(shadow_reg, reg)) {
+ matching = true;
+ found[j] = true;
+ if (track_ramblocks) {
+ /*
+ * Reset postcopy client bases, region_rb, and
+ * region_rb_offset in case regions are removed.
+ */
+ if (fd > 0) {
+ u->region_rb_offset[j] = offset;
+ u->region_rb[j] = mr->ram_block;
+ shadow_pcb[j] = u->postcopy_client_bases[i];
+ } else {
+ u->region_rb_offset[j] = 0;
+ u->region_rb[j] = NULL;
+ }
+ }
+ break;
+ }
+ }
+
+ /*
+ * If the region was not found in the current device memory state
+ * create an entry for it in the removed list.
+ */
+ if (!matching) {
+ rem_reg[rm_idx].region = shadow_reg;
+ rem_reg[rm_idx++].reg_idx = i;
+ }
+ }
+
+ /*
+ * For regions not marked "found", create entries in the added list.
+ *
+ * Note their indexes in the device memory state and the indexes of their
+ * file descriptors.
+ */
+ for (i = 0; i < dev->mem->nregions; i++) {
+ reg = &dev->mem->regions[i];
+ vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd);
+ if (fd > 0) {
+ ++fd_num;
+ }
+
+ /*
+ * If the region was in both the shadow and device state we don't
+ * need to send a VHOST_USER_ADD_MEM_REG message for it.
+ */
+ if (found[i]) {
+ continue;
+ }
+
+ add_reg[add_idx].region = reg;
+ add_reg[add_idx].reg_idx = i;
+ add_reg[add_idx++].fd_idx = fd_num;
+ }
+ *nr_rem_reg = rm_idx;
+ *nr_add_reg = add_idx;
+}
+
+static int send_remove_regions(struct vhost_dev *dev,
+ struct scrub_regions *remove_reg,
+ int nr_rem_reg, VhostUserMsg *msg,
+ bool reply_supported)
+{
+ struct vhost_user *u = dev->opaque;
+ struct vhost_memory_region *shadow_reg;
+ int i, fd, shadow_reg_idx, ret;
+ ram_addr_t offset;
+ VhostUserMemoryRegion region_buffer;
+
+ /*
+ * The regions in remove_reg appear in the same order they do in the
+ * shadow table. Therefore we can minimize memory copies by iterating
+ * through remove_reg backwards.
+ */
+ for (i = nr_rem_reg - 1; i >= 0; i--) {
+ shadow_reg = remove_reg[i].region;
+ shadow_reg_idx = remove_reg[i].reg_idx;
+
+ vhost_user_get_mr_data(shadow_reg->userspace_addr, &offset, &fd);
+
+ if (fd > 0) {
+ msg->hdr.request = VHOST_USER_REM_MEM_REG;
+ vhost_user_fill_msg_region(&region_buffer, shadow_reg, 0);
+ msg->payload.mem_reg.region = region_buffer;
+
+ ret = vhost_user_write(dev, msg, NULL, 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (reply_supported) {
+ ret = process_message_reply(dev, msg);
+ if (ret) {
+ return ret;
+ }
+ }
+ }
+
+ /*
+ * At this point we know the backend has unmapped the region. It is now
+ * safe to remove it from the shadow table.
+ */
+ memmove(&u->shadow_regions[shadow_reg_idx],
+ &u->shadow_regions[shadow_reg_idx + 1],
+ sizeof(struct vhost_memory_region) *
+ (u->num_shadow_regions - shadow_reg_idx - 1));
+ u->num_shadow_regions--;
+ }
+
+ return 0;
+}
+
+static int send_add_regions(struct vhost_dev *dev,
+ struct scrub_regions *add_reg, int nr_add_reg,
+ VhostUserMsg *msg, uint64_t *shadow_pcb,
+ bool reply_supported, bool track_ramblocks)
+{
+ struct vhost_user *u = dev->opaque;
+ int i, fd, ret, reg_idx, reg_fd_idx;
+ struct vhost_memory_region *reg;
+ MemoryRegion *mr;
+ ram_addr_t offset;
+ VhostUserMsg msg_reply;
+ VhostUserMemoryRegion region_buffer;
+
+ for (i = 0; i < nr_add_reg; i++) {
+ reg = add_reg[i].region;
+ reg_idx = add_reg[i].reg_idx;
+ reg_fd_idx = add_reg[i].fd_idx;
+
+ mr = vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd);
+
+ if (fd > 0) {
+ if (track_ramblocks) {
+ trace_vhost_user_set_mem_table_withfd(reg_fd_idx, mr->name,
+ reg->memory_size,
+ reg->guest_phys_addr,
+ reg->userspace_addr,
+ offset);
+ u->region_rb_offset[reg_idx] = offset;
+ u->region_rb[reg_idx] = mr->ram_block;
+ }
+ msg->hdr.request = VHOST_USER_ADD_MEM_REG;
+ vhost_user_fill_msg_region(&region_buffer, reg, offset);
+ msg->payload.mem_reg.region = region_buffer;
+
+ ret = vhost_user_write(dev, msg, &fd, 1);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (track_ramblocks) {
+ uint64_t reply_gpa;
+
+ ret = vhost_user_read(dev, &msg_reply);
+ if (ret < 0) {
+ return ret;
+ }
+
+ reply_gpa = msg_reply.payload.mem_reg.region.guest_phys_addr;
+
+ if (msg_reply.hdr.request != VHOST_USER_ADD_MEM_REG) {
+ error_report("%s: Received unexpected msg type."
+ "Expected %d received %d", __func__,
+ VHOST_USER_ADD_MEM_REG,
+ msg_reply.hdr.request);
+ return -EPROTO;
+ }
+
+ /*
+ * We're using the same structure, just reusing one of the
+ * fields, so it should be the same size.
+ */
+ if (msg_reply.hdr.size != msg->hdr.size) {
+ error_report("%s: Unexpected size for postcopy reply "
+ "%d vs %d", __func__, msg_reply.hdr.size,
+ msg->hdr.size);
+ return -EPROTO;
+ }
+
+ /* Get the postcopy client base from the backend's reply. */
+ if (reply_gpa == dev->mem->regions[reg_idx].guest_phys_addr) {
+ shadow_pcb[reg_idx] =
+ msg_reply.payload.mem_reg.region.userspace_addr;
+ trace_vhost_user_set_mem_table_postcopy(
+ msg_reply.payload.mem_reg.region.userspace_addr,
+ msg->payload.mem_reg.region.userspace_addr,
+ reg_fd_idx, reg_idx);
+ } else {
+ error_report("%s: invalid postcopy reply for region. "
+ "Got guest physical address %" PRIX64 ", expected "
+ "%" PRIX64, __func__, reply_gpa,
+ dev->mem->regions[reg_idx].guest_phys_addr);
+ return -EPROTO;
+ }
+ } else if (reply_supported) {
+ ret = process_message_reply(dev, msg);
+ if (ret) {
+ return ret;
+ }
+ }
+ } else if (track_ramblocks) {
+ u->region_rb_offset[reg_idx] = 0;
+ u->region_rb[reg_idx] = NULL;
+ }
+
+ /*
+ * At this point, we know the backend has mapped in the new
+ * region, if the region has a valid file descriptor.
+ *
+ * The region should now be added to the shadow table.
+ */
+ u->shadow_regions[u->num_shadow_regions].guest_phys_addr =
+ reg->guest_phys_addr;
+ u->shadow_regions[u->num_shadow_regions].userspace_addr =
+ reg->userspace_addr;
+ u->shadow_regions[u->num_shadow_regions].memory_size =
+ reg->memory_size;
+ u->num_shadow_regions++;
+ }
+
+ return 0;
+}
+
+static int vhost_user_add_remove_regions(struct vhost_dev *dev,
+ VhostUserMsg *msg,
+ bool reply_supported,
+ bool track_ramblocks)
+{
+ struct vhost_user *u = dev->opaque;
+ struct scrub_regions add_reg[VHOST_USER_MAX_RAM_SLOTS];
+ struct scrub_regions rem_reg[VHOST_USER_MAX_RAM_SLOTS];
+ uint64_t shadow_pcb[VHOST_USER_MAX_RAM_SLOTS] = {};
+ int nr_add_reg, nr_rem_reg;
+ int ret;
+
+ msg->hdr.size = sizeof(msg->payload.mem_reg);
+
+ /* Find the regions which need to be removed or added. */
+ scrub_shadow_regions(dev, add_reg, &nr_add_reg, rem_reg, &nr_rem_reg,
+ shadow_pcb, track_ramblocks);
+
+ if (nr_rem_reg) {
+ ret = send_remove_regions(dev, rem_reg, nr_rem_reg, msg,
+ reply_supported);
+ if (ret < 0) {
+ goto err;
+ }
+ }
+
+ if (nr_add_reg) {
+ ret = send_add_regions(dev, add_reg, nr_add_reg, msg, shadow_pcb,
+ reply_supported, track_ramblocks);
+ if (ret < 0) {
+ goto err;
+ }
+ }
+
+ if (track_ramblocks) {
+ memcpy(u->postcopy_client_bases, shadow_pcb,
+ sizeof(uint64_t) * VHOST_USER_MAX_RAM_SLOTS);
+ /*
+ * Now that we've registered this with the postcopy code, we ack to
+ * the client, because we're now in a position to deal with any
+ * faults it generates.
+ */
+ /* TODO: Use this for failure cases as well with a bad value. */
+ msg->hdr.size = sizeof(msg->payload.u64);
+ msg->payload.u64 = 0; /* OK */
+
+ ret = vhost_user_write(dev, msg, NULL, 0);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ return 0;
+
+err:
+ if (track_ramblocks) {
+ memcpy(u->postcopy_client_bases, shadow_pcb,
+ sizeof(uint64_t) * VHOST_USER_MAX_RAM_SLOTS);
+ }
+
+ return ret;
+}
+
+static int vhost_user_set_mem_table_postcopy(struct vhost_dev *dev,
+ struct vhost_memory *mem,
+ bool reply_supported,
+ bool config_mem_slots)
+{
+ struct vhost_user *u = dev->opaque;
+ int fds[VHOST_MEMORY_BASELINE_NREGIONS];
+ size_t fd_num = 0;
+ VhostUserMsg msg_reply;
+ int region_i, msg_i;
+ int ret;
+
+ VhostUserMsg msg = {
+ .hdr.flags = VHOST_USER_VERSION,
+ };
+
+ if (u->region_rb_len < dev->mem->nregions) {
+ u->region_rb = g_renew(RAMBlock*, u->region_rb, dev->mem->nregions);
+ u->region_rb_offset = g_renew(ram_addr_t, u->region_rb_offset,
+ dev->mem->nregions);
+ memset(&(u->region_rb[u->region_rb_len]), '\0',
+ sizeof(RAMBlock *) * (dev->mem->nregions - u->region_rb_len));
+ memset(&(u->region_rb_offset[u->region_rb_len]), '\0',
+ sizeof(ram_addr_t) * (dev->mem->nregions - u->region_rb_len));
+ u->region_rb_len = dev->mem->nregions;
+ }
+
+ if (config_mem_slots) {
+ ret = vhost_user_add_remove_regions(dev, &msg, reply_supported, true);
+ if (ret < 0) {
+ return ret;
+ }
+ } else {
+ ret = vhost_user_fill_set_mem_table_msg(u, dev, &msg, fds, &fd_num,
+ true);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = vhost_user_write(dev, &msg, fds, fd_num);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = vhost_user_read(dev, &msg_reply);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (msg_reply.hdr.request != VHOST_USER_SET_MEM_TABLE) {
+ error_report("%s: Received unexpected msg type."
+ "Expected %d received %d", __func__,
+ VHOST_USER_SET_MEM_TABLE, msg_reply.hdr.request);
+ return -EPROTO;
+ }
+
+ /*
+ * We're using the same structure, just reusing one of the
+ * fields, so it should be the same size.
+ */
+ if (msg_reply.hdr.size != msg.hdr.size) {
+ error_report("%s: Unexpected size for postcopy reply "
+ "%d vs %d", __func__, msg_reply.hdr.size,
+ msg.hdr.size);
+ return -EPROTO;
+ }
+
+ memset(u->postcopy_client_bases, 0,
+ sizeof(uint64_t) * VHOST_USER_MAX_RAM_SLOTS);
+
+ /*
+ * The replies are in the same order as the regions that were
+ * sent, but regions without fds were skipped (above).
+ */
+ for (msg_i = 0, region_i = 0;
+ region_i < dev->mem->nregions;
+ region_i++) {
+ if (msg_i < fd_num &&
+ msg_reply.payload.memory.regions[msg_i].guest_phys_addr ==
+ dev->mem->regions[region_i].guest_phys_addr) {
+ u->postcopy_client_bases[region_i] =
+ msg_reply.payload.memory.regions[msg_i].userspace_addr;
+ trace_vhost_user_set_mem_table_postcopy(
+ msg_reply.payload.memory.regions[msg_i].userspace_addr,
+ msg.payload.memory.regions[msg_i].userspace_addr,
+ msg_i, region_i);
+ msg_i++;
+ }
+ }
+ if (msg_i != fd_num) {
+ error_report("%s: postcopy reply not fully consumed "
+ "%d vs %zd",
+ __func__, msg_i, fd_num);
+ return -EIO;
+ }
+
+ /*
+ * Now that we've registered this with the postcopy code, we ack
+ * to the client, because we're now in a position to deal with
+ * any faults it generates.
+ */
+ /* TODO: Use this for failure cases as well with a bad value. */
+ msg.hdr.size = sizeof(msg.payload.u64);
+ msg.payload.u64 = 0; /* OK */
+ ret = vhost_user_write(dev, &msg, NULL, 0);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static int vhost_user_set_mem_table(struct vhost_dev *dev,
+ struct vhost_memory *mem)
+{
+ struct vhost_user *u = dev->opaque;
+ int fds[VHOST_MEMORY_BASELINE_NREGIONS];
+ size_t fd_num = 0;
+ bool do_postcopy = u->postcopy_listen && u->postcopy_fd.handler;
+ bool reply_supported = virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_REPLY_ACK);
+ bool config_mem_slots =
+ virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS);
+ int ret;
+
+ if (do_postcopy) {
+ /*
+ * Postcopy has enough differences that it's best done in its own
+ * version.
+ */
+ return vhost_user_set_mem_table_postcopy(dev, mem, reply_supported,
+ config_mem_slots);
+ }
+
+ VhostUserMsg msg = {
+ .hdr.flags = VHOST_USER_VERSION,
+ };
+
+ if (reply_supported) {
+ msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
+ }
+
+ if (config_mem_slots) {
+ ret = vhost_user_add_remove_regions(dev, &msg, reply_supported, false);
+ if (ret < 0) {
+ return ret;
+ }
+ } else {
+ ret = vhost_user_fill_set_mem_table_msg(u, dev, &msg, fds, &fd_num,
+ false);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = vhost_user_write(dev, &msg, fds, fd_num);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (reply_supported) {
+ return process_message_reply(dev, &msg);
+ }
+ }
+
+ return 0;
+}
+
+static int vhost_user_set_vring_endian(struct vhost_dev *dev,
+ struct vhost_vring_state *ring)
+{
+ bool cross_endian = virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_CROSS_ENDIAN);
+ VhostUserMsg msg = {
+ .hdr.request = VHOST_USER_SET_VRING_ENDIAN,
+ .hdr.flags = VHOST_USER_VERSION,
+ .payload.state = *ring,
+ .hdr.size = sizeof(msg.payload.state),
+ };
+
+ if (!cross_endian) {
+ error_report("vhost-user trying to send unhandled ioctl");
+ return -ENOTSUP;
+ }
+
+ return vhost_user_write(dev, &msg, NULL, 0);
+}
+
+static int vhost_set_vring(struct vhost_dev *dev,
+ unsigned long int request,
+ struct vhost_vring_state *ring)
+{
+ VhostUserMsg msg = {
+ .hdr.request = request,
+ .hdr.flags = VHOST_USER_VERSION,
+ .payload.state = *ring,
+ .hdr.size = sizeof(msg.payload.state),
+ };
+
+ return vhost_user_write(dev, &msg, NULL, 0);
+}
+
+static int vhost_user_set_vring_num(struct vhost_dev *dev,
+ struct vhost_vring_state *ring)
+{
+ return vhost_set_vring(dev, VHOST_USER_SET_VRING_NUM, ring);
+}
+
+static void vhost_user_host_notifier_free(VhostUserHostNotifier *n)
+{
+ assert(n && n->unmap_addr);
+ munmap(n->unmap_addr, qemu_real_host_page_size());
+ n->unmap_addr = NULL;
+}
+
+/*
+ * Clean-up function for the notifier; the structure is finally freed
+ * under RCU.
+ */
+static void vhost_user_host_notifier_remove(VhostUserHostNotifier *n,
+ VirtIODevice *vdev)
+{
+ if (n->addr) {
+ if (vdev) {
+ virtio_queue_set_host_notifier_mr(vdev, n->idx, &n->mr, false);
+ }
+ assert(!n->unmap_addr);
+ n->unmap_addr = n->addr;
+ n->addr = NULL;
+ call_rcu(n, vhost_user_host_notifier_free, rcu);
+ }
+}
+
+static int vhost_user_set_vring_base(struct vhost_dev *dev,
+ struct vhost_vring_state *ring)
+{
+ return vhost_set_vring(dev, VHOST_USER_SET_VRING_BASE, ring);
+}
+
+static int vhost_user_set_vring_enable(struct vhost_dev *dev, int enable)
+{
+ int i;
+
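+ /*
+ * VHOST_USER_SET_VRING_ENABLE is only valid once
+ * VHOST_USER_F_PROTOCOL_FEATURES has been negotiated.
+ */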
+ if (!virtio_has_feature(dev->features, VHOST_USER_F_PROTOCOL_FEATURES)) {
+ return -EINVAL;
+ }
+
+ for (i = 0; i < dev->nvqs; ++i) {
+ int ret;
+ struct vhost_vring_state state = {
+ .index = dev->vq_index + i,
+ .num = enable,
+ };
+
+ ret = vhost_set_vring(dev, VHOST_USER_SET_VRING_ENABLE, &state);
+ if (ret < 0) {
+ /*
+ * Restoring the previous state is likely infeasible, as is
+ * proceeding regardless of the error, so just bail out and hope
+ * for device-level recovery.
+ */
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static VhostUserHostNotifier *fetch_notifier(VhostUserState *u,
+ int idx)
+{
+ if (idx >= u->notifiers->len) {
+ return NULL;
+ }
+ return g_ptr_array_index(u->notifiers, idx);
+}
+
+static int vhost_user_get_vring_base(struct vhost_dev *dev,
+ struct vhost_vring_state *ring)
+{
+ int ret;
+ VhostUserMsg msg = {
+ .hdr.request = VHOST_USER_GET_VRING_BASE,
+ .hdr.flags = VHOST_USER_VERSION,
+ .payload.state = *ring,
+ .hdr.size = sizeof(msg.payload.state),
+ };
+ struct vhost_user *u = dev->opaque;
+
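+ /*
+ * The backend stops processing the ring on GET_VRING_BASE, so any
+ * host notifier mapping for this queue can be dropped.
+ */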
+ VhostUserHostNotifier *n = fetch_notifier(u->user, ring->index);
+ if (n) {
+ vhost_user_host_notifier_remove(n, dev->vdev);
+ }
+
+ ret = vhost_user_write(dev, &msg, NULL, 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = vhost_user_read(dev, &msg);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (msg.hdr.request != VHOST_USER_GET_VRING_BASE) {
+ error_report("Received unexpected msg type. Expected %d received %d",
+ VHOST_USER_GET_VRING_BASE, msg.hdr.request);
+ return -EPROTO;
+ }
+
+ if (msg.hdr.size != sizeof(msg.payload.state)) {
+ error_report("Received bad msg size.");
+ return -EPROTO;
+ }
+
+ *ring = msg.payload.state;
+
+ return 0;
+}
+
+static int vhost_set_vring_file(struct vhost_dev *dev,
+ VhostUserRequest request,
+ struct vhost_vring_file *file)
+{
+ int fds[VHOST_USER_MAX_RAM_SLOTS];
+ size_t fd_num = 0;
+ VhostUserMsg msg = {
+ .hdr.request = request,
+ .hdr.flags = VHOST_USER_VERSION,
+ .payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK,
+ .hdr.size = sizeof(msg.payload.u64),
+ };
+
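+ /* Pass the fd if we can; otherwise tell the backend to poll. */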
+ if (ioeventfd_enabled() && file->fd > 0) {
+ fds[fd_num++] = file->fd;
+ } else {
+ msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK;
+ }
+
+ return vhost_user_write(dev, &msg, fds, fd_num);
+}
+
+static int vhost_user_set_vring_kick(struct vhost_dev *dev,
+ struct vhost_vring_file *file)
+{
+ return vhost_set_vring_file(dev, VHOST_USER_SET_VRING_KICK, file);
+}
+
+static int vhost_user_set_vring_call(struct vhost_dev *dev,
+ struct vhost_vring_file *file)
+{
+ return vhost_set_vring_file(dev, VHOST_USER_SET_VRING_CALL, file);
+}
+
+static int vhost_user_set_vring_err(struct vhost_dev *dev,
+ struct vhost_vring_file *file)
+{
+ return vhost_set_vring_file(dev, VHOST_USER_SET_VRING_ERR, file);
+}
+
+static int vhost_user_get_u64(struct vhost_dev *dev, int request, uint64_t *u64)
+{
+ int ret;
+ VhostUserMsg msg = {
+ .hdr.request = request,
+ .hdr.flags = VHOST_USER_VERSION,
+ };
+
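+ /* One-time requests are only sent from the vhost_dev with vq_index 0. */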
+ if (vhost_user_one_time_request(request) && dev->vq_index != 0) {
+ return 0;
+ }
+
+ ret = vhost_user_write(dev, &msg, NULL, 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = vhost_user_read(dev, &msg);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (msg.hdr.request != request) {
+ error_report("Received unexpected msg type. Expected %d received %d",
+ request, msg.hdr.request);
+ return -EPROTO;
+ }
+
+ if (msg.hdr.size != sizeof(msg.payload.u64)) {
+ error_report("Received bad msg size.");
+ return -EPROTO;
+ }
+
+ *u64 = msg.payload.u64;
+
+ return 0;
+}
+
+static int vhost_user_get_features(struct vhost_dev *dev, uint64_t *features)
+{
+ if (vhost_user_get_u64(dev, VHOST_USER_GET_FEATURES, features) < 0) {
+ return -EPROTO;
+ }
+
+ return 0;
+}
+
+static int enforce_reply(struct vhost_dev *dev,
+ const VhostUserMsg *msg)
+{
+ uint64_t dummy;
+
+ if (msg->hdr.flags & VHOST_USER_NEED_REPLY_MASK) {
+ return process_message_reply(dev, msg);
+ }
+
+ /*
+ * We need to wait for a reply but the backend does not
+ * support replies for the command we just sent.
+ * Send VHOST_USER_GET_FEATURES which makes all backends
+ * send a reply.
+ */
+ return vhost_user_get_features(dev, &dummy);
+}
+
+static int vhost_user_set_vring_addr(struct vhost_dev *dev,
+ struct vhost_vring_addr *addr)
+{
+ int ret;
+ VhostUserMsg msg = {
+ .hdr.request = VHOST_USER_SET_VRING_ADDR,
+ .hdr.flags = VHOST_USER_VERSION,
+ .payload.addr = *addr,
+ .hdr.size = sizeof(msg.payload.addr),
+ };
+
+ bool reply_supported = virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_REPLY_ACK);
+
+ /*
+ * Wait for a reply if logging is enabled, to make sure the backend
+ * is actually logging changes.
+ */
+ bool wait_for_reply = addr->flags & (1 << VHOST_VRING_F_LOG);
+
+ if (reply_supported && wait_for_reply) {
+ msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
+ }
+
+ ret = vhost_user_write(dev, &msg, NULL, 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (wait_for_reply) {
+ return enforce_reply(dev, &msg);
+ }
+
+ return 0;
+}
+
+static int vhost_user_set_u64(struct vhost_dev *dev, int request, uint64_t u64,
+ bool wait_for_reply)
+{
+ VhostUserMsg msg = {
+ .hdr.request = request,
+ .hdr.flags = VHOST_USER_VERSION,
+ .payload.u64 = u64,
+ .hdr.size = sizeof(msg.payload.u64),
+ };
+ int ret;
+
+ if (wait_for_reply) {
+ bool reply_supported = virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_REPLY_ACK);
+ if (reply_supported) {
+ msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
+ }
+ }
+
+ ret = vhost_user_write(dev, &msg, NULL, 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (wait_for_reply) {
+ return enforce_reply(dev, &msg);
+ }
+
+ return 0;
+}
+
+static int vhost_user_set_status(struct vhost_dev *dev, uint8_t status)
+{
+ return vhost_user_set_u64(dev, VHOST_USER_SET_STATUS, status, false);
+}
+
+static int vhost_user_get_status(struct vhost_dev *dev, uint8_t *status)
+{
+ uint64_t value;
+ int ret;
+
+ ret = vhost_user_get_u64(dev, VHOST_USER_GET_STATUS, &value);
+ if (ret < 0) {
+ return ret;
+ }
+ *status = value;
+
+ return 0;
+}
+
+static int vhost_user_add_status(struct vhost_dev *dev, uint8_t status)
+{
+ uint8_t s;
+ int ret;
+
+ ret = vhost_user_get_status(dev, &s);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if ((s & status) == status) {
+ return 0;
+ }
+ s |= status;
+
+ return vhost_user_set_status(dev, s);
+}
+
+static int vhost_user_set_features(struct vhost_dev *dev,
+ uint64_t features)
+{
+ /*
+ * Wait for a reply if logging is enabled, to make sure the backend
+ * is actually logging changes.
+ */
+ bool log_enabled = features & (0x1ULL << VHOST_F_LOG_ALL);
+ int ret;
+
+ /*
+ * We need to include any extra backend only feature bits that
+ * might be needed by our device. Currently this includes the
+ * VHOST_USER_F_PROTOCOL_FEATURES bit for enabling protocol
+ * features.
+ */
+ ret = vhost_user_set_u64(dev, VHOST_USER_SET_FEATURES,
+ features | dev->backend_features,
+ log_enabled);
+
+ if (virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_STATUS)) {
+ if (!ret) {
+ return vhost_user_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
+ }
+ }
+
+ return ret;
+}
+
+static int vhost_user_set_protocol_features(struct vhost_dev *dev,
+ uint64_t features)
+{
+ return vhost_user_set_u64(dev, VHOST_USER_SET_PROTOCOL_FEATURES, features,
+ false);
+}
+
+static int vhost_user_set_owner(struct vhost_dev *dev)
+{
+ VhostUserMsg msg = {
+ .hdr.request = VHOST_USER_SET_OWNER,
+ .hdr.flags = VHOST_USER_VERSION,
+ };
+
+ return vhost_user_write(dev, &msg, NULL, 0);
+}
+
+static int vhost_user_get_max_memslots(struct vhost_dev *dev,
+ uint64_t *max_memslots)
+{
+ uint64_t backend_max_memslots;
+ int err;
+
+ err = vhost_user_get_u64(dev, VHOST_USER_GET_MAX_MEM_SLOTS,
+ &backend_max_memslots);
+ if (err < 0) {
+ return err;
+ }
+
+ *max_memslots = backend_max_memslots;
+
+ return 0;
+}
+
+static int vhost_user_reset_device(struct vhost_dev *dev)
+{
+ VhostUserMsg msg = {
+ .hdr.flags = VHOST_USER_VERSION,
+ };
+
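+ /*
+ * Fall back to the legacy VHOST_USER_RESET_OWNER message if the
+ * backend does not support VHOST_USER_RESET_DEVICE.
+ */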
+ msg.hdr.request = virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_RESET_DEVICE)
+ ? VHOST_USER_RESET_DEVICE
+ : VHOST_USER_RESET_OWNER;
+
+ return vhost_user_write(dev, &msg, NULL, 0);
+}
+
+static int vhost_user_slave_handle_config_change(struct vhost_dev *dev)
+{
+ if (!dev->config_ops || !dev->config_ops->vhost_dev_config_notifier) {
+ return -ENOSYS;
+ }
+
+ return dev->config_ops->vhost_dev_config_notifier(dev);
+}
+
+/*
+ * Fetch or create the notifier for a given idx. Newly created
+ * notifiers are added to the pointer array that tracks them.
+ */
+static VhostUserHostNotifier *fetch_or_create_notifier(VhostUserState *u,
+ int idx)
+{
+ VhostUserHostNotifier *n = NULL;
+ if (idx >= u->notifiers->len) {
+ g_ptr_array_set_size(u->notifiers, idx + 1);
+ }
+
+ n = g_ptr_array_index(u->notifiers, idx);
+ if (!n) {
+ /*
+ * In case notifications arrive out of order,
+ * make room for the current index.
+ */
+ g_ptr_array_remove_index(u->notifiers, idx);
+ n = g_new0(VhostUserHostNotifier, 1);
+ n->idx = idx;
+ g_ptr_array_insert(u->notifiers, idx, n);
+ trace_vhost_user_create_notifier(idx, n);
+ }
+
+ return n;
+}
+
+static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev,
+ VhostUserVringArea *area,
+ int fd)
+{
+ int queue_idx = area->u64 & VHOST_USER_VRING_IDX_MASK;
+ size_t page_size = qemu_real_host_page_size();
+ struct vhost_user *u = dev->opaque;
+ VhostUserState *user = u->user;
+ VirtIODevice *vdev = dev->vdev;
+ VhostUserHostNotifier *n;
+ void *addr;
+ char *name;
+
+ if (!virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) ||
+ vdev == NULL || queue_idx >= virtio_get_num_queues(vdev)) {
+ return -EINVAL;
+ }
+
+ /*
+ * Fetch the notifier and invalidate any old data before setting up
+ * the new mapped address.
+ */
+ n = fetch_or_create_notifier(user, queue_idx);
+ vhost_user_host_notifier_remove(n, vdev);
+
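+ /* If the backend passed no fd, it only wanted the old mapping removed. */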
+ if (area->u64 & VHOST_USER_VRING_NOFD_MASK) {
+ return 0;
+ }
+
+ /* Sanity check. */
+ if (area->size != page_size) {
+ return -EINVAL;
+ }
+
+ addr = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd, area->offset);
+ if (addr == MAP_FAILED) {
+ return -EFAULT;
+ }
+
+ name = g_strdup_printf("vhost-user/host-notifier@%p mmaps[%d]",
+ user, queue_idx);
+ if (!n->mr.ram) { /* Don't init again after suspend. */
+ memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
+ page_size, addr);
+ } else {
+ n->mr.ram_block->host = addr;
+ }
+ g_free(name);
+
+ if (virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, true)) {
+ object_unparent(OBJECT(&n->mr));
+ munmap(addr, page_size);
+ return -ENXIO;
+ }
+
+ n->addr = addr;
+
+ return 0;
+}
+
+static void close_slave_channel(struct vhost_user *u)
+{
+ g_source_destroy(u->slave_src);
+ g_source_unref(u->slave_src);
+ u->slave_src = NULL;
+ object_unref(OBJECT(u->slave_ioc));
+ u->slave_ioc = NULL;
+}
+
+static gboolean slave_read(QIOChannel *ioc, GIOCondition condition,
+ gpointer opaque)
+{
+ struct vhost_dev *dev = opaque;
+ struct vhost_user *u = dev->opaque;
+ VhostUserHeader hdr = { 0, };
+ VhostUserPayload payload = { 0, };
+ Error *local_err = NULL;
+ gboolean rc = G_SOURCE_CONTINUE;
+ int ret = 0;
+ struct iovec iov;
+ g_autofree int *fd = NULL;
+ size_t fdsize = 0;
+ int i;
+
+ /* Read header */
+ iov.iov_base = &hdr;
+ iov.iov_len = VHOST_USER_HDR_SIZE;
+
+ if (qio_channel_readv_full_all(ioc, &iov, 1, &fd, &fdsize, &local_err)) {
+ error_report_err(local_err);
+ goto err;
+ }
+
+ if (hdr.size > VHOST_USER_PAYLOAD_SIZE) {
+ error_report("Failed to read msg header."
+ " Size %d exceeds the maximum %zu.", hdr.size,
+ VHOST_USER_PAYLOAD_SIZE);
+ goto err;
+ }
+
+ /* Read payload */
+ if (qio_channel_read_all(ioc, (char *) &payload, hdr.size, &local_err)) {
+ error_report_err(local_err);
+ goto err;
+ }
+
+ switch (hdr.request) {
+ case VHOST_USER_SLAVE_IOTLB_MSG:
+ ret = vhost_backend_handle_iotlb_msg(dev, &payload.iotlb);
+ break;
+ case VHOST_USER_SLAVE_CONFIG_CHANGE_MSG:
+ ret = vhost_user_slave_handle_config_change(dev);
+ break;
+ case VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG:
+ ret = vhost_user_slave_handle_vring_host_notifier(dev, &payload.area,
+ fd ? fd[0] : -1);
+ break;
+ default:
+ error_report("Received unexpected msg type: %d.", hdr.request);
+ ret = -EINVAL;
+ }
+
+ /*
+ * REPLY_ACK feature handling. Other reply types have to be managed
+ * directly in their request handlers.
+ */
+ if (hdr.flags & VHOST_USER_NEED_REPLY_MASK) {
+ struct iovec iovec[2];
+
+ hdr.flags &= ~VHOST_USER_NEED_REPLY_MASK;
+ hdr.flags |= VHOST_USER_REPLY_MASK;
+
+ payload.u64 = !!ret;
+ hdr.size = sizeof(payload.u64);
+
+ iovec[0].iov_base = &hdr;
+ iovec[0].iov_len = VHOST_USER_HDR_SIZE;
+ iovec[1].iov_base = &payload;
+ iovec[1].iov_len = hdr.size;
+
+ if (qio_channel_writev_all(ioc, iovec, ARRAY_SIZE(iovec), &local_err)) {
+ error_report_err(local_err);
+ goto err;
+ }
+ }
+
+ goto fdcleanup;
+
+err:
+ close_slave_channel(u);
+ rc = G_SOURCE_REMOVE;
+
+fdcleanup:
+ if (fd) {
+ for (i = 0; i < fdsize; i++) {
+ close(fd[i]);
+ }
+ }
+ return rc;
+}
+
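+/*
+ * Create the slave channel: a socketpair whose remote end is passed to
+ * the backend with VHOST_USER_SET_SLAVE_REQ_FD so that it can initiate
+ * requests towards QEMU.
+ */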
+static int vhost_setup_slave_channel(struct vhost_dev *dev)
+{
+ VhostUserMsg msg = {
+ .hdr.request = VHOST_USER_SET_SLAVE_REQ_FD,
+ .hdr.flags = VHOST_USER_VERSION,
+ };
+ struct vhost_user *u = dev->opaque;
+ int sv[2], ret = 0;
+ bool reply_supported = virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_REPLY_ACK);
+ Error *local_err = NULL;
+ QIOChannel *ioc;
+
+ if (!virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_SLAVE_REQ)) {
+ return 0;
+ }
+
+ if (qemu_socketpair(PF_UNIX, SOCK_STREAM, 0, sv) == -1) {
+ int saved_errno = errno;
+ error_report("socketpair() failed");
+ return -saved_errno;
+ }
+
+ ioc = QIO_CHANNEL(qio_channel_socket_new_fd(sv[0], &local_err));
+ if (!ioc) {
+ error_report_err(local_err);
+ return -ECONNREFUSED;
+ }
+ u->slave_ioc = ioc;
+ slave_update_read_handler(dev, NULL);
+
+ if (reply_supported) {
+ msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
+ }
+
+ ret = vhost_user_write(dev, &msg, &sv[1], 1);
+ if (ret) {
+ goto out;
+ }
+
+ if (reply_supported) {
+ ret = process_message_reply(dev, &msg);
+ }
+
+out:
+ close(sv[1]);
+ if (ret) {
+ close_slave_channel(u);
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_LINUX
+/*
+ * Called back from the postcopy fault thread when a fault is received on our
+ * ufd.
+ * TODO: This is Linux specific
+ */
+static int vhost_user_postcopy_fault_handler(struct PostCopyFD *pcfd,
+ void *ufd)
+{
+ struct vhost_dev *dev = pcfd->data;
+ struct vhost_user *u = dev->opaque;
+ struct uffd_msg *msg = ufd;
+ uint64_t faultaddr = msg->arg.pagefault.address;
+ RAMBlock *rb = NULL;
+ uint64_t rb_offset;
+ int i;
+
+ trace_vhost_user_postcopy_fault_handler(pcfd->idstr, faultaddr,
+ dev->mem->nregions);
+ for (i = 0; i < MIN(dev->mem->nregions, u->region_rb_len); i++) {
+ trace_vhost_user_postcopy_fault_handler_loop(i,
+ u->postcopy_client_bases[i], dev->mem->regions[i].memory_size);
+ if (faultaddr >= u->postcopy_client_bases[i]) {
+            /* Offset of the fault address in the vhost region */
+ uint64_t region_offset = faultaddr - u->postcopy_client_bases[i];
+ if (region_offset < dev->mem->regions[i].memory_size) {
+ rb_offset = region_offset + u->region_rb_offset[i];
+ trace_vhost_user_postcopy_fault_handler_found(i,
+ region_offset, rb_offset);
+ rb = u->region_rb[i];
+ return postcopy_request_shared_page(pcfd, rb, faultaddr,
+ rb_offset);
+ }
+ }
+ }
+ error_report("%s: Failed to find region for fault %" PRIx64,
+ __func__, faultaddr);
+ return -1;
+}
+
+static int vhost_user_postcopy_waker(struct PostCopyFD *pcfd, RAMBlock *rb,
+ uint64_t offset)
+{
+ struct vhost_dev *dev = pcfd->data;
+ struct vhost_user *u = dev->opaque;
+ int i;
+
+ trace_vhost_user_postcopy_waker(qemu_ram_get_idstr(rb), offset);
+
+ if (!u) {
+ return 0;
+ }
+    /* Translate the offset into an address in the client's address space */
+ for (i = 0; i < MIN(dev->mem->nregions, u->region_rb_len); i++) {
+ if (u->region_rb[i] == rb &&
+ offset >= u->region_rb_offset[i] &&
+ offset < (u->region_rb_offset[i] +
+ dev->mem->regions[i].memory_size)) {
+ uint64_t client_addr = (offset - u->region_rb_offset[i]) +
+ u->postcopy_client_bases[i];
+ trace_vhost_user_postcopy_waker_found(client_addr);
+ return postcopy_wake_shared(pcfd, client_addr, rb);
+ }
+ }
+
+ trace_vhost_user_postcopy_waker_nomatch(qemu_ram_get_idstr(rb), offset);
+ return 0;
+}
+#endif
+
+/*
+ * Called at the start of an inbound postcopy on reception of the
+ * 'advise' command.
+ */
+static int vhost_user_postcopy_advise(struct vhost_dev *dev, Error **errp)
+{
+#ifdef CONFIG_LINUX
+ struct vhost_user *u = dev->opaque;
+ CharBackend *chr = u->user->chr;
+ int ufd;
+ int ret;
+ VhostUserMsg msg = {
+ .hdr.request = VHOST_USER_POSTCOPY_ADVISE,
+ .hdr.flags = VHOST_USER_VERSION,
+ };
+
+ ret = vhost_user_write(dev, &msg, NULL, 0);
+ if (ret < 0) {
+ error_setg(errp, "Failed to send postcopy_advise to vhost");
+ return ret;
+ }
+
+ ret = vhost_user_read(dev, &msg);
+ if (ret < 0) {
+ error_setg(errp, "Failed to get postcopy_advise reply from vhost");
+ return ret;
+ }
+
+ if (msg.hdr.request != VHOST_USER_POSTCOPY_ADVISE) {
+ error_setg(errp, "Unexpected msg type. Expected %d received %d",
+ VHOST_USER_POSTCOPY_ADVISE, msg.hdr.request);
+ return -EPROTO;
+ }
+
+ if (msg.hdr.size) {
+ error_setg(errp, "Received bad msg size.");
+ return -EPROTO;
+ }
+ ufd = qemu_chr_fe_get_msgfd(chr);
+ if (ufd < 0) {
+ error_setg(errp, "%s: Failed to get ufd", __func__);
+ return -EIO;
+ }
+ qemu_socket_set_nonblock(ufd);
+
+    /* Register the ufd with the userfault thread. */
+ u->postcopy_fd.fd = ufd;
+ u->postcopy_fd.data = dev;
+ u->postcopy_fd.handler = vhost_user_postcopy_fault_handler;
+ u->postcopy_fd.waker = vhost_user_postcopy_waker;
+ u->postcopy_fd.idstr = "vhost-user"; /* Need to find unique name */
+ postcopy_register_shared_ufd(&u->postcopy_fd);
+ return 0;
+#else
+ error_setg(errp, "Postcopy not supported on non-Linux systems");
+ return -ENOSYS;
+#endif
+}
+
+/*
+ * Called at the switch to postcopy on reception of the 'listen' command.
+ */
+static int vhost_user_postcopy_listen(struct vhost_dev *dev, Error **errp)
+{
+ struct vhost_user *u = dev->opaque;
+ int ret;
+ VhostUserMsg msg = {
+ .hdr.request = VHOST_USER_POSTCOPY_LISTEN,
+ .hdr.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK,
+ };
+ u->postcopy_listen = true;
+
+ trace_vhost_user_postcopy_listen();
+
+ ret = vhost_user_write(dev, &msg, NULL, 0);
+ if (ret < 0) {
+ error_setg(errp, "Failed to send postcopy_listen to vhost");
+ return ret;
+ }
+
+ ret = process_message_reply(dev, &msg);
+ if (ret) {
+ error_setg(errp, "Failed to receive reply to postcopy_listen");
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * Called at the end of postcopy
+ */
+static int vhost_user_postcopy_end(struct vhost_dev *dev, Error **errp)
+{
+ VhostUserMsg msg = {
+ .hdr.request = VHOST_USER_POSTCOPY_END,
+ .hdr.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK,
+ };
+ int ret;
+ struct vhost_user *u = dev->opaque;
+
+ trace_vhost_user_postcopy_end_entry();
+
+ ret = vhost_user_write(dev, &msg, NULL, 0);
+ if (ret < 0) {
+ error_setg(errp, "Failed to send postcopy_end to vhost");
+ return ret;
+ }
+
+ ret = process_message_reply(dev, &msg);
+ if (ret) {
+ error_setg(errp, "Failed to receive reply to postcopy_end");
+ return ret;
+ }
+ postcopy_unregister_shared_ufd(&u->postcopy_fd);
+ close(u->postcopy_fd.fd);
+ u->postcopy_fd.handler = NULL;
+
+ trace_vhost_user_postcopy_end_exit();
+
+ return 0;
+}
+
+static int vhost_user_postcopy_notifier(NotifierWithReturn *notifier,
+ void *opaque)
+{
+ struct PostcopyNotifyData *pnd = opaque;
+ struct vhost_user *u = container_of(notifier, struct vhost_user,
+ postcopy_notifier);
+ struct vhost_dev *dev = u->dev;
+
+ switch (pnd->reason) {
+ case POSTCOPY_NOTIFY_PROBE:
+ if (!virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_PAGEFAULT)) {
+ /* TODO: Get the device name into this error somehow */
+ error_setg(pnd->errp,
+ "vhost-user backend not capable of postcopy");
+ return -ENOENT;
+ }
+ break;
+
+ case POSTCOPY_NOTIFY_INBOUND_ADVISE:
+ return vhost_user_postcopy_advise(dev, pnd->errp);
+
+ case POSTCOPY_NOTIFY_INBOUND_LISTEN:
+ return vhost_user_postcopy_listen(dev, pnd->errp);
+
+ case POSTCOPY_NOTIFY_INBOUND_END:
+ return vhost_user_postcopy_end(dev, pnd->errp);
+
+ default:
+ /* We ignore notifications we don't know */
+ break;
+ }
+
+ return 0;
+}
+
+static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque,
+ Error **errp)
+{
+ uint64_t features, ram_slots;
+ struct vhost_user *u;
+ VhostUserState *vus = (VhostUserState *) opaque;
+ int err;
+
+ assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);
+
+ u = g_new0(struct vhost_user, 1);
+ u->user = vus;
+ u->dev = dev;
+ dev->opaque = u;
+
+ err = vhost_user_get_features(dev, &features);
+ if (err < 0) {
+ error_setg_errno(errp, -err, "vhost_backend_init failed");
+ return err;
+ }
+
+ if (virtio_has_feature(features, VHOST_USER_F_PROTOCOL_FEATURES)) {
+ bool supports_f_config = vus->supports_config ||
+ (dev->config_ops && dev->config_ops->vhost_dev_config_notifier);
+ uint64_t protocol_features;
+
+ dev->backend_features |= 1ULL << VHOST_USER_F_PROTOCOL_FEATURES;
+
+ err = vhost_user_get_u64(dev, VHOST_USER_GET_PROTOCOL_FEATURES,
+ &protocol_features);
+ if (err < 0) {
+ error_setg_errno(errp, EPROTO, "vhost_backend_init failed");
+ return -EPROTO;
+ }
+
+ /*
+ * We will use all the protocol features we support - although
+         * we suppress F_CONFIG if we know QEMU's internal code cannot
+         * support it.
+ */
+ protocol_features &= VHOST_USER_PROTOCOL_FEATURE_MASK;
+
+ if (supports_f_config) {
+ if (!virtio_has_feature(protocol_features,
+ VHOST_USER_PROTOCOL_F_CONFIG)) {
+ error_setg(errp, "vhost-user device expecting "
+ "VHOST_USER_PROTOCOL_F_CONFIG but the vhost-user backend does "
+ "not support it.");
+ return -EPROTO;
+ }
+ } else {
+ if (virtio_has_feature(protocol_features,
+ VHOST_USER_PROTOCOL_F_CONFIG)) {
+ warn_reportf_err(*errp, "vhost-user backend supports "
+ "VHOST_USER_PROTOCOL_F_CONFIG but QEMU does not.");
+ protocol_features &= ~(1ULL << VHOST_USER_PROTOCOL_F_CONFIG);
+ }
+ }
+
+ /* final set of protocol features */
+ dev->protocol_features = protocol_features;
+ err = vhost_user_set_protocol_features(dev, dev->protocol_features);
+ if (err < 0) {
+ error_setg_errno(errp, EPROTO, "vhost_backend_init failed");
+ return -EPROTO;
+ }
+
+ /* query the max queues we support if backend supports Multiple Queue */
+ if (dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_MQ)) {
+ err = vhost_user_get_u64(dev, VHOST_USER_GET_QUEUE_NUM,
+ &dev->max_queues);
+ if (err < 0) {
+ error_setg_errno(errp, EPROTO, "vhost_backend_init failed");
+ return -EPROTO;
+ }
+ } else {
+ dev->max_queues = 1;
+ }
+
+ if (dev->num_queues && dev->max_queues < dev->num_queues) {
+ error_setg(errp, "The maximum number of queues supported by the "
+ "backend is %" PRIu64, dev->max_queues);
+ return -EINVAL;
+ }
+
+ if (virtio_has_feature(features, VIRTIO_F_IOMMU_PLATFORM) &&
+ !(virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_SLAVE_REQ) &&
+ virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_REPLY_ACK))) {
+ error_setg(errp, "IOMMU support requires reply-ack and "
+ "slave-req protocol features.");
+ return -EINVAL;
+ }
+
+ /* get max memory regions if backend supports configurable RAM slots */
+ if (!virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS)) {
+ u->user->memory_slots = VHOST_MEMORY_BASELINE_NREGIONS;
+ } else {
+ err = vhost_user_get_max_memslots(dev, &ram_slots);
+ if (err < 0) {
+ error_setg_errno(errp, EPROTO, "vhost_backend_init failed");
+ return -EPROTO;
+ }
+
+ if (ram_slots < u->user->memory_slots) {
+ error_setg(errp, "The backend specified a max ram slots limit "
+ "of %" PRIu64", when the prior validated limit was "
+ "%d. This limit should never decrease.", ram_slots,
+ u->user->memory_slots);
+ return -EINVAL;
+ }
+
+ u->user->memory_slots = MIN(ram_slots, VHOST_USER_MAX_RAM_SLOTS);
+ }
+ }
+
+ if (dev->migration_blocker == NULL &&
+ !virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_LOG_SHMFD)) {
+ error_setg(&dev->migration_blocker,
+ "Migration disabled: vhost-user backend lacks "
+ "VHOST_USER_PROTOCOL_F_LOG_SHMFD feature.");
+ }
+
+ if (dev->vq_index == 0) {
+ err = vhost_setup_slave_channel(dev);
+ if (err < 0) {
+ error_setg_errno(errp, EPROTO, "vhost_backend_init failed");
+ return -EPROTO;
+ }
+ }
+
+ u->postcopy_notifier.notify = vhost_user_postcopy_notifier;
+ postcopy_add_notifier(&u->postcopy_notifier);
+
+ return 0;
+}
+
+static int vhost_user_backend_cleanup(struct vhost_dev *dev)
+{
+ struct vhost_user *u;
+
+ assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);
+
+ u = dev->opaque;
+ if (u->postcopy_notifier.notify) {
+ postcopy_remove_notifier(&u->postcopy_notifier);
+ u->postcopy_notifier.notify = NULL;
+ }
+ u->postcopy_listen = false;
+ if (u->postcopy_fd.handler) {
+ postcopy_unregister_shared_ufd(&u->postcopy_fd);
+ close(u->postcopy_fd.fd);
+ u->postcopy_fd.handler = NULL;
+ }
+ if (u->slave_ioc) {
+ close_slave_channel(u);
+ }
+ g_free(u->region_rb);
+ u->region_rb = NULL;
+ g_free(u->region_rb_offset);
+ u->region_rb_offset = NULL;
+ u->region_rb_len = 0;
+ g_free(u);
+    dev->opaque = NULL;
+
+ return 0;
+}
+
+static int vhost_user_get_vq_index(struct vhost_dev *dev, int idx)
+{
+ assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
+
+ return idx;
+}
+
+static int vhost_user_memslots_limit(struct vhost_dev *dev)
+{
+ struct vhost_user *u = dev->opaque;
+
+ return u->user->memory_slots;
+}
+
+static bool vhost_user_requires_shm_log(struct vhost_dev *dev)
+{
+ assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);
+
+ return virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_LOG_SHMFD);
+}
+
+static int vhost_user_migration_done(struct vhost_dev *dev, char* mac_addr)
+{
+ VhostUserMsg msg = { };
+
+ assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);
+
+    /* If the guest supports GUEST_ANNOUNCE, do nothing. */
+ if (virtio_has_feature(dev->acked_features, VIRTIO_NET_F_GUEST_ANNOUNCE)) {
+ return 0;
+ }
+
+    /*
+     * If the backend supports VHOST_USER_PROTOCOL_F_RARP,
+     * ask it to send the RARP.
+     */
+ if (virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_RARP)) {
+ msg.hdr.request = VHOST_USER_SEND_RARP;
+ msg.hdr.flags = VHOST_USER_VERSION;
+ memcpy((char *)&msg.payload.u64, mac_addr, 6);
+ msg.hdr.size = sizeof(msg.payload.u64);
+
+ return vhost_user_write(dev, &msg, NULL, 0);
+ }
+ return -ENOTSUP;
+}
+
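+/*
+ * Two adjacent memory regions can only be merged into one vhost-user
+ * region if they are backed by the same file descriptor.
+ */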
+static bool vhost_user_can_merge(struct vhost_dev *dev,
+ uint64_t start1, uint64_t size1,
+ uint64_t start2, uint64_t size2)
+{
+ ram_addr_t offset;
+ int mfd, rfd;
+
+ (void)vhost_user_get_mr_data(start1, &offset, &mfd);
+ (void)vhost_user_get_mr_data(start2, &offset, &rfd);
+
+ return mfd == rfd;
+}
+
+static int vhost_user_net_set_mtu(struct vhost_dev *dev, uint16_t mtu)
+{
+ VhostUserMsg msg;
+ bool reply_supported = virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_REPLY_ACK);
+ int ret;
+
+ if (!(dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU))) {
+ return 0;
+ }
+
+ msg.hdr.request = VHOST_USER_NET_SET_MTU;
+ msg.payload.u64 = mtu;
+ msg.hdr.size = sizeof(msg.payload.u64);
+ msg.hdr.flags = VHOST_USER_VERSION;
+ if (reply_supported) {
+ msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
+ }
+
+ ret = vhost_user_write(dev, &msg, NULL, 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+    /*
+     * If reply_ack is supported, the slave has to ack that the specified
+     * MTU is valid.
+     */
+ if (reply_supported) {
+ return process_message_reply(dev, &msg);
+ }
+
+ return 0;
+}
+
+static int vhost_user_send_device_iotlb_msg(struct vhost_dev *dev,
+ struct vhost_iotlb_msg *imsg)
+{
+ int ret;
+ VhostUserMsg msg = {
+ .hdr.request = VHOST_USER_IOTLB_MSG,
+ .hdr.size = sizeof(msg.payload.iotlb),
+ .hdr.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK,
+ .payload.iotlb = *imsg,
+ };
+
+ ret = vhost_user_write(dev, &msg, NULL, 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return process_message_reply(dev, &msg);
+}
+
+static void vhost_user_set_iotlb_callback(struct vhost_dev *dev, int enabled)
+{
+ /* No-op as the receive channel is not dedicated to IOTLB messages. */
+}
+
+static int vhost_user_get_config(struct vhost_dev *dev, uint8_t *config,
+ uint32_t config_len, Error **errp)
+{
+ int ret;
+ VhostUserMsg msg = {
+ .hdr.request = VHOST_USER_GET_CONFIG,
+ .hdr.flags = VHOST_USER_VERSION,
+ .hdr.size = VHOST_USER_CONFIG_HDR_SIZE + config_len,
+ };
+
+ if (!virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_CONFIG)) {
+ error_setg(errp, "VHOST_USER_PROTOCOL_F_CONFIG not supported");
+ return -EINVAL;
+ }
+
+ assert(config_len <= VHOST_USER_MAX_CONFIG_SIZE);
+
+ msg.payload.config.offset = 0;
+ msg.payload.config.size = config_len;
+ ret = vhost_user_write(dev, &msg, NULL, 0);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "vhost_get_config failed");
+ return ret;
+ }
+
+ ret = vhost_user_read(dev, &msg);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "vhost_get_config failed");
+ return ret;
+ }
+
+ if (msg.hdr.request != VHOST_USER_GET_CONFIG) {
+ error_setg(errp,
+ "Received unexpected msg type. Expected %d received %d",
+ VHOST_USER_GET_CONFIG, msg.hdr.request);
+ return -EPROTO;
+ }
+
+ if (msg.hdr.size != VHOST_USER_CONFIG_HDR_SIZE + config_len) {
+ error_setg(errp, "Received bad msg size.");
+ return -EPROTO;
+ }
+
+ memcpy(config, msg.payload.config.region, config_len);
+
+ return 0;
+}
+
+static int vhost_user_set_config(struct vhost_dev *dev, const uint8_t *data,
+ uint32_t offset, uint32_t size, uint32_t flags)
+{
+ int ret;
+ uint8_t *p;
+ bool reply_supported = virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_REPLY_ACK);
+
+ VhostUserMsg msg = {
+ .hdr.request = VHOST_USER_SET_CONFIG,
+ .hdr.flags = VHOST_USER_VERSION,
+ .hdr.size = VHOST_USER_CONFIG_HDR_SIZE + size,
+ };
+
+ if (!virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_CONFIG)) {
+ return -ENOTSUP;
+ }
+
+ if (reply_supported) {
+ msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
+ }
+
+ if (size > VHOST_USER_MAX_CONFIG_SIZE) {
+ return -EINVAL;
+ }
+
+    msg.payload.config.offset = offset;
+    msg.payload.config.size = size;
+    msg.payload.config.flags = flags;
+ p = msg.payload.config.region;
+ memcpy(p, data, size);
+
+ ret = vhost_user_write(dev, &msg, NULL, 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (reply_supported) {
+ return process_message_reply(dev, &msg);
+ }
+
+ return 0;
+}
+
+static int vhost_user_crypto_create_session(struct vhost_dev *dev,
+ void *session_info,
+ uint64_t *session_id)
+{
+ int ret;
+ bool crypto_session = virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_CRYPTO_SESSION);
+ CryptoDevBackendSymSessionInfo *sess_info = session_info;
+ VhostUserMsg msg = {
+ .hdr.request = VHOST_USER_CREATE_CRYPTO_SESSION,
+ .hdr.flags = VHOST_USER_VERSION,
+ .hdr.size = sizeof(msg.payload.session),
+ };
+
+ assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);
+
+ if (!crypto_session) {
+ error_report("vhost-user trying to send unhandled ioctl");
+ return -ENOTSUP;
+ }
+
+ memcpy(&msg.payload.session.session_setup_data, sess_info,
+ sizeof(CryptoDevBackendSymSessionInfo));
+ if (sess_info->key_len) {
+ memcpy(&msg.payload.session.key, sess_info->cipher_key,
+ sess_info->key_len);
+ }
+ if (sess_info->auth_key_len > 0) {
+ memcpy(&msg.payload.session.auth_key, sess_info->auth_key,
+ sess_info->auth_key_len);
+ }
+ ret = vhost_user_write(dev, &msg, NULL, 0);
+ if (ret < 0) {
+        error_report("vhost_user_write() returned %d, create session failed",
+ ret);
+ return ret;
+ }
+
+ ret = vhost_user_read(dev, &msg);
+ if (ret < 0) {
+        error_report("vhost_user_read() returned %d, create session failed",
+ ret);
+ return ret;
+ }
+
+ if (msg.hdr.request != VHOST_USER_CREATE_CRYPTO_SESSION) {
+ error_report("Received unexpected msg type. Expected %d received %d",
+ VHOST_USER_CREATE_CRYPTO_SESSION, msg.hdr.request);
+ return -EPROTO;
+ }
+
+ if (msg.hdr.size != sizeof(msg.payload.session)) {
+ error_report("Received bad msg size.");
+ return -EPROTO;
+ }
+
+ if (msg.payload.session.session_id < 0) {
+        error_report("Bad session id: %" PRId64,
+ msg.payload.session.session_id);
+ return -EINVAL;
+ }
+ *session_id = msg.payload.session.session_id;
+
+ return 0;
+}
+
+static int
+vhost_user_crypto_close_session(struct vhost_dev *dev, uint64_t session_id)
+{
+ int ret;
+ bool crypto_session = virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_CRYPTO_SESSION);
+ VhostUserMsg msg = {
+ .hdr.request = VHOST_USER_CLOSE_CRYPTO_SESSION,
+ .hdr.flags = VHOST_USER_VERSION,
+ .hdr.size = sizeof(msg.payload.u64),
+ };
+ msg.payload.u64 = session_id;
+
+ if (!crypto_session) {
+ error_report("vhost-user trying to send unhandled ioctl");
+ return -ENOTSUP;
+ }
+
+ ret = vhost_user_write(dev, &msg, NULL, 0);
+ if (ret < 0) {
+        error_report("vhost_user_write() returned %d, close session failed",
+ ret);
+ return ret;
+ }
+
+ return 0;
+}
+
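+/*
+ * vhost-user can only handle memory sections that are backed by a file
+ * descriptor, i.e. shareable memory.
+ */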
+static bool vhost_user_mem_section_filter(struct vhost_dev *dev,
+ MemoryRegionSection *section)
+{
+ bool result;
+
+ result = memory_region_get_fd(section->mr) >= 0;
+
+ return result;
+}
+
+static int vhost_user_get_inflight_fd(struct vhost_dev *dev,
+ uint16_t queue_size,
+ struct vhost_inflight *inflight)
+{
+ void *addr;
+ int fd;
+ int ret;
+ struct vhost_user *u = dev->opaque;
+ CharBackend *chr = u->user->chr;
+ VhostUserMsg msg = {
+ .hdr.request = VHOST_USER_GET_INFLIGHT_FD,
+ .hdr.flags = VHOST_USER_VERSION,
+ .payload.inflight.num_queues = dev->nvqs,
+ .payload.inflight.queue_size = queue_size,
+ .hdr.size = sizeof(msg.payload.inflight),
+ };
+
+ if (!virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
+ return 0;
+ }
+
+ ret = vhost_user_write(dev, &msg, NULL, 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = vhost_user_read(dev, &msg);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (msg.hdr.request != VHOST_USER_GET_INFLIGHT_FD) {
+ error_report("Received unexpected msg type. "
+ "Expected %d received %d",
+ VHOST_USER_GET_INFLIGHT_FD, msg.hdr.request);
+ return -EPROTO;
+ }
+
+ if (msg.hdr.size != sizeof(msg.payload.inflight)) {
+ error_report("Received bad msg size.");
+ return -EPROTO;
+ }
+
+ if (!msg.payload.inflight.mmap_size) {
+ return 0;
+ }
+
+ fd = qemu_chr_fe_get_msgfd(chr);
+ if (fd < 0) {
+ error_report("Failed to get mem fd");
+ return -EIO;
+ }
+
+ addr = mmap(0, msg.payload.inflight.mmap_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, msg.payload.inflight.mmap_offset);
+
+ if (addr == MAP_FAILED) {
+ error_report("Failed to mmap mem fd");
+ close(fd);
+ return -EFAULT;
+ }
+
+ inflight->addr = addr;
+ inflight->fd = fd;
+ inflight->size = msg.payload.inflight.mmap_size;
+ inflight->offset = msg.payload.inflight.mmap_offset;
+ inflight->queue_size = queue_size;
+
+ return 0;
+}
+
+static int vhost_user_set_inflight_fd(struct vhost_dev *dev,
+ struct vhost_inflight *inflight)
+{
+ VhostUserMsg msg = {
+ .hdr.request = VHOST_USER_SET_INFLIGHT_FD,
+ .hdr.flags = VHOST_USER_VERSION,
+ .payload.inflight.mmap_size = inflight->size,
+ .payload.inflight.mmap_offset = inflight->offset,
+ .payload.inflight.num_queues = dev->nvqs,
+ .payload.inflight.queue_size = inflight->queue_size,
+ .hdr.size = sizeof(msg.payload.inflight),
+ };
+
+ if (!virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
+ return 0;
+ }
+
+ return vhost_user_write(dev, &msg, &inflight->fd, 1);
+}
+
+static void vhost_user_state_destroy(gpointer data)
+{
+ VhostUserHostNotifier *n = (VhostUserHostNotifier *) data;
+ if (n) {
+ vhost_user_host_notifier_remove(n, NULL);
+ object_unparent(OBJECT(&n->mr));
+ /*
+ * We can't free until vhost_user_host_notifier_remove has
+         * done its thing, so schedule the free with RCU.
+ */
+ g_free_rcu(n, rcu);
+ }
+}
+
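+/*
+ * Initialize the shared VhostUserState. Fails if the chardev is already
+ * in use by another vhost-user device.
+ */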
+bool vhost_user_init(VhostUserState *user, CharBackend *chr, Error **errp)
+{
+ if (user->chr) {
+ error_setg(errp, "Cannot initialize vhost-user state");
+ return false;
+ }
+ user->chr = chr;
+ user->memory_slots = 0;
+ user->notifiers = g_ptr_array_new_full(VIRTIO_QUEUE_MAX / 4,
+ &vhost_user_state_destroy);
+ return true;
+}
+
+void vhost_user_cleanup(VhostUserState *user)
+{
+ if (!user->chr) {
+ return;
+ }
+ memory_region_transaction_begin();
+ user->notifiers = (GPtrArray *) g_ptr_array_free(user->notifiers, true);
+ memory_region_transaction_commit();
+ user->chr = NULL;
+}
+
+typedef struct {
+ vu_async_close_fn cb;
+ DeviceState *dev;
+ CharBackend *cd;
+ struct vhost_dev *vhost;
+} VhostAsyncCallback;
+
+static void vhost_user_async_close_bh(void *opaque)
+{
+ VhostAsyncCallback *data = opaque;
+ struct vhost_dev *vhost = data->vhost;
+
+ /*
+ * If the vhost_dev has been cleared in the meantime there is
+ * nothing left to do as some other path has completed the
+ * cleanup.
+ */
+ if (vhost->vdev) {
+ data->cb(data->dev);
+ }
+
+ g_free(data);
+}
+
+/*
+ * We only schedule the work if the machine is running. If suspended
+ * we want to keep all the in-flight data as is for migration
+ * purposes.
+ */
+void vhost_user_async_close(DeviceState *d,
+ CharBackend *chardev, struct vhost_dev *vhost,
+ vu_async_close_fn cb)
+{
+ if (!runstate_check(RUN_STATE_SHUTDOWN)) {
+ /*
+ * A close event may happen during a read/write, but vhost
+ * code assumes the vhost_dev remains setup, so delay the
+ * stop & clear.
+ */
+ AioContext *ctx = qemu_get_current_aio_context();
+ VhostAsyncCallback *data = g_new0(VhostAsyncCallback, 1);
+
+ /* Save data for the callback */
+ data->cb = cb;
+ data->dev = d;
+ data->cd = chardev;
+ data->vhost = vhost;
+
+ /* Disable any further notifications on the chardev */
+ qemu_chr_fe_set_handlers(chardev,
+ NULL, NULL, NULL, NULL, NULL, NULL,
+ false);
+
+ aio_bh_schedule_oneshot(ctx, vhost_user_async_close_bh, data);
+
+ /*
+ * Move vhost device to the stopped state. The vhost-user device
+         * will be cleaned up and disconnected in the BH. This can be useful
+         * in the vhost migration code: if a disconnect is caught, the
+         * generic vhost code can get the device state without knowing its
+         * type (in this case vhost-user).
+ *
+ * Note if the vhost device is fully cleared by the time we
+ * execute the bottom half we won't continue with the cleanup.
+ */
+ vhost->started = false;
+ }
+}
+
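+/*
+ * With the STATUS protocol feature, mirror device start/stop in the
+ * backend's device status: set ACKNOWLEDGE, DRIVER and DRIVER_OK for the
+ * last queue pair on start, reset the status to 0 on stop.
+ */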
+static int vhost_user_dev_start(struct vhost_dev *dev, bool started)
+{
+ if (!virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_STATUS)) {
+ return 0;
+ }
+
+ /* Set device status only for last queue pair */
+ if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
+ return 0;
+ }
+
+ if (started) {
+ return vhost_user_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
+ VIRTIO_CONFIG_S_DRIVER |
+ VIRTIO_CONFIG_S_DRIVER_OK);
+ } else {
+ return vhost_user_set_status(dev, 0);
+ }
+}
+
+const VhostOps user_ops = {
+ .backend_type = VHOST_BACKEND_TYPE_USER,
+ .vhost_backend_init = vhost_user_backend_init,
+ .vhost_backend_cleanup = vhost_user_backend_cleanup,
+ .vhost_backend_memslots_limit = vhost_user_memslots_limit,
+ .vhost_set_log_base = vhost_user_set_log_base,
+ .vhost_set_mem_table = vhost_user_set_mem_table,
+ .vhost_set_vring_addr = vhost_user_set_vring_addr,
+ .vhost_set_vring_endian = vhost_user_set_vring_endian,
+ .vhost_set_vring_num = vhost_user_set_vring_num,
+ .vhost_set_vring_base = vhost_user_set_vring_base,
+ .vhost_get_vring_base = vhost_user_get_vring_base,
+ .vhost_set_vring_kick = vhost_user_set_vring_kick,
+ .vhost_set_vring_call = vhost_user_set_vring_call,
+ .vhost_set_vring_err = vhost_user_set_vring_err,
+ .vhost_set_features = vhost_user_set_features,
+ .vhost_get_features = vhost_user_get_features,
+ .vhost_set_owner = vhost_user_set_owner,
+ .vhost_reset_device = vhost_user_reset_device,
+ .vhost_get_vq_index = vhost_user_get_vq_index,
+ .vhost_set_vring_enable = vhost_user_set_vring_enable,
+ .vhost_requires_shm_log = vhost_user_requires_shm_log,
+ .vhost_migration_done = vhost_user_migration_done,
+ .vhost_backend_can_merge = vhost_user_can_merge,
+ .vhost_net_set_mtu = vhost_user_net_set_mtu,
+ .vhost_set_iotlb_callback = vhost_user_set_iotlb_callback,
+ .vhost_send_device_iotlb_msg = vhost_user_send_device_iotlb_msg,
+ .vhost_get_config = vhost_user_get_config,
+ .vhost_set_config = vhost_user_set_config,
+ .vhost_crypto_create_session = vhost_user_crypto_create_session,
+ .vhost_crypto_close_session = vhost_user_crypto_close_session,
+ .vhost_backend_mem_section_filter = vhost_user_mem_section_filter,
+ .vhost_get_inflight_fd = vhost_user_get_inflight_fd,
+ .vhost_set_inflight_fd = vhost_user_set_inflight_fd,
+ .vhost_dev_start = vhost_user_dev_start,
+};
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
new file mode 100644
index 00000000..7468e44b
--- /dev/null
+++ b/hw/virtio/vhost-vdpa.c
@@ -0,0 +1,1313 @@
+/*
+ * vhost-vdpa
+ *
+ * Copyright(c) 2017-2018 Intel Corporation.
+ * Copyright(c) 2020 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include <linux/vhost.h>
+#include <linux/vfio.h>
+#include <sys/eventfd.h>
+#include <sys/ioctl.h>
+#include "hw/virtio/vhost.h"
+#include "hw/virtio/vhost-backend.h"
+#include "hw/virtio/virtio-net.h"
+#include "hw/virtio/vhost-shadow-virtqueue.h"
+#include "hw/virtio/vhost-vdpa.h"
+#include "exec/address-spaces.h"
+#include "migration/blocker.h"
+#include "qemu/cutils.h"
+#include "qemu/main-loop.h"
+#include "cpu.h"
+#include "trace.h"
+#include "qapi/error.h"
+
+/*
+ * Return one past the end of the section. Be careful with uint64_t
+ * conversions!
+ */
+static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section)
+{
+ Int128 llend = int128_make64(section->offset_within_address_space);
+ llend = int128_add(llend, section->size);
+ llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
+
+ return llend;
+}
+
+static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
+ uint64_t iova_min,
+ uint64_t iova_max)
+{
+ Int128 llend;
+
+ if ((!memory_region_is_ram(section->mr) &&
+ !memory_region_is_iommu(section->mr)) ||
+ memory_region_is_protected(section->mr) ||
+ /* vhost-vDPA doesn't allow MMIO to be mapped */
+ memory_region_is_ram_device(section->mr)) {
+ return true;
+ }
+
+ if (section->offset_within_address_space < iova_min) {
+ error_report("RAM section out of device range (min=0x%" PRIx64
+ ", addr=0x%" HWADDR_PRIx ")",
+ iova_min, section->offset_within_address_space);
+ return true;
+ }
+
+ llend = vhost_vdpa_section_end(section);
+ if (int128_gt(llend, int128_make64(iova_max))) {
+ error_report("RAM section out of device range (max=0x%" PRIx64
+ ", end addr=0x%" PRIx64 ")",
+ iova_max, int128_get64(llend));
+ return true;
+ }
+
+ return false;
+}
+
+int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
+ void *vaddr, bool readonly)
+{
+ struct vhost_msg_v2 msg = {};
+ int fd = v->device_fd;
+ int ret = 0;
+
+ msg.type = v->msg_type;
+ msg.iotlb.iova = iova;
+ msg.iotlb.size = size;
+ msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
+ msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
+ msg.iotlb.type = VHOST_IOTLB_UPDATE;
+
+ trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.iotlb.iova, msg.iotlb.size,
+ msg.iotlb.uaddr, msg.iotlb.perm, msg.iotlb.type);
+
+ if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
+ error_report("failed to write, fd=%d, errno=%d (%s)",
+ fd, errno, strerror(errno));
+        return -EIO;
+ }
+
+ return ret;
+}
+
+int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, hwaddr size)
+{
+ struct vhost_msg_v2 msg = {};
+ int fd = v->device_fd;
+ int ret = 0;
+
+ msg.type = v->msg_type;
+ msg.iotlb.iova = iova;
+ msg.iotlb.size = size;
+ msg.iotlb.type = VHOST_IOTLB_INVALIDATE;
+
+ trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.iotlb.iova,
+ msg.iotlb.size, msg.iotlb.type);
+
+ if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
+ error_report("failed to write, fd=%d, errno=%d (%s)",
+ fd, errno, strerror(errno));
+        return -EIO;
+ }
+
+ return ret;
+}
+
+static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa *v)
+{
+ int fd = v->device_fd;
+ struct vhost_msg_v2 msg = {
+ .type = v->msg_type,
+ .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
+ };
+
+ trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
+ if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
+ error_report("failed to write, fd=%d, errno=%d (%s)",
+ fd, errno, strerror(errno));
+ }
+}
+
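+/*
+ * Send VHOST_IOTLB_BATCH_BEGIN at most once per batch; the matching
+ * VHOST_IOTLB_BATCH_END is sent from the listener commit callback.
+ */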
+static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa *v)
+{
+ if (v->dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
+ !v->iotlb_batch_begin_sent) {
+ vhost_vdpa_listener_begin_batch(v);
+ }
+
+ v->iotlb_batch_begin_sent = true;
+}
+
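+/* Close an open IOTLB batch by sending VHOST_IOTLB_BATCH_END. */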
+static void vhost_vdpa_listener_commit(MemoryListener *listener)
+{
+ struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
+ struct vhost_dev *dev = v->dev;
+ struct vhost_msg_v2 msg = {};
+ int fd = v->device_fd;
+
+ if (!(dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {
+ return;
+ }
+
+ if (!v->iotlb_batch_begin_sent) {
+ return;
+ }
+
+ msg.type = v->msg_type;
+ msg.iotlb.type = VHOST_IOTLB_BATCH_END;
+
+ trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);
+ if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
+ error_report("failed to write, fd=%d, errno=%d (%s)",
+ fd, errno, strerror(errno));
+ }
+
+ v->iotlb_batch_begin_sent = false;
+}
+
+static void vhost_vdpa_listener_region_add(MemoryListener *listener,
+ MemoryRegionSection *section)
+{
+ DMAMap mem_region = {};
+ struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
+ hwaddr iova;
+ Int128 llend, llsize;
+ void *vaddr;
+ int ret;
+
+ if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
+ v->iova_range.last)) {
+ return;
+ }
+
+ if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
+ (section->offset_within_region & ~TARGET_PAGE_MASK))) {
+ error_report("%s received unaligned region", __func__);
+ return;
+ }
+
+ iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
+ llend = vhost_vdpa_section_end(section);
+ if (int128_ge(int128_make64(iova), llend)) {
+ return;
+ }
+
+ memory_region_ref(section->mr);
+
+ /* Here we assume that memory_region_is_ram(section->mr)==true */
+
+ vaddr = memory_region_get_ram_ptr(section->mr) +
+ section->offset_within_region +
+ (iova - section->offset_within_address_space);
+
+ trace_vhost_vdpa_listener_region_add(v, iova, int128_get64(llend),
+ vaddr, section->readonly);
+
+ llsize = int128_sub(llend, int128_make64(iova));
+ if (v->shadow_vqs_enabled) {
+ int r;
+
+        mem_region.translated_addr = (hwaddr)(uintptr_t)vaddr;
+        mem_region.size = int128_get64(llsize) - 1;
+        mem_region.perm = IOMMU_ACCESS_FLAG(true, section->readonly);
+
+ r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
+ if (unlikely(r != IOVA_OK)) {
+ error_report("Can't allocate a mapping (%d)", r);
+ goto fail;
+ }
+
+ iova = mem_region.iova;
+ }
+
+ vhost_vdpa_iotlb_batch_begin_once(v);
+ ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
+ vaddr, section->readonly);
+ if (ret) {
+        error_report("vhost-vdpa: DMA map failed");
+ goto fail_map;
+ }
+
+ return;
+
+fail_map:
+ if (v->shadow_vqs_enabled) {
+ vhost_iova_tree_remove(v->iova_tree, mem_region);
+ }
+
+fail:
+ /*
+ * On the initfn path, store the first error in the container so we
+     * can gracefully fail. At runtime, there's not much we can do other
+ * than throw a hardware error.
+ */
+ error_report("vhost-vdpa: DMA mapping failed, unable to continue");
+ return;
+}
+
+static void vhost_vdpa_listener_region_del(MemoryListener *listener,
+ MemoryRegionSection *section)
+{
+ struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
+ hwaddr iova;
+ Int128 llend, llsize;
+ int ret;
+
+ if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
+ v->iova_range.last)) {
+ return;
+ }
+
+ if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
+ (section->offset_within_region & ~TARGET_PAGE_MASK))) {
+ error_report("%s received unaligned region", __func__);
+ return;
+ }
+
+ iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
+ llend = vhost_vdpa_section_end(section);
+
+ trace_vhost_vdpa_listener_region_del(v, iova, int128_get64(llend));
+
+ if (int128_ge(int128_make64(iova), llend)) {
+ return;
+ }
+
+ llsize = int128_sub(llend, int128_make64(iova));
+
+ if (v->shadow_vqs_enabled) {
+ const DMAMap *result;
+ const void *vaddr = memory_region_get_ram_ptr(section->mr) +
+ section->offset_within_region +
+ (iova - section->offset_within_address_space);
+ DMAMap mem_region = {
+ .translated_addr = (hwaddr)(uintptr_t)vaddr,
+ .size = int128_get64(llsize) - 1,
+ };
+
+ result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
+ if (!result) {
+ /* The memory listener map wasn't mapped */
+ return;
+ }
+ iova = result->iova;
+ vhost_iova_tree_remove(v->iova_tree, *result);
+ }
+ vhost_vdpa_iotlb_batch_begin_once(v);
+ ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
+ if (ret) {
+        error_report("vhost-vdpa: DMA unmap failed");
+ }
+
+ memory_region_unref(section->mr);
+}
+
+/*
+ * The IOTLB API is used by vhost-vdpa, which requires incremental updating
+ * of the mapping, so we cannot use the generic vhost memory listener, which
+ * depends on the addnop().
+ */
+static const MemoryListener vhost_vdpa_memory_listener = {
+ .name = "vhost-vdpa",
+ .commit = vhost_vdpa_listener_commit,
+ .region_add = vhost_vdpa_listener_region_add,
+ .region_del = vhost_vdpa_listener_region_del,
+};
+
+static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
+ void *arg)
+{
+ struct vhost_vdpa *v = dev->opaque;
+ int fd = v->device_fd;
+ int ret;
+
+ assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
+
+ ret = ioctl(fd, request, arg);
+ return ret < 0 ? -errno : ret;
+}
+
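+/*
+ * OR the given bits into the device status, then read the status back
+ * to verify that the device accepted them.
+ */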
+static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
+{
+ uint8_t s;
+ int ret;
+
+ trace_vhost_vdpa_add_status(dev, status);
+ ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
+ if (ret < 0) {
+ return ret;
+ }
+
+ s |= status;
+
+ ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (!(s & status)) {
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static void vhost_vdpa_get_iova_range(struct vhost_vdpa *v)
+{
+ int ret = vhost_vdpa_call(v->dev, VHOST_VDPA_GET_IOVA_RANGE,
+ &v->iova_range);
+ if (ret != 0) {
+ v->iova_range.first = 0;
+ v->iova_range.last = UINT64_MAX;
+ }
+
+ trace_vhost_vdpa_get_iova_range(v->dev, v->iova_range.first,
+ v->iova_range.last);
+}
+
+/*
+ * This function is for requests that only need to be applied once.
+ * Typically such a request occurs at the beginning of operation,
+ * before the queues are set up. It should not be used for requests
+ * that operate only once all queues are set, which would need to
+ * check dev->vq_index_end instead.
+ */
+static bool vhost_vdpa_first_dev(struct vhost_dev *dev)
+{
+ struct vhost_vdpa *v = dev->opaque;
+
+ return v->index == 0;
+}
+
+static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
+ uint64_t *features)
+{
+ int ret;
+
+ ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
+ trace_vhost_vdpa_get_features(dev, *features);
+ return ret;
+}
+
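+/*
+ * Allocate one shadow virtqueue per device queue, after checking that
+ * the device features are compatible with SVQ.
+ */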
+static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
+ Error **errp)
+{
+ g_autoptr(GPtrArray) shadow_vqs = NULL;
+ uint64_t dev_features, svq_features;
+ int r;
+ bool ok;
+
+ if (!v->shadow_vqs_enabled) {
+ return 0;
+ }
+
+ r = vhost_vdpa_get_dev_features(hdev, &dev_features);
+ if (r != 0) {
+ error_setg_errno(errp, -r, "Can't get vdpa device features");
+ return r;
+ }
+
+ svq_features = dev_features;
+ ok = vhost_svq_valid_features(svq_features, errp);
+ if (unlikely(!ok)) {
+ return -1;
+ }
+
+ shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
+ for (unsigned n = 0; n < hdev->nvqs; ++n) {
+ g_autoptr(VhostShadowVirtqueue) svq;
+
+ svq = vhost_svq_new(v->iova_tree, v->shadow_vq_ops,
+ v->shadow_vq_ops_opaque);
+ if (unlikely(!svq)) {
+ error_setg(errp, "Cannot create svq %u", n);
+ return -1;
+ }
+ g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq));
+ }
+
+ v->shadow_vqs = g_steal_pointer(&shadow_vqs);
+ return 0;
+}
+
+static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
+{
+ struct vhost_vdpa *v;
+ assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
+ trace_vhost_vdpa_init(dev, opaque);
+ int ret;
+
+ /*
+ * Similar to VFIO, we end up pinning all guest memory and have to
+ * disable discarding of RAM.
+ */
+ ret = ram_block_discard_disable(true);
+ if (ret) {
+        error_report("Cannot disable discarding of RAM");
+ return ret;
+ }
+
+ v = opaque;
+ v->dev = dev;
+    dev->opaque = opaque;
+ v->listener = vhost_vdpa_memory_listener;
+ v->msg_type = VHOST_IOTLB_MSG_V2;
+ ret = vhost_vdpa_init_svq(dev, v, errp);
+ if (ret) {
+ goto err;
+ }
+
+ vhost_vdpa_get_iova_range(v);
+
+ if (!vhost_vdpa_first_dev(dev)) {
+ return 0;
+ }
+
+ vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
+ VIRTIO_CONFIG_S_DRIVER);
+
+ return 0;
+
+err:
+ ram_block_discard_disable(false);
+ return ret;
+}
+
+static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
+ int queue_index)
+{
+ size_t page_size = qemu_real_host_page_size();
+ struct vhost_vdpa *v = dev->opaque;
+ VirtIODevice *vdev = dev->vdev;
+ VhostVDPAHostNotifier *n;
+
+ n = &v->notifier[queue_index];
+
+ if (n->addr) {
+ virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false);
+ object_unparent(OBJECT(&n->mr));
+ munmap(n->addr, page_size);
+ n->addr = NULL;
+ }
+}
+
+static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index)
+{
+ size_t page_size = qemu_real_host_page_size();
+ struct vhost_vdpa *v = dev->opaque;
+ VirtIODevice *vdev = dev->vdev;
+ VhostVDPAHostNotifier *n;
+ int fd = v->device_fd;
+ void *addr;
+ char *name;
+
+ vhost_vdpa_host_notifier_uninit(dev, queue_index);
+
+ n = &v->notifier[queue_index];
+
+ addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
+ queue_index * page_size);
+ if (addr == MAP_FAILED) {
+ goto err;
+ }
+
+ name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]",
+ v, queue_index);
+ memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
+ page_size, addr);
+ g_free(name);
+
+ if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) {
+ object_unparent(OBJECT(&n->mr));
+ munmap(addr, page_size);
+ goto err;
+ }
+ n->addr = addr;
+
+ return 0;
+
+err:
+ return -1;
+}
+
+static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
+{
+ int i;
+
+ for (i = dev->vq_index; i < dev->vq_index + n; i++) {
+ vhost_vdpa_host_notifier_uninit(dev, i);
+ }
+}
+
+static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
+{
+ struct vhost_vdpa *v = dev->opaque;
+ int i;
+
+ if (v->shadow_vqs_enabled) {
+        /* FIXME: SVQ is not compatible with host notifier memory regions */
+ return;
+ }
+
+ for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
+ if (vhost_vdpa_host_notifier_init(dev, i)) {
+ goto err;
+ }
+ }
+
+ return;
+
+err:
+ vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
+ return;
+}
+
+static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
+{
+ struct vhost_vdpa *v = dev->opaque;
+ size_t idx;
+
+ if (!v->shadow_vqs) {
+ return;
+ }
+
+ for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
+ vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
+ }
+ g_ptr_array_free(v->shadow_vqs, true);
+}
+
+static int vhost_vdpa_cleanup(struct vhost_dev *dev)
+{
+ struct vhost_vdpa *v;
+ assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
+ v = dev->opaque;
+ trace_vhost_vdpa_cleanup(dev, v);
+ vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
+ memory_listener_unregister(&v->listener);
+ vhost_vdpa_svq_cleanup(dev);
+
+ dev->opaque = NULL;
+ ram_block_discard_disable(false);
+
+ return 0;
+}
+
+static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
+{
+ trace_vhost_vdpa_memslots_limit(dev, INT_MAX);
+ return INT_MAX;
+}
+
+static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
+ struct vhost_memory *mem)
+{
+ if (!vhost_vdpa_first_dev(dev)) {
+ return 0;
+ }
+
+ trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding);
+ if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) &&
+ trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) {
+ int i;
+ for (i = 0; i < mem->nregions; i++) {
+ trace_vhost_vdpa_dump_regions(dev, i,
+ mem->regions[i].guest_phys_addr,
+ mem->regions[i].memory_size,
+ mem->regions[i].userspace_addr,
+ mem->regions[i].flags_padding);
+ }
+ }
+ if (mem->padding) {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int vhost_vdpa_set_features(struct vhost_dev *dev,
+ uint64_t features)
+{
+ struct vhost_vdpa *v = dev->opaque;
+ int ret;
+
+ if (!vhost_vdpa_first_dev(dev)) {
+ return 0;
+ }
+
+ if (v->shadow_vqs_enabled) {
+ if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
+ /*
+ * QEMU is just trying to enable or disable logging. SVQ handles
+             * this separately, so there is no need to forward it.
+ */
+ v->acked_features = features;
+ return 0;
+ }
+
+ v->acked_features = features;
+
+ /* We must not ack _F_LOG if SVQ is enabled */
+ features &= ~BIT_ULL(VHOST_F_LOG_ALL);
+ }
+
+ trace_vhost_vdpa_set_features(dev, features);
+ ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
+ if (ret) {
+ return ret;
+ }
+
+ return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
+}
+
+static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
+{
+ uint64_t features;
+ uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
+ 0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH;
+ int r;
+
+ if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
+ return -EFAULT;
+ }
+
+ features &= f;
+
+ if (vhost_vdpa_first_dev(dev)) {
+ r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features);
+ if (r) {
+ return -EFAULT;
+ }
+ }
+
+ dev->backend_cap = features;
+
+ return 0;
+}
+
+static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
+ uint32_t *device_id)
+{
+ int ret;
+ ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id);
+ trace_vhost_vdpa_get_device_id(dev, *device_id);
+ return ret;
+}
+
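+/* Stop all shadow virtqueues so the device can be safely reset. */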
+static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
+{
+ if (!v->shadow_vqs_enabled) {
+ return;
+ }
+
+ for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
+ VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
+ vhost_svq_stop(svq);
+ }
+}
+
+static int vhost_vdpa_reset_device(struct vhost_dev *dev)
+{
+ struct vhost_vdpa *v = dev->opaque;
+ int ret;
+ uint8_t status = 0;
+
+ vhost_vdpa_reset_svq(v);
+
+ ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
+ trace_vhost_vdpa_reset_device(dev, status);
+ return ret;
+}
+
+static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
+{
+ assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
+
+ trace_vhost_vdpa_get_vq_index(dev, idx, idx);
+ return idx;
+}
+
+static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev)
+{
+ int i;
+ trace_vhost_vdpa_set_vring_ready(dev);
+ for (i = 0; i < dev->nvqs; ++i) {
+ struct vhost_vring_state state = {
+ .index = dev->vq_index + i,
+ .num = 1,
+ };
+ vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
+ }
+ return 0;
+}
+
+static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config,
+ uint32_t config_len)
+{
+ int b, len;
+ char line[QEMU_HEXDUMP_LINE_LEN];
+
+ for (b = 0; b < config_len; b += 16) {
+ len = config_len - b;
+ qemu_hexdump_line(line, b, config, len, false);
+ trace_vhost_vdpa_dump_config(dev, line);
+ }
+}
+
+static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data,
+ uint32_t offset, uint32_t size,
+ uint32_t flags)
+{
+ struct vhost_vdpa_config *config;
+ int ret;
+ unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
+
+ trace_vhost_vdpa_set_config(dev, offset, size, flags);
+ config = g_malloc(size + config_size);
+ config->off = offset;
+ config->len = size;
+ memcpy(config->buf, data, size);
+ if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) &&
+ trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
+ vhost_vdpa_dump_config(dev, data, size);
+ }
+ ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config);
+ g_free(config);
+ return ret;
+}
+
+static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
+ uint32_t config_len, Error **errp)
+{
+ struct vhost_vdpa_config *v_config;
+ unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
+ int ret;
+
+ trace_vhost_vdpa_get_config(dev, config, config_len);
+ v_config = g_malloc(config_len + config_size);
+ v_config->len = config_len;
+ v_config->off = 0;
+ ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config);
+ memcpy(config, v_config->buf, config_len);
+ g_free(v_config);
+ if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) &&
+ trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
+ vhost_vdpa_dump_config(dev, config, config_len);
+ }
+ return ret;
+}
+
+static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
+ struct vhost_vring_state *ring)
+{
+ trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
+ return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
+}
+
+static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
+ struct vhost_vring_file *file)
+{
+ trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
+ return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
+}
+
+static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
+ struct vhost_vring_file *file)
+{
+ trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
+ return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
+}
+
+static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
+ struct vhost_vring_addr *addr)
+{
+ trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
+ addr->desc_user_addr, addr->used_user_addr,
+ addr->avail_user_addr,
+ addr->log_guest_addr);
+
+ return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
+}
+
+/**
+ * Set the shadow virtqueue descriptors to the device
+ *
+ * @dev: The vhost device model
+ * @svq: The shadow virtqueue
+ * @idx: The index of the virtqueue in the vhost device
+ * @errp: Error
+ *
+ * Note that this function does not rewind the kick file descriptor if it
+ * cannot set the call one.
+ */
+static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
+ VhostShadowVirtqueue *svq, unsigned idx,
+ Error **errp)
+{
+ struct vhost_vring_file file = {
+ .index = dev->vq_index + idx,
+ };
+ const EventNotifier *event_notifier = &svq->hdev_kick;
+ int r;
+
+ file.fd = event_notifier_get_fd(event_notifier);
+ r = vhost_vdpa_set_vring_dev_kick(dev, &file);
+ if (unlikely(r != 0)) {
+ error_setg_errno(errp, -r, "Can't set device kick fd");
+ return r;
+ }
+
+ event_notifier = &svq->hdev_call;
+ file.fd = event_notifier_get_fd(event_notifier);
+ r = vhost_vdpa_set_vring_dev_call(dev, &file);
+ if (unlikely(r != 0)) {
+ error_setg_errno(errp, -r, "Can't set device call fd");
+ }
+
+ return r;
+}
+
+/**
+ * Unmap a SVQ area in the device
+ */
+static void vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr addr)
+{
+ const DMAMap needle = {
+ .translated_addr = addr,
+ };
+ const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, &needle);
+ hwaddr size;
+ int r;
+
+ if (unlikely(!result)) {
+ error_report("Unable to find SVQ address to unmap");
+ return;
+ }
+
+ size = ROUND_UP(result->size, qemu_real_host_page_size());
+ r = vhost_vdpa_dma_unmap(v, result->iova, size);
+ if (unlikely(r < 0)) {
+ error_report("Unable to unmap SVQ vring: %s (%d)", g_strerror(-r), -r);
+ return;
+ }
+
+ vhost_iova_tree_remove(v->iova_tree, *result);
+}
+
+static void vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
+ const VhostShadowVirtqueue *svq)
+{
+ struct vhost_vdpa *v = dev->opaque;
+ struct vhost_vring_addr svq_addr;
+
+ vhost_svq_get_vring_addr(svq, &svq_addr);
+
+ vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr);
+
+ vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr);
+}
+
+/**
+ * Map the SVQ area in the device
+ *
+ * @v: Vhost-vdpa device
+ * @needle: The area for which to allocate an IOVA
+ * @errp: Error pointer
+ */
+static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
+ Error **errp)
+{
+ int r;
+
+ r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
+ if (unlikely(r != IOVA_OK)) {
+ error_setg(errp, "Cannot allocate iova (%d)", r);
+ return false;
+ }
+
+ r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1,
+ (void *)(uintptr_t)needle->translated_addr,
+ needle->perm == IOMMU_RO);
+ if (unlikely(r != 0)) {
+ error_setg_errno(errp, -r, "Cannot map region to device");
+ vhost_iova_tree_remove(v->iova_tree, *needle);
+ }
+
+ return r == 0;
+}
+
+/**
+ * Map the shadow virtqueue rings in the device
+ *
+ * @dev: The vhost device
+ * @svq: The shadow virtqueue
+ * @addr: Assigned IOVA addresses
+ * @errp: Error pointer
+ */
+static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
+ const VhostShadowVirtqueue *svq,
+ struct vhost_vring_addr *addr,
+ Error **errp)
+{
+ DMAMap device_region, driver_region;
+ struct vhost_vring_addr svq_addr;
+ struct vhost_vdpa *v = dev->opaque;
+ size_t device_size = vhost_svq_device_area_size(svq);
+ size_t driver_size = vhost_svq_driver_area_size(svq);
+ size_t avail_offset;
+ bool ok;
+
+ ERRP_GUARD();
+ vhost_svq_get_vring_addr(svq, &svq_addr);
+
+ driver_region = (DMAMap) {
+ .translated_addr = svq_addr.desc_user_addr,
+ .size = driver_size - 1,
+ .perm = IOMMU_RO,
+ };
+ ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
+ if (unlikely(!ok)) {
+ error_prepend(errp, "Cannot create vq driver region: ");
+ return false;
+ }
+ addr->desc_user_addr = driver_region.iova;
+ avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
+ addr->avail_user_addr = driver_region.iova + avail_offset;
+
+ device_region = (DMAMap) {
+ .translated_addr = svq_addr.used_user_addr,
+ .size = device_size - 1,
+ .perm = IOMMU_RW,
+ };
+ ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
+ if (unlikely(!ok)) {
+ error_prepend(errp, "Cannot create vq device region: ");
+ vhost_vdpa_svq_unmap_ring(v, driver_region.translated_addr);
+ }
+ addr->used_user_addr = device_region.iova;
+
+ return ok;
+}
+
+static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
+ VhostShadowVirtqueue *svq, unsigned idx,
+ Error **errp)
+{
+ uint16_t vq_index = dev->vq_index + idx;
+ struct vhost_vring_state s = {
+ .index = vq_index,
+ };
+ int r;
+
+ r = vhost_vdpa_set_dev_vring_base(dev, &s);
+ if (unlikely(r)) {
+ error_setg_errno(errp, -r, "Cannot set vring base");
+ return false;
+ }
+
+ r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
+ return r == 0;
+}
+
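+/*
+ * Start every shadow virtqueue: set its base, wire up the kick and call
+ * file descriptors, map its rings into the device IOVA space and point
+ * the device vring addresses at them.
+ */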
+static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
+{
+ struct vhost_vdpa *v = dev->opaque;
+ Error *err = NULL;
+ unsigned i;
+
+ if (!v->shadow_vqs) {
+ return true;
+ }
+
+ for (i = 0; i < v->shadow_vqs->len; ++i) {
+ VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
+ VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
+ struct vhost_vring_addr addr = {
+ .index = dev->vq_index + i,
+ };
+ int r;
+ bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
+ if (unlikely(!ok)) {
+ goto err;
+ }
+
+ vhost_svq_start(svq, dev->vdev, vq);
+ ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
+ if (unlikely(!ok)) {
+ goto err_map;
+ }
+
+ /* Override vring GPA set by vhost subsystem */
+ r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
+ if (unlikely(r != 0)) {
+ error_setg_errno(&err, -r, "Cannot set device address");
+ goto err_set_addr;
+ }
+ }
+
+ return true;
+
+err_set_addr:
+ vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));
+
+err_map:
+ vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));
+
+err:
+ error_reportf_err(err, "Cannot setup SVQ %u: ", i);
+ for (unsigned j = 0; j < i; ++j) {
+ VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
+ vhost_vdpa_svq_unmap_rings(dev, svq);
+ vhost_svq_stop(svq);
+ }
+
+ return false;
+}
+
+static void vhost_vdpa_svqs_stop(struct vhost_dev *dev)
+{
+ struct vhost_vdpa *v = dev->opaque;
+
+ if (!v->shadow_vqs) {
+ return;
+ }
+
+ for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
+ VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
+ vhost_vdpa_svq_unmap_rings(dev, svq);
+ }
+}
+
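+/*
+ * Start or stop this queue group: (un)install host notifiers and shadow
+ * virtqueues and, once the last group is processed, toggle DRIVER_OK and
+ * the memory listener registration.
+ */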
+static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
+{
+ struct vhost_vdpa *v = dev->opaque;
+ bool ok;
+ trace_vhost_vdpa_dev_start(dev, started);
+
+ if (started) {
+ vhost_vdpa_host_notifiers_init(dev);
+ ok = vhost_vdpa_svqs_start(dev);
+ if (unlikely(!ok)) {
+ return -1;
+ }
+ vhost_vdpa_set_vring_ready(dev);
+ } else {
+ vhost_vdpa_svqs_stop(dev);
+ vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
+ }
+
+ if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
+ return 0;
+ }
+
+ if (started) {
+ memory_listener_register(&v->listener, &address_space_memory);
+ return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
+ } else {
+ vhost_vdpa_reset_device(dev);
+ vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
+ VIRTIO_CONFIG_S_DRIVER);
+ memory_listener_unregister(&v->listener);
+
+ return 0;
+ }
+}
+
+static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
+ struct vhost_log *log)
+{
+ struct vhost_vdpa *v = dev->opaque;
+ if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) {
+ return 0;
+ }
+
+ trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd,
+ log->log);
+ return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
+}
+
+static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
+ struct vhost_vring_addr *addr)
+{
+ struct vhost_vdpa *v = dev->opaque;
+
+ if (v->shadow_vqs_enabled) {
+ /*
+ * Device vring addr was set at device start. SVQ base is handled by
+ * VirtQueue code.
+ */
+ return 0;
+ }
+
+ return vhost_vdpa_set_vring_dev_addr(dev, addr);
+}
+
+static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
+ struct vhost_vring_state *ring)
+{
+ trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
+ return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
+}
+
+static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
+ struct vhost_vring_state *ring)
+{
+ struct vhost_vdpa *v = dev->opaque;
+ VirtQueue *vq = virtio_get_queue(dev->vdev, ring->index);
+
+ /*
+ * vhost-vdpa devices do not support in-flight requests, so mark all
+ * of them as available.
+ *
+ * TODO: This is ok for networking, but other kinds of devices might
+ * have problems with these retransmissions.
+ */
+ while (virtqueue_rewind(vq, 1)) {
+ continue;
+ }
+ if (v->shadow_vqs_enabled) {
+ /*
+ * Device vring base was set at device start. SVQ base is handled by
+ * VirtQueue code.
+ */
+ return 0;
+ }
+
+ return vhost_vdpa_set_dev_vring_base(dev, ring);
+}
+
+static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
+ struct vhost_vring_state *ring)
+{
+ struct vhost_vdpa *v = dev->opaque;
+ int ret;
+
+ if (v->shadow_vqs_enabled) {
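+ /*
+ * In SVQ mode the device's used index is private to the shadow
+ * virtqueue; report the guest-visible last avail index tracked by
+ * the VirtQueue code instead.
+ */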
+ ring->num = virtio_queue_get_last_avail_idx(dev->vdev, ring->index);
+ return 0;
+ }
+
+ ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
+ trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
+ return ret;
+}
+
+static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
+ struct vhost_vring_file *file)
+{
+ struct vhost_vdpa *v = dev->opaque;
+ int vdpa_idx = file->index - dev->vq_index;
+
+ if (v->shadow_vqs_enabled) {
+ VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
+ vhost_svq_set_svq_kick_fd(svq, file->fd);
+ return 0;
+ } else {
+ return vhost_vdpa_set_vring_dev_kick(dev, file);
+ }
+}
+
+static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
+ struct vhost_vring_file *file)
+{
+ struct vhost_vdpa *v = dev->opaque;
+
+ if (v->shadow_vqs_enabled) {
+ int vdpa_idx = file->index - dev->vq_index;
+ VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
+
+ vhost_svq_set_svq_call_fd(svq, file->fd);
+ return 0;
+ } else {
+ return vhost_vdpa_set_vring_dev_call(dev, file);
+ }
+}
+
+static int vhost_vdpa_get_features(struct vhost_dev *dev,
+ uint64_t *features)
+{
+ struct vhost_vdpa *v = dev->opaque;
+ int ret = vhost_vdpa_get_dev_features(dev, features);
+
+ if (ret == 0 && v->shadow_vqs_enabled) {
+ /*
+ * Add SVQ logging capabilities: SVQ tracks guest writes itself, so
+ * dirty logging can be offered even when the device lacks
+ * VHOST_F_LOG_ALL.
+ */
+ *features |= BIT_ULL(VHOST_F_LOG_ALL);
+ }
+
+ return ret;
+}
+
+static int vhost_vdpa_set_owner(struct vhost_dev *dev)
+{
+ if (!vhost_vdpa_first_dev(dev)) {
+ return 0;
+ }
+
+ trace_vhost_vdpa_set_owner(dev);
+ return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
+}
+
+static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
+ struct vhost_vring_addr *addr, struct vhost_virtqueue *vq)
+{
+ assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
+ addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
+ addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
+ addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
+ trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
+ addr->avail_user_addr, addr->used_user_addr);
+ return 0;
+}
+
+static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
+{
+ return true;
+}
+
+const VhostOps vdpa_ops = {
+ .backend_type = VHOST_BACKEND_TYPE_VDPA,
+ .vhost_backend_init = vhost_vdpa_init,
+ .vhost_backend_cleanup = vhost_vdpa_cleanup,
+ .vhost_set_log_base = vhost_vdpa_set_log_base,
+ .vhost_set_vring_addr = vhost_vdpa_set_vring_addr,
+ .vhost_set_vring_num = vhost_vdpa_set_vring_num,
+ .vhost_set_vring_base = vhost_vdpa_set_vring_base,
+ .vhost_get_vring_base = vhost_vdpa_get_vring_base,
+ .vhost_set_vring_kick = vhost_vdpa_set_vring_kick,
+ .vhost_set_vring_call = vhost_vdpa_set_vring_call,
+ .vhost_get_features = vhost_vdpa_get_features,
+ .vhost_set_backend_cap = vhost_vdpa_set_backend_cap,
+ .vhost_set_owner = vhost_vdpa_set_owner,
+ .vhost_set_vring_endian = NULL,
+ .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit,
+ .vhost_set_mem_table = vhost_vdpa_set_mem_table,
+ .vhost_set_features = vhost_vdpa_set_features,
+ .vhost_reset_device = vhost_vdpa_reset_device,
+ .vhost_get_vq_index = vhost_vdpa_get_vq_index,
+ .vhost_get_config = vhost_vdpa_get_config,
+ .vhost_set_config = vhost_vdpa_set_config,
+ .vhost_requires_shm_log = NULL,
+ .vhost_migration_done = NULL,
+ .vhost_backend_can_merge = NULL,
+ .vhost_net_set_mtu = NULL,
+ .vhost_set_iotlb_callback = NULL,
+ .vhost_send_device_iotlb_msg = NULL,
+ .vhost_dev_start = vhost_vdpa_dev_start,
+ .vhost_get_device_id = vhost_vdpa_get_device_id,
+ .vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
+ .vhost_force_iommu = vhost_vdpa_force_iommu,
+};
diff --git a/hw/virtio/vhost-vsock-common.c b/hw/virtio/vhost-vsock-common.c
new file mode 100644
index 00000000..d21c72b4
--- /dev/null
+++ b/hw/virtio/vhost-vsock-common.c
@@ -0,0 +1,300 @@
+/*
+ * Parent class for vhost-vsock devices
+ *
+ * Copyright 2015-2020 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version. See the COPYING file in the
+ * top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "standard-headers/linux/virtio_vsock.h"
+#include "qapi/error.h"
+#include "hw/virtio/virtio-access.h"
+#include "qemu/error-report.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/vhost.h"
+#include "hw/virtio/vhost-vsock.h"
+#include "qemu/iov.h"
+#include "monitor/monitor.h"
+
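+/*
+ * Only the feature bits listed here are negotiated with the vhost
+ * backend; the array is terminated by VHOST_INVALID_FEATURE_BIT.
+ */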
+const int feature_bits[] = {
+ VIRTIO_VSOCK_F_SEQPACKET,
+ VIRTIO_F_RING_RESET,
+ VHOST_INVALID_FEATURE_BIT
+};
+
+uint64_t vhost_vsock_common_get_features(VirtIODevice *vdev, uint64_t features,
+ Error **errp)
+{
+ VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
+
+ if (vvc->seqpacket != ON_OFF_AUTO_OFF) {
+ virtio_add_feature(&features, VIRTIO_VSOCK_F_SEQPACKET);
+ }
+
+ features = vhost_get_features(&vvc->vhost_dev, feature_bits, features);
+
+ if (vvc->seqpacket == ON_OFF_AUTO_ON &&
+ !virtio_has_feature(features, VIRTIO_VSOCK_F_SEQPACKET)) {
+ error_setg(errp, "vhost-vsock backend doesn't support seqpacket");
+ }
+
+ return features;
+}
+
+int vhost_vsock_common_start(VirtIODevice *vdev)
+{
+ VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
+ BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+ int ret;
+ int i;
+
+ if (!k->set_guest_notifiers) {
+ error_report("binding does not support guest notifiers");
+ return -ENOSYS;
+ }
+
+ ret = vhost_dev_enable_notifiers(&vvc->vhost_dev, vdev);
+ if (ret < 0) {
+ error_report("Error enabling host notifiers: %d", -ret);
+ return ret;
+ }
+
+ ret = k->set_guest_notifiers(qbus->parent, vvc->vhost_dev.nvqs, true);
+ if (ret < 0) {
+ error_report("Error binding guest notifier: %d", -ret);
+ goto err_host_notifiers;
+ }
+
+ vvc->vhost_dev.acked_features = vdev->guest_features;
+ ret = vhost_dev_start(&vvc->vhost_dev, vdev, true);
+ if (ret < 0) {
+ error_report("Error starting vhost: %d", -ret);
+ goto err_guest_notifiers;
+ }
+
+ /*
+ * guest_notifier_mask/pending aren't used yet, so just unmask
+ * everything here. virtio-pci will do the right thing by
+ * enabling/disabling irqfd.
+ */
+ for (i = 0; i < vvc->vhost_dev.nvqs; i++) {
+ vhost_virtqueue_mask(&vvc->vhost_dev, vdev, i, false);
+ }
+
+ return 0;
+
+err_guest_notifiers:
+ k->set_guest_notifiers(qbus->parent, vvc->vhost_dev.nvqs, false);
+err_host_notifiers:
+ vhost_dev_disable_notifiers(&vvc->vhost_dev, vdev);
+ return ret;
+}
+
+void vhost_vsock_common_stop(VirtIODevice *vdev)
+{
+ VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
+ BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+ int ret;
+
+ if (!k->set_guest_notifiers) {
+ return;
+ }
+
+ vhost_dev_stop(&vvc->vhost_dev, vdev, true);
+
+ ret = k->set_guest_notifiers(qbus->parent, vvc->vhost_dev.nvqs, false);
+ if (ret < 0) {
+ error_report("vhost guest notifier cleanup failed: %d", ret);
+ return;
+ }
+
+ vhost_dev_disable_notifiers(&vvc->vhost_dev, vdev);
+}
+
+
+static void vhost_vsock_common_handle_output(VirtIODevice *vdev, VirtQueue *vq)
+{
+ /* Do nothing */
+}
+
+static void vhost_vsock_common_guest_notifier_mask(VirtIODevice *vdev, int idx,
+ bool mask)
+{
+ VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
+
+ vhost_virtqueue_mask(&vvc->vhost_dev, vdev, idx, mask);
+}
+
+static bool vhost_vsock_common_guest_notifier_pending(VirtIODevice *vdev,
+ int idx)
+{
+ VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
+
+ return vhost_virtqueue_pending(&vvc->vhost_dev, idx);
+}
+
+static void vhost_vsock_common_send_transport_reset(VHostVSockCommon *vvc)
+{
+ VirtQueueElement *elem;
+ VirtQueue *vq = vvc->event_vq;
+ struct virtio_vsock_event event = {
+ .id = cpu_to_le32(VIRTIO_VSOCK_EVENT_TRANSPORT_RESET),
+ };
+
+ elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
+ if (!elem) {
+ error_report("vhost-vsock missed transport reset event");
+ return;
+ }
+
+ if (elem->out_num) {
+ error_report("invalid vhost-vsock event virtqueue element with "
+ "out buffers");
+ goto err;
+ }
+
+ if (iov_from_buf(elem->in_sg, elem->in_num, 0,
+ &event, sizeof(event)) != sizeof(event)) {
+ error_report("vhost-vsock event virtqueue element is too short");
+ goto err;
+ }
+
+ virtqueue_push(vq, elem, sizeof(event));
+ virtio_notify(VIRTIO_DEVICE(vvc), vq);
+
+ g_free(elem);
+ return;
+
+err:
+ virtqueue_detach_element(vq, elem, 0);
+ g_free(elem);
+}
+
+static void vhost_vsock_common_post_load_timer_cleanup(VHostVSockCommon *vvc)
+{
+ if (!vvc->post_load_timer) {
+ return;
+ }
+
+ timer_free(vvc->post_load_timer);
+ vvc->post_load_timer = NULL;
+}
+
+static void vhost_vsock_common_post_load_timer_cb(void *opaque)
+{
+ VHostVSockCommon *vvc = opaque;
+
+ vhost_vsock_common_post_load_timer_cleanup(vvc);
+ vhost_vsock_common_send_transport_reset(vvc);
+}
+
+int vhost_vsock_common_pre_save(void *opaque)
+{
+ VHostVSockCommon *vvc = opaque;
+
+ /*
+ * At this point, the backend must be stopped; otherwise it might
+ * keep writing to guest memory.
+ */
+ assert(!vhost_dev_is_started(&vvc->vhost_dev));
+
+ return 0;
+}
+
+int vhost_vsock_common_post_load(void *opaque, int version_id)
+{
+ VHostVSockCommon *vvc = opaque;
+ VirtIODevice *vdev = VIRTIO_DEVICE(vvc);
+
+ if (virtio_queue_get_addr(vdev, 2)) {
+ /*
+ * Defer the transport reset event to a VM clock timer so that virtqueue
+ * changes happen after migration has completed.
+ */
+ assert(!vvc->post_load_timer);
+ vvc->post_load_timer =
+ timer_new_ns(QEMU_CLOCK_VIRTUAL,
+ vhost_vsock_common_post_load_timer_cb,
+ vvc);
+ timer_mod(vvc->post_load_timer, 1);
+ }
+ return 0;
+}
+
+void vhost_vsock_common_realize(VirtIODevice *vdev)
+{
+ VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
+
+ virtio_init(vdev, VIRTIO_ID_VSOCK, sizeof(struct virtio_vsock_config));
+
+ /* Receive and transmit queues belong to vhost */
+ vvc->recv_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE,
+ vhost_vsock_common_handle_output);
+ vvc->trans_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE,
+ vhost_vsock_common_handle_output);
+
+ /* The event queue belongs to QEMU */
+ vvc->event_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE,
+ vhost_vsock_common_handle_output);
+
+ vvc->vhost_dev.nvqs = ARRAY_SIZE(vvc->vhost_vqs);
+ vvc->vhost_dev.vqs = vvc->vhost_vqs;
+
+ vvc->post_load_timer = NULL;
+}
+
+void vhost_vsock_common_unrealize(VirtIODevice *vdev)
+{
+ VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
+
+ vhost_vsock_common_post_load_timer_cleanup(vvc);
+
+ virtio_delete_queue(vvc->recv_vq);
+ virtio_delete_queue(vvc->trans_vq);
+ virtio_delete_queue(vvc->event_vq);
+ virtio_cleanup(vdev);
+}
+
+static struct vhost_dev *vhost_vsock_common_get_vhost(VirtIODevice *vdev)
+{
+ VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
+ return &vvc->vhost_dev;
+}
+
+static Property vhost_vsock_common_properties[] = {
+ DEFINE_PROP_ON_OFF_AUTO("seqpacket", VHostVSockCommon, seqpacket,
+ ON_OFF_AUTO_AUTO),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vhost_vsock_common_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+
+ device_class_set_props(dc, vhost_vsock_common_properties);
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+ vdc->guest_notifier_mask = vhost_vsock_common_guest_notifier_mask;
+ vdc->guest_notifier_pending = vhost_vsock_common_guest_notifier_pending;
+ vdc->get_vhost = vhost_vsock_common_get_vhost;
+}
+
+static const TypeInfo vhost_vsock_common_info = {
+ .name = TYPE_VHOST_VSOCK_COMMON,
+ .parent = TYPE_VIRTIO_DEVICE,
+ .instance_size = sizeof(VHostVSockCommon),
+ .class_init = vhost_vsock_common_class_init,
+ .abstract = true,
+};
+
+static void vhost_vsock_common_register_types(void)
+{
+ type_register_static(&vhost_vsock_common_info);
+}
+
+type_init(vhost_vsock_common_register_types)
diff --git a/hw/virtio/vhost-vsock-pci.c b/hw/virtio/vhost-vsock-pci.c
new file mode 100644
index 00000000..9f34414d
--- /dev/null
+++ b/hw/virtio/vhost-vsock-pci.c
@@ -0,0 +1,96 @@
+/*
+ * Vhost vsock PCI Bindings
+ *
+ * Copyright 2015 Red Hat, Inc.
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version. See the COPYING file in the
+ * top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/virtio/virtio-pci.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/vhost-vsock.h"
+#include "qemu/module.h"
+#include "qom/object.h"
+
+typedef struct VHostVSockPCI VHostVSockPCI;
+
+/*
+ * vhost-vsock-pci: This extends VirtioPCIProxy.
+ */
+#define TYPE_VHOST_VSOCK_PCI "vhost-vsock-pci-base"
+DECLARE_INSTANCE_CHECKER(VHostVSockPCI, VHOST_VSOCK_PCI,
+ TYPE_VHOST_VSOCK_PCI)
+
+struct VHostVSockPCI {
+ VirtIOPCIProxy parent_obj;
+ VHostVSock vdev;
+};
+
+/* vhost-vsock-pci */
+
+static Property vhost_vsock_pci_properties[] = {
+ DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 3),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vhost_vsock_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+ VHostVSockPCI *dev = VHOST_VSOCK_PCI(vpci_dev);
+ DeviceState *vdev = DEVICE(&dev->vdev);
+ VirtIODevice *virtio_dev = VIRTIO_DEVICE(vdev);
+
+ /*
+ * To avoid migration issues, we only force virtio version 1 when the
+ * legacy check is enabled, i.e. on new machine types (>= 5.1).
+ */
+ if (!virtio_legacy_check_disabled(virtio_dev)) {
+ virtio_pci_force_virtio_1(vpci_dev);
+ }
+
+ qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
+}
+
+static void vhost_vsock_pci_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+ PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+ k->realize = vhost_vsock_pci_realize;
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+ device_class_set_props(dc, vhost_vsock_pci_properties);
+ pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+ pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_VSOCK;
+ pcidev_k->revision = 0x00;
+ pcidev_k->class_id = PCI_CLASS_COMMUNICATION_OTHER;
+}
+
+static void vhost_vsock_pci_instance_init(Object *obj)
+{
+ VHostVSockPCI *dev = VHOST_VSOCK_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VHOST_VSOCK);
+}
+
+static const VirtioPCIDeviceTypeInfo vhost_vsock_pci_info = {
+ .base_name = TYPE_VHOST_VSOCK_PCI,
+ .generic_name = "vhost-vsock-pci",
+ .non_transitional_name = "vhost-vsock-pci-non-transitional",
+ .instance_size = sizeof(VHostVSockPCI),
+ .instance_init = vhost_vsock_pci_instance_init,
+ .class_init = vhost_vsock_pci_class_init,
+};
+
+static void virtio_pci_vhost_register(void)
+{
+ virtio_pci_types_register(&vhost_vsock_pci_info);
+}
+
+type_init(virtio_pci_vhost_register)
diff --git a/hw/virtio/vhost-vsock.c b/hw/virtio/vhost-vsock.c
new file mode 100644
index 00000000..aa16d584
--- /dev/null
+++ b/hw/virtio/vhost-vsock.c
@@ -0,0 +1,239 @@
+/*
+ * Virtio vsock device
+ *
+ * Copyright 2015 Red Hat, Inc.
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version. See the COPYING file in the
+ * top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "standard-headers/linux/virtio_vsock.h"
+#include "qapi/error.h"
+#include "hw/virtio/virtio-access.h"
+#include "qemu/error-report.h"
+#include "qemu/sockets.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/vhost-vsock.h"
+#include "monitor/monitor.h"
+
+static void vhost_vsock_get_config(VirtIODevice *vdev, uint8_t *config)
+{
+ VHostVSock *vsock = VHOST_VSOCK(vdev);
+ struct virtio_vsock_config vsockcfg = {};
+
+ virtio_stq_p(vdev, &vsockcfg.guest_cid, vsock->conf.guest_cid);
+ memcpy(config, &vsockcfg, sizeof(vsockcfg));
+}
+
+static int vhost_vsock_set_guest_cid(VirtIODevice *vdev)
+{
+ VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
+ VHostVSock *vsock = VHOST_VSOCK(vdev);
+ const VhostOps *vhost_ops = vvc->vhost_dev.vhost_ops;
+ int ret;
+
+ if (!vhost_ops->vhost_vsock_set_guest_cid) {
+ return -ENOSYS;
+ }
+
+ ret = vhost_ops->vhost_vsock_set_guest_cid(&vvc->vhost_dev,
+ vsock->conf.guest_cid);
+ if (ret < 0) {
+ return -errno;
+ }
+ return 0;
+}
+
+static int vhost_vsock_set_running(VirtIODevice *vdev, int start)
+{
+ VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
+ const VhostOps *vhost_ops = vvc->vhost_dev.vhost_ops;
+ int ret;
+
+ if (!vhost_ops->vhost_vsock_set_running) {
+ return -ENOSYS;
+ }
+
+ ret = vhost_ops->vhost_vsock_set_running(&vvc->vhost_dev, start);
+ if (ret < 0) {
+ return -errno;
+ }
+ return 0;
+}
+
+
+static void vhost_vsock_set_status(VirtIODevice *vdev, uint8_t status)
+{
+ VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev);
+ bool should_start = virtio_device_should_start(vdev, status);
+ int ret;
+
+ if (vhost_dev_is_started(&vvc->vhost_dev) == should_start) {
+ return;
+ }
+
+ if (should_start) {
+ ret = vhost_vsock_common_start(vdev);
+ if (ret < 0) {
+ return;
+ }
+
+ ret = vhost_vsock_set_running(vdev, 1);
+ if (ret < 0) {
+ vhost_vsock_common_stop(vdev);
+ error_report("Error starting vhost vsock: %d", -ret);
+ return;
+ }
+ } else {
+ ret = vhost_vsock_set_running(vdev, 0);
+ if (ret < 0) {
+ error_report("vhost vsock set running failed: %d", ret);
+ return;
+ }
+
+ vhost_vsock_common_stop(vdev);
+ }
+}
+
+static uint64_t vhost_vsock_get_features(VirtIODevice *vdev,
+ uint64_t requested_features,
+ Error **errp)
+{
+ return vhost_vsock_common_get_features(vdev, requested_features, errp);
+}
+
+static const VMStateDescription vmstate_virtio_vhost_vsock = {
+ .name = "virtio-vhost_vsock",
+ .minimum_version_id = VHOST_VSOCK_SAVEVM_VERSION,
+ .version_id = VHOST_VSOCK_SAVEVM_VERSION,
+ .fields = (VMStateField[]) {
+ VMSTATE_VIRTIO_DEVICE,
+ VMSTATE_END_OF_LIST()
+ },
+ .pre_save = vhost_vsock_common_pre_save,
+ .post_load = vhost_vsock_common_post_load,
+};
+
+static void vhost_vsock_device_realize(DeviceState *dev, Error **errp)
+{
+ VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(dev);
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VHostVSock *vsock = VHOST_VSOCK(dev);
+ int vhostfd;
+ int ret;
+
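+ /*
+ * The guest CID is typically given on the command line, e.g.
+ * "-device vhost-vsock-pci,guest-cid=3"; CIDs 0-2 are reserved for
+ * the hypervisor, loopback and the host respectively.
+ */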
+ /* Refuse to use reserved CID numbers */
+ if (vsock->conf.guest_cid <= 2) {
+ error_setg(errp, "guest-cid property must be greater than 2");
+ return;
+ }
+
+ if (vsock->conf.guest_cid > UINT32_MAX) {
+ error_setg(errp, "guest-cid property must be a 32-bit number");
+ return;
+ }
+
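+ /*
+ * "vhostfd" lets a management layer pass in an already opened
+ * /dev/vhost-vsock file descriptor; otherwise we open the vhost
+ * device ourselves below.
+ */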
+ if (vsock->conf.vhostfd) {
+ vhostfd = monitor_fd_param(monitor_cur(), vsock->conf.vhostfd, errp);
+ if (vhostfd == -1) {
+ error_prepend(errp, "vhost-vsock: unable to parse vhostfd: ");
+ return;
+ }
+
+ if (!g_unix_set_fd_nonblocking(vhostfd, true, NULL)) {
+ error_setg_errno(errp, errno,
+ "vhost-vsock: unable to set non-blocking mode");
+ return;
+ }
+ } else {
+ vhostfd = open("/dev/vhost-vsock", O_RDWR);
+ if (vhostfd < 0) {
+ error_setg_errno(errp, errno,
+ "vhost-vsock: failed to open vhost device");
+ return;
+ }
+
+ if (!g_unix_set_fd_nonblocking(vhostfd, true, NULL)) {
+ error_setg_errno(errp, errno,
+ "Failed to set FD nonblocking");
+ return;
+ }
+ }
+
+ vhost_vsock_common_realize(vdev);
+
+ ret = vhost_dev_init(&vvc->vhost_dev, (void *)(uintptr_t)vhostfd,
+ VHOST_BACKEND_TYPE_KERNEL, 0, errp);
+ if (ret < 0) {
+ /*
+ * vhostfd is closed by vhost_dev_cleanup, which is called
+ * by vhost_dev_init on initialization error.
+ */
+ goto err_virtio;
+ }
+
+ ret = vhost_vsock_set_guest_cid(vdev);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "vhost-vsock: unable to set guest cid");
+ goto err_vhost_dev;
+ }
+
+ return;
+
+err_vhost_dev:
+ /* vhost_dev_cleanup() closes the vhostfd passed to vhost_dev_init() */
+ vhost_dev_cleanup(&vvc->vhost_dev);
+err_virtio:
+ vhost_vsock_common_unrealize(vdev);
+}
+
+static void vhost_vsock_device_unrealize(DeviceState *dev)
+{
+ VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(dev);
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+
+ /* This will stop vhost backend if appropriate. */
+ vhost_vsock_set_status(vdev, 0);
+
+ vhost_dev_cleanup(&vvc->vhost_dev);
+ vhost_vsock_common_unrealize(vdev);
+}
+
+static Property vhost_vsock_properties[] = {
+ DEFINE_PROP_UINT64("guest-cid", VHostVSock, conf.guest_cid, 0),
+ DEFINE_PROP_STRING("vhostfd", VHostVSock, conf.vhostfd),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vhost_vsock_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+
+ device_class_set_props(dc, vhost_vsock_properties);
+ dc->vmsd = &vmstate_virtio_vhost_vsock;
+ vdc->realize = vhost_vsock_device_realize;
+ vdc->unrealize = vhost_vsock_device_unrealize;
+ vdc->get_features = vhost_vsock_get_features;
+ vdc->get_config = vhost_vsock_get_config;
+ vdc->set_status = vhost_vsock_set_status;
+}
+
+static const TypeInfo vhost_vsock_info = {
+ .name = TYPE_VHOST_VSOCK,
+ .parent = TYPE_VHOST_VSOCK_COMMON,
+ .instance_size = sizeof(VHostVSock),
+ .class_init = vhost_vsock_class_init,
+};
+
+static void vhost_vsock_register_types(void)
+{
+ type_register_static(&vhost_vsock_info);
+}
+
+type_init(vhost_vsock_register_types)
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
new file mode 100644
index 00000000..7fb008bc
--- /dev/null
+++ b/hw/virtio/vhost.c
@@ -0,0 +1,1942 @@
+/*
+ * vhost support
+ *
+ * Copyright Red Hat, Inc. 2010
+ *
+ * Authors:
+ * Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "hw/virtio/vhost.h"
+#include "qemu/atomic.h"
+#include "qemu/range.h"
+#include "qemu/error-report.h"
+#include "qemu/memfd.h"
+#include "standard-headers/linux/vhost_types.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/virtio-access.h"
+#include "migration/blocker.h"
+#include "migration/qemu-file-types.h"
+#include "sysemu/dma.h"
+#include "trace.h"
+
+/* enabled until disconnected backend stabilizes */
+#define _VHOST_DEBUG 1
+
+#ifdef _VHOST_DEBUG
+#define VHOST_OPS_DEBUG(retval, fmt, ...) \
+ do { \
+ error_report(fmt ": %s (%d)", ## __VA_ARGS__, \
+ strerror(-retval), -retval); \
+ } while (0)
+#else
+#define VHOST_OPS_DEBUG(retval, fmt, ...) \
+ do { } while (0)
+#endif
+
+static struct vhost_log *vhost_log;
+static struct vhost_log *vhost_log_shm;
+
+static unsigned int used_memslots;
+static QLIST_HEAD(, vhost_dev) vhost_devices =
+ QLIST_HEAD_INITIALIZER(vhost_devices);
+
+bool vhost_has_free_slot(void)
+{
+ unsigned int slots_limit = ~0U;
+ struct vhost_dev *hdev;
+
+ QLIST_FOREACH(hdev, &vhost_devices, entry) {
+ unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
+ slots_limit = MIN(slots_limit, r);
+ }
+ return slots_limit > used_memslots;
+}
+
+static void vhost_dev_sync_region(struct vhost_dev *dev,
+ MemoryRegionSection *section,
+ uint64_t mfirst, uint64_t mlast,
+ uint64_t rfirst, uint64_t rlast)
+{
+ vhost_log_chunk_t *log = dev->log->log;
+
+ uint64_t start = MAX(mfirst, rfirst);
+ uint64_t end = MIN(mlast, rlast);
+ vhost_log_chunk_t *from = log + start / VHOST_LOG_CHUNK;
+ vhost_log_chunk_t *to = log + end / VHOST_LOG_CHUNK + 1;
+ uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK);
+
+ if (end < start) {
+ return;
+ }
+ assert(end / VHOST_LOG_CHUNK < dev->log_size);
+ assert(start / VHOST_LOG_CHUNK < dev->log_size);
+
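+ /*
+ * Each vhost_log_chunk_t is a bitmap covering VHOST_LOG_CHUNK bytes
+ * of guest memory, one bit per VHOST_LOG_PAGE page.
+ */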
+ for (;from < to; ++from) {
+ vhost_log_chunk_t log;
+ /* We first check with a non-atomic read: much cheaper, and we
+ * expect non-dirty to be the common case. */
+ if (!*from) {
+ addr += VHOST_LOG_CHUNK;
+ continue;
+ }
+ /* Data must be read atomically. We don't really need barrier semantics
+ * but it's easier to use atomic_* than to roll our own. */
+ log = qatomic_xchg(from, 0);
+ while (log) {
+ int bit = ctzl(log);
+ hwaddr page_addr;
+ hwaddr section_offset;
+ hwaddr mr_offset;
+ page_addr = addr + bit * VHOST_LOG_PAGE;
+ section_offset = page_addr - section->offset_within_address_space;
+ mr_offset = section_offset + section->offset_within_region;
+ memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
+ log &= ~(0x1ull << bit);
+ }
+ addr += VHOST_LOG_CHUNK;
+ }
+}
+
+static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
+ MemoryRegionSection *section,
+ hwaddr first,
+ hwaddr last)
+{
+ int i;
+ hwaddr start_addr;
+ hwaddr end_addr;
+
+ if (!dev->log_enabled || !dev->started) {
+ return 0;
+ }
+ start_addr = section->offset_within_address_space;
+ end_addr = range_get_last(start_addr, int128_get64(section->size));
+ start_addr = MAX(first, start_addr);
+ end_addr = MIN(last, end_addr);
+
+ for (i = 0; i < dev->mem->nregions; ++i) {
+ struct vhost_memory_region *reg = dev->mem->regions + i;
+ vhost_dev_sync_region(dev, section, start_addr, end_addr,
+ reg->guest_phys_addr,
+ range_get_last(reg->guest_phys_addr,
+ reg->memory_size));
+ }
+ for (i = 0; i < dev->nvqs; ++i) {
+ struct vhost_virtqueue *vq = dev->vqs + i;
+
+ if (!vq->used_phys && !vq->used_size) {
+ continue;
+ }
+
+ vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
+ range_get_last(vq->used_phys, vq->used_size));
+ }
+ return 0;
+}
+
+static void vhost_log_sync(MemoryListener *listener,
+ MemoryRegionSection *section)
+{
+ struct vhost_dev *dev = container_of(listener, struct vhost_dev,
+ memory_listener);
+ vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
+}
+
+static void vhost_log_sync_range(struct vhost_dev *dev,
+ hwaddr first, hwaddr last)
+{
+ int i;
+ /* FIXME: this is N^2 in number of sections */
+ for (i = 0; i < dev->n_mem_sections; ++i) {
+ MemoryRegionSection *section = &dev->mem_sections[i];
+ vhost_sync_dirty_bitmap(dev, section, first, last);
+ }
+}
+
+static uint64_t vhost_get_log_size(struct vhost_dev *dev)
+{
+ uint64_t log_size = 0;
+ int i;
+ for (i = 0; i < dev->mem->nregions; ++i) {
+ struct vhost_memory_region *reg = dev->mem->regions + i;
+ uint64_t last = range_get_last(reg->guest_phys_addr,
+ reg->memory_size);
+ log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
+ }
+ return log_size;
+}
+
+static int vhost_set_backend_type(struct vhost_dev *dev,
+ VhostBackendType backend_type)
+{
+ int r = 0;
+
+ switch (backend_type) {
+#ifdef CONFIG_VHOST_KERNEL
+ case VHOST_BACKEND_TYPE_KERNEL:
+ dev->vhost_ops = &kernel_ops;
+ break;
+#endif
+#ifdef CONFIG_VHOST_USER
+ case VHOST_BACKEND_TYPE_USER:
+ dev->vhost_ops = &user_ops;
+ break;
+#endif
+#ifdef CONFIG_VHOST_VDPA
+ case VHOST_BACKEND_TYPE_VDPA:
+ dev->vhost_ops = &vdpa_ops;
+ break;
+#endif
+ default:
+ error_report("Unknown vhost backend type");
+ r = -1;
+ }
+
+ return r;
+}
+
+static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
+{
+ Error *err = NULL;
+ struct vhost_log *log;
+ uint64_t logsize = size * sizeof(*(log->log));
+ int fd = -1;
+
+ log = g_new0(struct vhost_log, 1);
+ if (share) {
+ log->log = qemu_memfd_alloc("vhost-log", logsize,
+ F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
+ &fd, &err);
+ if (err) {
+ error_report_err(err);
+ g_free(log);
+ return NULL;
+ }
+ memset(log->log, 0, logsize);
+ } else {
+ log->log = g_malloc0(logsize);
+ }
+
+ log->size = size;
+ log->refcnt = 1;
+ log->fd = fd;
+
+ return log;
+}
+
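+/*
+ * A single log is shared and refcounted by all vhost devices of the
+ * same flavour: the cached one is reused when the requested size
+ * matches, otherwise a new log replaces it. The shm variant lives in
+ * a memfd so that out-of-process backends can map it.
+ */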
+static struct vhost_log *vhost_log_get(uint64_t size, bool share)
+{
+ struct vhost_log *log = share ? vhost_log_shm : vhost_log;
+
+ if (!log || log->size != size) {
+ log = vhost_log_alloc(size, share);
+ if (share) {
+ vhost_log_shm = log;
+ } else {
+ vhost_log = log;
+ }
+ } else {
+ ++log->refcnt;
+ }
+
+ return log;
+}
+
+static void vhost_log_put(struct vhost_dev *dev, bool sync)
+{
+ struct vhost_log *log = dev->log;
+
+ if (!log) {
+ return;
+ }
+
+ --log->refcnt;
+ if (log->refcnt == 0) {
+ /* Sync only the range covered by the old log */
+ if (dev->log_size && sync) {
+ vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
+ }
+
+ if (vhost_log == log) {
+ g_free(log->log);
+ vhost_log = NULL;
+ } else if (vhost_log_shm == log) {
+ qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
+ log->fd);
+ vhost_log_shm = NULL;
+ }
+
+ g_free(log);
+ }
+
+ dev->log = NULL;
+ dev->log_size = 0;
+}
+
+static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
+{
+ return dev->vhost_ops->vhost_requires_shm_log &&
+ dev->vhost_ops->vhost_requires_shm_log(dev);
+}
+
+static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
+{
+ struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
+ uint64_t log_base = (uintptr_t)log->log;
+ int r;
+
+ /* Inform the backend of the log switch; this must be done before
+ releasing the current log, to ensure no logging is lost. */
+ r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
+ if (r < 0) {
+ VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
+ }
+
+ vhost_log_put(dev, true);
+ dev->log = log;
+ dev->log_size = size;
+}
+
+static bool vhost_dev_has_iommu(struct vhost_dev *dev)
+{
+ VirtIODevice *vdev = dev->vdev;
+
+ /*
+ * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend supports the
+ * incremental memory mapping API via the IOTLB API. For platforms
+ * that do not have an IOMMU, there is no need to enable this
+ * feature, which may cause unnecessary IOTLB miss/update
+ * transactions.
+ */
+ if (vdev) {
+ return virtio_bus_device_iommu_enabled(vdev) &&
+ virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
+ } else {
+ return false;
+ }
+}
+
+static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
+ hwaddr *plen, bool is_write)
+{
+ if (!vhost_dev_has_iommu(dev)) {
+ return cpu_physical_memory_map(addr, plen, is_write);
+ } else {
+ return (void *)(uintptr_t)addr;
+ }
+}
+
+static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer,
+ hwaddr len, int is_write,
+ hwaddr access_len)
+{
+ if (!vhost_dev_has_iommu(dev)) {
+ cpu_physical_memory_unmap(buffer, len, is_write, access_len);
+ }
+}
+
+static int vhost_verify_ring_part_mapping(void *ring_hva,
+ uint64_t ring_gpa,
+ uint64_t ring_size,
+ void *reg_hva,
+ uint64_t reg_gpa,
+ uint64_t reg_size)
+{
+ uint64_t hva_ring_offset;
+ uint64_t ring_last = range_get_last(ring_gpa, ring_size);
+ uint64_t reg_last = range_get_last(reg_gpa, reg_size);
+
+ if (ring_last < reg_gpa || ring_gpa > reg_last) {
+ return 0;
+ }
+ /* check that the whole ring is mapped */
+ if (ring_last > reg_last) {
+ return -ENOMEM;
+ }
+ /* check that the ring's MemoryRegion wasn't replaced */
+ hva_ring_offset = ring_gpa - reg_gpa;
+ if (ring_hva != reg_hva + hva_ring_offset) {
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+static int vhost_verify_ring_mappings(struct vhost_dev *dev,
+ void *reg_hva,
+ uint64_t reg_gpa,
+ uint64_t reg_size)
+{
+ int i, j;
+ int r = 0;
+ const char *part_name[] = {
+ "descriptor table",
+ "available ring",
+ "used ring"
+ };
+
+ if (vhost_dev_has_iommu(dev)) {
+ return 0;
+ }
+
+ for (i = 0; i < dev->nvqs; ++i) {
+ struct vhost_virtqueue *vq = dev->vqs + i;
+
+ if (vq->desc_phys == 0) {
+ continue;
+ }
+
+ j = 0;
+ r = vhost_verify_ring_part_mapping(
+ vq->desc, vq->desc_phys, vq->desc_size,
+ reg_hva, reg_gpa, reg_size);
+ if (r) {
+ break;
+ }
+
+ j++;
+ r = vhost_verify_ring_part_mapping(
+ vq->avail, vq->avail_phys, vq->avail_size,
+ reg_hva, reg_gpa, reg_size);
+ if (r) {
+ break;
+ }
+
+ j++;
+ r = vhost_verify_ring_part_mapping(
+ vq->used, vq->used_phys, vq->used_size,
+ reg_hva, reg_gpa, reg_size);
+ if (r) {
+ break;
+ }
+ }
+
+ if (r == -ENOMEM) {
+ error_report("Unable to map %s for ring %d", part_name[j], i);
+ } else if (r == -EBUSY) {
+ error_report("%s relocated for ring %d", part_name[j], i);
+ }
+ return r;
+}
+
+/*
+ * vhost_section: identify sections needed for vhost access
+ *
+ * We only care about RAM sections here (where virtqueue and guest
+ * internals accessed by virtio might live). If we find one we still
+ * allow the backend to potentially filter it out of our list.
+ */
+static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)
+{
+ MemoryRegion *mr = section->mr;
+
+ if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) {
+ uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr);
+ uint8_t handled_dirty;
+
+ /*
+ * Kernel-based vhost doesn't handle any block that is doing dirty
+ * tracking other than migration, for which it has specific logging
+ * support. For TCG the kernel never gets involved anyway, so we can
+ * also ignore its self-modifying code detection flags. However, a
+ * vhost-user client could still confuse a TCG guest if it rewrites
+ * executable memory that has already been translated.
+ */
+ handled_dirty = (1 << DIRTY_MEMORY_MIGRATION) |
+ (1 << DIRTY_MEMORY_CODE);
+
+ if (dirty_mask & ~handled_dirty) {
+ trace_vhost_reject_section(mr->name, 1);
+ return false;
+ }
+
+ if (dev->vhost_ops->vhost_backend_mem_section_filter &&
+ !dev->vhost_ops->vhost_backend_mem_section_filter(dev, section)) {
+ trace_vhost_reject_section(mr->name, 2);
+ return false;
+ }
+
+ trace_vhost_section(mr->name);
+ return true;
+ } else {
+ trace_vhost_reject_section(mr->name, 3);
+ return false;
+ }
+}
+
+static void vhost_begin(MemoryListener *listener)
+{
+ struct vhost_dev *dev = container_of(listener, struct vhost_dev,
+ memory_listener);
+ dev->tmp_sections = NULL;
+ dev->n_tmp_sections = 0;
+}
+
+static void vhost_commit(MemoryListener *listener)
+{
+ struct vhost_dev *dev = container_of(listener, struct vhost_dev,
+ memory_listener);
+ MemoryRegionSection *old_sections;
+ int n_old_sections;
+ uint64_t log_size;
+ size_t regions_size;
+ int r;
+ int i;
+ bool changed = false;
+
+ /* Note that we can be called before the device is started, but
+ * starting the device then calls set_mem_table, so we need to have
+ * the data structures built.
+ */
+ old_sections = dev->mem_sections;
+ n_old_sections = dev->n_mem_sections;
+ dev->mem_sections = dev->tmp_sections;
+ dev->n_mem_sections = dev->n_tmp_sections;
+
+ if (dev->n_mem_sections != n_old_sections) {
+ changed = true;
+ } else {
+ /* Same size, let's check the contents */
+ for (int i = 0; i < n_old_sections; i++) {
+ if (!MemoryRegionSection_eq(&old_sections[i],
+ &dev->mem_sections[i])) {
+ changed = true;
+ break;
+ }
+ }
+ }
+
+ trace_vhost_commit(dev->started, changed);
+ if (!changed) {
+ goto out;
+ }
+
+ /* Rebuild the regions list from the new sections list */
+ regions_size = offsetof(struct vhost_memory, regions) +
+ dev->n_mem_sections * sizeof dev->mem->regions[0];
+ dev->mem = g_realloc(dev->mem, regions_size);
+ dev->mem->nregions = dev->n_mem_sections;
+ used_memslots = dev->mem->nregions;
+ for (i = 0; i < dev->n_mem_sections; i++) {
+ struct vhost_memory_region *cur_vmr = dev->mem->regions + i;
+ struct MemoryRegionSection *mrs = dev->mem_sections + i;
+
+ cur_vmr->guest_phys_addr = mrs->offset_within_address_space;
+ cur_vmr->memory_size = int128_get64(mrs->size);
+ cur_vmr->userspace_addr =
+ (uintptr_t)memory_region_get_ram_ptr(mrs->mr) +
+ mrs->offset_within_region;
+ cur_vmr->flags_padding = 0;
+ }
+
+ if (!dev->started) {
+ goto out;
+ }
+
+ for (i = 0; i < dev->mem->nregions; i++) {
+ if (vhost_verify_ring_mappings(dev,
+ (void *)(uintptr_t)dev->mem->regions[i].userspace_addr,
+ dev->mem->regions[i].guest_phys_addr,
+ dev->mem->regions[i].memory_size)) {
+ error_report("Verify ring failure on region %d", i);
+ abort();
+ }
+ }
+
+ if (!dev->log_enabled) {
+ r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
+ if (r < 0) {
+ VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
+ }
+ goto out;
+ }
+ log_size = vhost_get_log_size(dev);
+ /* We allocate an extra 4K bytes of log to reduce the number of
+ * reallocations. */
+#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
+ /* To log more, must increase log size before table update. */
+ if (dev->log_size < log_size) {
+ vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
+ }
+ r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
+ if (r < 0) {
+ VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
+ }
+ /* To log less, can only decrease log size after table update. */
+ if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
+ vhost_dev_log_resize(dev, log_size);
+ }
+
+out:
+ /* Deref the old list of sections, this must happen _after_ the
+ * vhost_set_mem_table to ensure the client isn't still using the
+ * section we're about to unref.
+ */
+ while (n_old_sections--) {
+ memory_region_unref(old_sections[n_old_sections].mr);
+ }
+ g_free(old_sections);
+ return;
+}
+
+/* Adds the section data to the tmp_sections structure.
+ * It relies on the listener calling us in memory address order and on
+ * being called for each region (via the _add and _nop methods) so
+ * that it can join neighbours.
+ */
+static void vhost_region_add_section(struct vhost_dev *dev,
+ MemoryRegionSection *section)
+{
+ bool need_add = true;
+ uint64_t mrs_size = int128_get64(section->size);
+ uint64_t mrs_gpa = section->offset_within_address_space;
+ uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
+ section->offset_within_region;
+ RAMBlock *mrs_rb = section->mr->ram_block;
+
+ trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size,
+ mrs_host);
+
+ if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) {
+ /* Round the section to its page size */
+ /* First align the start down to a page boundary */
+ size_t mrs_page = qemu_ram_pagesize(mrs_rb);
+ uint64_t alignage = mrs_host & (mrs_page - 1);
+ if (alignage) {
+ mrs_host -= alignage;
+ mrs_size += alignage;
+ mrs_gpa -= alignage;
+ }
+ /* Now align the size up to a page boundary */
+ alignage = mrs_size & (mrs_page - 1);
+ if (alignage) {
+ mrs_size += mrs_page - alignage;
+ }
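+ /*
+ * e.g. with 4 KiB pages, a section at host address 0x11234 of size
+ * 0x1000 ends up at host 0x11000 with size 0x2000.
+ */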
+ trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa,
+ mrs_size, mrs_host);
+ }
+
+ if (dev->n_tmp_sections) {
+ /* Since we already have at least one section, let's see if
+ * this extends it; since we're scanning in order, we only
+ * have to look at the last one, and the FlatView that calls
+ * us shouldn't have overlaps.
+ */
+ MemoryRegionSection *prev_sec = dev->tmp_sections +
+ (dev->n_tmp_sections - 1);
+ uint64_t prev_gpa_start = prev_sec->offset_within_address_space;
+ uint64_t prev_size = int128_get64(prev_sec->size);
+ uint64_t prev_gpa_end = range_get_last(prev_gpa_start, prev_size);
+ uint64_t prev_host_start =
+ (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) +
+ prev_sec->offset_within_region;
+ uint64_t prev_host_end = range_get_last(prev_host_start, prev_size);
+
+ if (mrs_gpa <= (prev_gpa_end + 1)) {
+ /* OK, looks like overlapping/intersecting - it's possible that
+ * the rounding to page sizes has made them overlap, but they should
+ * match up in the same RAMBlock if they do.
+ */
+ if (mrs_gpa < prev_gpa_start) {
+ error_report("%s:Section '%s' rounded to %"PRIx64
+ " prior to previous '%s' %"PRIx64,
+ __func__, section->mr->name, mrs_gpa,
+ prev_sec->mr->name, prev_gpa_start);
+ /* A way to cleanly fail here would be better */
+ return;
+ }
+ /* Offset from the start of the previous GPA to this GPA */
+ size_t offset = mrs_gpa - prev_gpa_start;
+
+ if (prev_host_start + offset == mrs_host &&
+ section->mr == prev_sec->mr &&
+ (!dev->vhost_ops->vhost_backend_can_merge ||
+ dev->vhost_ops->vhost_backend_can_merge(dev,
+ mrs_host, mrs_size,
+ prev_host_start, prev_size))) {
+ uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size);
+ need_add = false;
+ prev_sec->offset_within_address_space =
+ MIN(prev_gpa_start, mrs_gpa);
+ prev_sec->offset_within_region =
+ MIN(prev_host_start, mrs_host) -
+ (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr);
+ prev_sec->size = int128_make64(max_end - MIN(prev_host_start,
+ mrs_host));
+ trace_vhost_region_add_section_merge(section->mr->name,
+ int128_get64(prev_sec->size),
+ prev_sec->offset_within_address_space,
+ prev_sec->offset_within_region);
+ } else {
+ /* adjoining regions are fine, but overlapping ones with
+ * different blocks/offsets shouldn't happen
+ */
+ if (mrs_gpa != prev_gpa_end + 1) {
+ error_report("%s: Overlapping but not coherent sections "
+ "at %"PRIx64,
+ __func__, mrs_gpa);
+ return;
+ }
+ }
+ }
+ }
+
+ if (need_add) {
+ ++dev->n_tmp_sections;
+ dev->tmp_sections = g_renew(MemoryRegionSection, dev->tmp_sections,
+ dev->n_tmp_sections);
+ dev->tmp_sections[dev->n_tmp_sections - 1] = *section;
+ /* The flatview isn't stable and we don't use it; making it NULL
+ * means we can memcmp the list.
+ */
+ dev->tmp_sections[dev->n_tmp_sections - 1].fv = NULL;
+ memory_region_ref(section->mr);
+ }
+}
+
+/* Used for both add and nop callbacks */
+static void vhost_region_addnop(MemoryListener *listener,
+ MemoryRegionSection *section)
+{
+ struct vhost_dev *dev = container_of(listener, struct vhost_dev,
+ memory_listener);
+
+ if (!vhost_section(dev, section)) {
+ return;
+ }
+ vhost_region_add_section(dev, section);
+}
+
+static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
+{
+ struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n);
+ struct vhost_dev *hdev = iommu->hdev;
+ hwaddr iova = iotlb->iova + iommu->iommu_offset;
+
+ if (vhost_backend_invalidate_device_iotlb(hdev, iova,
+ iotlb->addr_mask + 1)) {
+ error_report("Fail to invalidate device iotlb");
+ }
+}
+
+static void vhost_iommu_region_add(MemoryListener *listener,
+ MemoryRegionSection *section)
+{
+ struct vhost_dev *dev = container_of(listener, struct vhost_dev,
+ iommu_listener);
+ struct vhost_iommu *iommu;
+ Int128 end;
+ int iommu_idx;
+ IOMMUMemoryRegion *iommu_mr;
+ int ret;
+
+ if (!memory_region_is_iommu(section->mr)) {
+ return;
+ }
+
+ iommu_mr = IOMMU_MEMORY_REGION(section->mr);
+
+ iommu = g_malloc0(sizeof(*iommu));
+ end = int128_add(int128_make64(section->offset_within_region),
+ section->size);
+ end = int128_sub(end, int128_one());
+ iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
+ MEMTXATTRS_UNSPECIFIED);
+ iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify,
+ IOMMU_NOTIFIER_DEVIOTLB_UNMAP,
+ section->offset_within_region,
+ int128_get64(end),
+ iommu_idx);
+ iommu->mr = section->mr;
+ iommu->iommu_offset = section->offset_within_address_space -
+ section->offset_within_region;
+ iommu->hdev = dev;
+ ret = memory_region_register_iommu_notifier(section->mr, &iommu->n, NULL);
+ if (ret) {
+ /*
+ * Some vIOMMUs do not support dev-iotlb yet. If so, try to use the
+ * UNMAP legacy message
+ */
+ iommu->n.notifier_flags = IOMMU_NOTIFIER_UNMAP;
+ memory_region_register_iommu_notifier(section->mr, &iommu->n,
+ &error_fatal);
+ }
+ QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next);
+ /* TODO: can replay help performance here? */
+}
+
+static void vhost_iommu_region_del(MemoryListener *listener,
+ MemoryRegionSection *section)
+{
+ struct vhost_dev *dev = container_of(listener, struct vhost_dev,
+ iommu_listener);
+ struct vhost_iommu *iommu;
+
+ if (!memory_region_is_iommu(section->mr)) {
+ return;
+ }
+
+ QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
+ if (iommu->mr == section->mr &&
+ iommu->n.start == section->offset_within_region) {
+ memory_region_unregister_iommu_notifier(iommu->mr,
+ &iommu->n);
+ QLIST_REMOVE(iommu, iommu_next);
+ g_free(iommu);
+ break;
+ }
+ }
+}
+
+static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
+ struct vhost_virtqueue *vq,
+ unsigned idx, bool enable_log)
+{
+ struct vhost_vring_addr addr;
+ int r;
+ memset(&addr, 0, sizeof(struct vhost_vring_addr));
+
+ if (dev->vhost_ops->vhost_vq_get_addr) {
+ r = dev->vhost_ops->vhost_vq_get_addr(dev, &addr, vq);
+ if (r < 0) {
+ VHOST_OPS_DEBUG(r, "vhost_vq_get_addr failed");
+ return r;
+ }
+ } else {
+ addr.desc_user_addr = (uint64_t)(unsigned long)vq->desc;
+ addr.avail_user_addr = (uint64_t)(unsigned long)vq->avail;
+ addr.used_user_addr = (uint64_t)(unsigned long)vq->used;
+ }
+ addr.index = idx;
+ addr.log_guest_addr = vq->used_phys;
+ addr.flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0;
+ r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
+ if (r < 0) {
+ VHOST_OPS_DEBUG(r, "vhost_set_vring_addr failed");
+ }
+ return r;
+}
+
+static int vhost_dev_set_features(struct vhost_dev *dev,
+ bool enable_log)
+{
+ uint64_t features = dev->acked_features;
+ int r;
+ if (enable_log) {
+ features |= 0x1ULL << VHOST_F_LOG_ALL;
+ }
+ if (!vhost_dev_has_iommu(dev)) {
+ features &= ~(0x1ULL << VIRTIO_F_IOMMU_PLATFORM);
+ }
+ if (dev->vhost_ops->vhost_force_iommu) {
+ if (dev->vhost_ops->vhost_force_iommu(dev) == true) {
+ features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM;
+ }
+ }
+ r = dev->vhost_ops->vhost_set_features(dev, features);
+ if (r < 0) {
+ VHOST_OPS_DEBUG(r, "vhost_set_features failed");
+ goto out;
+ }
+ if (dev->vhost_ops->vhost_set_backend_cap) {
+ r = dev->vhost_ops->vhost_set_backend_cap(dev);
+ if (r < 0) {
+ VHOST_OPS_DEBUG(r, "vhost_set_backend_cap failed");
+ goto out;
+ }
+ }
+
+out:
+ return r;
+}
+
+static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
+{
+ int r, i, idx;
+ hwaddr addr;
+
+ r = vhost_dev_set_features(dev, enable_log);
+ if (r < 0) {
+ goto err_features;
+ }
+ for (i = 0; i < dev->nvqs; ++i) {
+ idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
+ addr = virtio_queue_get_desc_addr(dev->vdev, idx);
+ if (!addr) {
+ /*
+ * The queue might not be ready for start. If this is the case,
+ * there is no reason to continue the process. Similar logic is
+ * used by the vhost_virtqueue_start() routine.
+ */
+ continue;
+ }
+ r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
+ enable_log);
+ if (r < 0) {
+ goto err_vq;
+ }
+ }
+ return 0;
+err_vq:
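+ /*
+ * Roll back: restore the previous logging setting on every vq we
+ * already touched, then restore the device features.
+ */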
+ for (; i >= 0; --i) {
+ idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
+ addr = virtio_queue_get_desc_addr(dev->vdev, idx);
+ if (!addr) {
+ continue;
+ }
+ vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
+ dev->log_enabled);
+ }
+ vhost_dev_set_features(dev, dev->log_enabled);
+err_features:
+ return r;
+}
+
+static int vhost_migration_log(MemoryListener *listener, bool enable)
+{
+ struct vhost_dev *dev = container_of(listener, struct vhost_dev,
+ memory_listener);
+ int r;
+ if (enable == dev->log_enabled) {
+ return 0;
+ }
+ if (!dev->started) {
+ dev->log_enabled = enable;
+ return 0;
+ }
+
+ r = 0;
+ if (!enable) {
+ r = vhost_dev_set_log(dev, false);
+ if (r < 0) {
+ goto check_dev_state;
+ }
+ vhost_log_put(dev, false);
+ } else {
+ vhost_dev_log_resize(dev, vhost_get_log_size(dev));
+ r = vhost_dev_set_log(dev, true);
+ if (r < 0) {
+ goto check_dev_state;
+ }
+ }
+
+check_dev_state:
+ dev->log_enabled = enable;
+ /*
+ * vhost-user-* devices could change their state during log
+ * initialization due to disconnect. So check dev state after
+ * vhost communication.
+ */
+ if (!dev->started) {
+ /*
+ * Since the device is stopped, it is okay for migration.
+ * Return success.
+ */
+ r = 0;
+ }
+ if (r) {
+ /* An error occurred. */
+ dev->log_enabled = false;
+ }
+
+ return r;
+}
+
+static void vhost_log_global_start(MemoryListener *listener)
+{
+ int r;
+
+ r = vhost_migration_log(listener, true);
+ if (r < 0) {
+ abort();
+ }
+}
+
+static void vhost_log_global_stop(MemoryListener *listener)
+{
+ int r;
+
+ r = vhost_migration_log(listener, false);
+ if (r < 0) {
+ abort();
+ }
+}
+
+static void vhost_log_start(MemoryListener *listener,
+ MemoryRegionSection *section,
+ int old, int new)
+{
+ /* FIXME: implement */
+}
+
+static void vhost_log_stop(MemoryListener *listener,
+ MemoryRegionSection *section,
+ int old, int new)
+{
+ /* FIXME: implement */
+}
+
+/* The vhost driver natively knows how to handle the vrings of
+ * non-cross-endian legacy devices and modern devices. Only legacy devices
+ * exposed to a bi-endian guest may require the vhost driver to use a
+ * specific endianness.
+ */
+static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
+{
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+ return false;
+ }
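+ /*
+ * Legacy device: an explicit vring endianness is only needed when
+ * the guest's endianness differs from the host's.
+ */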
+#if HOST_BIG_ENDIAN
+ return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
+#else
+ return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
+#endif
+}
+
+static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
+ bool is_big_endian,
+ int vhost_vq_index)
+{
+ int r;
+ struct vhost_vring_state s = {
+ .index = vhost_vq_index,
+ .num = is_big_endian
+ };
+
+ r = dev->vhost_ops->vhost_set_vring_endian(dev, &s);
+ if (r < 0) {
+ VHOST_OPS_DEBUG(r, "vhost_set_vring_endian failed");
+ }
+ return r;
+}
+
+static int vhost_memory_region_lookup(struct vhost_dev *hdev,
+ uint64_t gpa, uint64_t *uaddr,
+ uint64_t *len)
+{
+ int i;
+
+ for (i = 0; i < hdev->mem->nregions; i++) {
+ struct vhost_memory_region *reg = hdev->mem->regions + i;
+
+ if (gpa >= reg->guest_phys_addr &&
+ reg->guest_phys_addr + reg->memory_size > gpa) {
+ *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr;
+ *len = reg->guest_phys_addr + reg->memory_size - gpa;
+ return 0;
+ }
+ }
+
+ return -EFAULT;
+}
+
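+/*
+ * Called on an IOTLB miss reported by the backend: translate the IOVA
+ * through the device's DMA address space and push the resulting
+ * mapping back to the backend.
+ */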
+int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
+{
+ IOMMUTLBEntry iotlb;
+ uint64_t uaddr, len;
+ int ret = -EFAULT;
+
+ RCU_READ_LOCK_GUARD();
+
+ trace_vhost_iotlb_miss(dev, 1);
+
+ iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
+ iova, write,
+ MEMTXATTRS_UNSPECIFIED);
+ if (iotlb.target_as != NULL) {
+ ret = vhost_memory_region_lookup(dev, iotlb.translated_addr,
+ &uaddr, &len);
+ if (ret) {
+ trace_vhost_iotlb_miss(dev, 3);
+ error_report("Fail to lookup the translated address "
+ "%"PRIx64, iotlb.translated_addr);
+ goto out;
+ }
+
+ len = MIN(iotlb.addr_mask + 1, len);
+ iova = iova & ~iotlb.addr_mask;
+
+ ret = vhost_backend_update_device_iotlb(dev, iova, uaddr,
+ len, iotlb.perm);
+ if (ret) {
+ trace_vhost_iotlb_miss(dev, 4);
+ error_report("Fail to update device iotlb");
+ goto out;
+ }
+ }
+
+ trace_vhost_iotlb_miss(dev, 2);
+
+out:
+ return ret;
+}
+
+int vhost_virtqueue_start(struct vhost_dev *dev,
+ struct VirtIODevice *vdev,
+ struct vhost_virtqueue *vq,
+ unsigned idx)
+{
+ BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+ VirtioBusState *vbus = VIRTIO_BUS(qbus);
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
+ hwaddr s, l, a;
+ int r;
+ int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
+ struct vhost_vring_file file = {
+ .index = vhost_vq_index
+ };
+ struct vhost_vring_state state = {
+ .index = vhost_vq_index
+ };
+ struct VirtQueue *vvq = virtio_get_queue(vdev, idx);
+
+ a = virtio_queue_get_desc_addr(vdev, idx);
+ if (a == 0) {
+ /* Queue might not be ready for start */
+ return 0;
+ }
+
+ vq->num = state.num = virtio_queue_get_num(vdev, idx);
+ r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
+ if (r) {
+ VHOST_OPS_DEBUG(r, "vhost_set_vring_num failed");
+ return r;
+ }
+
+ state.num = virtio_queue_get_last_avail_idx(vdev, idx);
+ r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
+ if (r) {
+ VHOST_OPS_DEBUG(r, "vhost_set_vring_base failed");
+ return r;
+ }
+
+ if (vhost_needs_vring_endian(vdev)) {
+ r = vhost_virtqueue_set_vring_endian_legacy(dev,
+ virtio_is_big_endian(vdev),
+ vhost_vq_index);
+ if (r) {
+ return r;
+ }
+ }
+
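+ /*
+ * Map the descriptor, avail and used rings: without a vIOMMU these
+ * become QEMU userspace addresses handed to the backend; with a
+ * vIOMMU the guest IOVA is passed through as-is and resolved later
+ * via IOTLB misses.
+ */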
+ vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx);
+ vq->desc_phys = a;
+ vq->desc = vhost_memory_map(dev, a, &l, false);
+ if (!vq->desc || l != s) {
+ r = -ENOMEM;
+ goto fail_alloc_desc;
+ }
+ vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx);
+ vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx);
+ vq->avail = vhost_memory_map(dev, a, &l, false);
+ if (!vq->avail || l != s) {
+ r = -ENOMEM;
+ goto fail_alloc_avail;
+ }
+ vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
+ vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
+ vq->used = vhost_memory_map(dev, a, &l, true);
+ if (!vq->used || l != s) {
+ r = -ENOMEM;
+ goto fail_alloc_used;
+ }
+
+ r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
+ if (r < 0) {
+ goto fail_alloc;
+ }
+
+ file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
+ r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
+ if (r) {
+ VHOST_OPS_DEBUG(r, "vhost_set_vring_kick failed");
+ goto fail_kick;
+ }
+
+ /* Clear and discard previous events if any. */
+ event_notifier_test_and_clear(&vq->masked_notifier);
+
+ /* Init vring in unmasked state, unless guest_notifier_mask
+ * will do it later.
+ */
+ if (!vdev->use_guest_notifier_mask) {
+ /* TODO: check and handle errors. */
+ vhost_virtqueue_mask(dev, vdev, idx, false);
+ }
+
+ if (k->query_guest_notifiers &&
+ k->query_guest_notifiers(qbus->parent) &&
+ virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) {
+ file.fd = -1;
+ r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
+ if (r) {
+ goto fail_vector;
+ }
+ }
+
+ return 0;
+
+fail_vector:
+fail_kick:
+fail_alloc:
+ vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
+ 0, 0);
+fail_alloc_used:
+ vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
+ 0, 0);
+fail_alloc_avail:
+ vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
+ 0, 0);
+fail_alloc_desc:
+ return r;
+}
+
+void vhost_virtqueue_stop(struct vhost_dev *dev,
+ struct VirtIODevice *vdev,
+ struct vhost_virtqueue *vq,
+ unsigned idx)
+{
+ int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
+ struct vhost_vring_state state = {
+ .index = vhost_vq_index,
+ };
+ int r;
+
+ if (virtio_queue_get_desc_addr(vdev, idx) == 0) {
+ /* Don't stop a virtqueue that might not have been started */
+ return;
+ }
+
+ r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
+ if (r < 0) {
+ VHOST_OPS_DEBUG(r, "vhost VQ %u ring restore failed: %d", idx, r);
+ /* Connection to the backend is broken, so let's sync the internal
+ * last avail idx to the device's used idx.
+ */
+ virtio_queue_restore_last_avail_idx(vdev, idx);
+ } else {
+ virtio_queue_set_last_avail_idx(vdev, idx, state.num);
+ }
+ virtio_queue_invalidate_signalled_used(vdev, idx);
+ virtio_queue_update_used_idx(vdev, idx);
+
+ /* In the cross-endian case, we need to reset the vring endianness to
+ * native as legacy devices expect so by default.
+ */
+ if (vhost_needs_vring_endian(vdev)) {
+ vhost_virtqueue_set_vring_endian_legacy(dev,
+ !virtio_is_big_endian(vdev),
+ vhost_vq_index);
+ }
+
+ vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
+ 1, virtio_queue_get_used_size(vdev, idx));
+ vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
+ 0, virtio_queue_get_avail_size(vdev, idx));
+ vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
+ 0, virtio_queue_get_desc_size(vdev, idx));
+}
+
+static void vhost_eventfd_add(MemoryListener *listener,
+ MemoryRegionSection *section,
+ bool match_data, uint64_t data, EventNotifier *e)
+{
+}
+
+static void vhost_eventfd_del(MemoryListener *listener,
+ MemoryRegionSection *section,
+ bool match_data, uint64_t data, EventNotifier *e)
+{
+}
+
+static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev,
+ int n, uint32_t timeout)
+{
+ int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
+ struct vhost_vring_state state = {
+ .index = vhost_vq_index,
+ .num = timeout,
+ };
+ int r;
+
+ if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) {
+ return -EINVAL;
+ }
+
+ r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state);
+ if (r) {
+ VHOST_OPS_DEBUG(r, "vhost_set_vring_busyloop_timeout failed");
+ return r;
+ }
+
+ return 0;
+}
+
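+/* Invoked when the backend reports a vring error via the error eventfd. */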
+static void vhost_virtqueue_error_notifier(EventNotifier *n)
+{
+ struct vhost_virtqueue *vq = container_of(n, struct vhost_virtqueue,
+ error_notifier);
+ struct vhost_dev *dev = vq->dev;
+ int index = vq - dev->vqs;
+
+ if (event_notifier_test_and_clear(n) && dev->vdev) {
+ VHOST_OPS_DEBUG(-EINVAL, "vhost vring error in virtqueue %d",
+ dev->vq_index + index);
+ }
+}
+
+static int vhost_virtqueue_init(struct vhost_dev *dev,
+ struct vhost_virtqueue *vq, int n)
+{
+ int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
+ struct vhost_vring_file file = {
+ .index = vhost_vq_index,
+ };
+ int r = event_notifier_init(&vq->masked_notifier, 0);
+ if (r < 0) {
+ return r;
+ }
+
+ file.fd = event_notifier_get_wfd(&vq->masked_notifier);
+ r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
+ if (r) {
+ VHOST_OPS_DEBUG(r, "vhost_set_vring_call failed");
+ goto fail_call;
+ }
+
+ vq->dev = dev;
+
+ if (dev->vhost_ops->vhost_set_vring_err) {
+ r = event_notifier_init(&vq->error_notifier, 0);
+ if (r < 0) {
+ goto fail_call;
+ }
+
+ file.fd = event_notifier_get_fd(&vq->error_notifier);
+ r = dev->vhost_ops->vhost_set_vring_err(dev, &file);
+ if (r) {
+ VHOST_OPS_DEBUG(r, "vhost_set_vring_err failed");
+ goto fail_err;
+ }
+
+ event_notifier_set_handler(&vq->error_notifier,
+ vhost_virtqueue_error_notifier);
+ }
+
+ return 0;
+
+fail_err:
+ event_notifier_cleanup(&vq->error_notifier);
+fail_call:
+ event_notifier_cleanup(&vq->masked_notifier);
+ return r;
+}
+
+static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
+{
+ event_notifier_cleanup(&vq->masked_notifier);
+ if (vq->dev->vhost_ops->vhost_set_vring_err) {
+ event_notifier_set_handler(&vq->error_notifier, NULL);
+ event_notifier_cleanup(&vq->error_notifier);
+ }
+}
+
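+/*
+ * Initialize a vhost device: set up the backend, take ownership, query
+ * the backend features, initialize every virtqueue and register the
+ * memory listeners. On failure, partially initialized state is torn
+ * down via vhost_dev_cleanup().
+ */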
+int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
+ VhostBackendType backend_type, uint32_t busyloop_timeout,
+ Error **errp)
+{
+ uint64_t features;
+ int i, r, n_initialized_vqs = 0;
+
+ hdev->vdev = NULL;
+ hdev->migration_blocker = NULL;
+
+ r = vhost_set_backend_type(hdev, backend_type);
+ assert(r >= 0);
+
+ r = hdev->vhost_ops->vhost_backend_init(hdev, opaque, errp);
+ if (r < 0) {
+ goto fail;
+ }
+
+ r = hdev->vhost_ops->vhost_set_owner(hdev);
+ if (r < 0) {
+ error_setg_errno(errp, -r, "vhost_set_owner failed");
+ goto fail;
+ }
+
+ r = hdev->vhost_ops->vhost_get_features(hdev, &features);
+ if (r < 0) {
+ error_setg_errno(errp, -r, "vhost_get_features failed");
+ goto fail;
+ }
+
+ for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
+ r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
+ if (r < 0) {
+ error_setg_errno(errp, -r, "Failed to initialize virtqueue %d", i);
+ goto fail;
+ }
+ }
+
+ if (busyloop_timeout) {
+ for (i = 0; i < hdev->nvqs; ++i) {
+ r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i,
+ busyloop_timeout);
+ if (r < 0) {
+ error_setg_errno(errp, -r, "Failed to set busyloop timeout");
+ goto fail_busyloop;
+ }
+ }
+ }
+
+ hdev->features = features;
+
+ hdev->memory_listener = (MemoryListener) {
+ .name = "vhost",
+ .begin = vhost_begin,
+ .commit = vhost_commit,
+ .region_add = vhost_region_addnop,
+ .region_nop = vhost_region_addnop,
+ .log_start = vhost_log_start,
+ .log_stop = vhost_log_stop,
+ .log_sync = vhost_log_sync,
+ .log_global_start = vhost_log_global_start,
+ .log_global_stop = vhost_log_global_stop,
+ .eventfd_add = vhost_eventfd_add,
+ .eventfd_del = vhost_eventfd_del,
+ .priority = 10
+ };
+
+ hdev->iommu_listener = (MemoryListener) {
+ .name = "vhost-iommu",
+ .region_add = vhost_iommu_region_add,
+ .region_del = vhost_iommu_region_del,
+ };
+
+ if (hdev->migration_blocker == NULL) {
+ if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
+ error_setg(&hdev->migration_blocker,
+ "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
+ } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_alloc_check()) {
+ error_setg(&hdev->migration_blocker,
+ "Migration disabled: failed to allocate shared memory");
+ }
+ }
+
+ if (hdev->migration_blocker != NULL) {
+ r = migrate_add_blocker(hdev->migration_blocker, errp);
+ if (r < 0) {
+ error_free(hdev->migration_blocker);
+ goto fail_busyloop;
+ }
+ }
+
+ hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
+ hdev->n_mem_sections = 0;
+ hdev->mem_sections = NULL;
+ hdev->log = NULL;
+ hdev->log_size = 0;
+ hdev->log_enabled = false;
+ hdev->started = false;
+ memory_listener_register(&hdev->memory_listener, &address_space_memory);
+ QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
+
+ if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
+ error_setg(errp, "vhost backend memory slots limit is less"
+ " than current number of present memory slots");
+ r = -EINVAL;
+ goto fail_busyloop;
+ }
+
+ return 0;
+
+fail_busyloop:
+ if (busyloop_timeout) {
+ while (--i >= 0) {
+ vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0);
+ }
+ }
+fail:
+ hdev->nvqs = n_initialized_vqs;
+ vhost_dev_cleanup(hdev);
+ return r;
+}
+
+void vhost_dev_cleanup(struct vhost_dev *hdev)
+{
+ int i;
+
+ trace_vhost_dev_cleanup(hdev);
+
+ for (i = 0; i < hdev->nvqs; ++i) {
+ vhost_virtqueue_cleanup(hdev->vqs + i);
+ }
+ if (hdev->mem) {
+ /* those are only safe after successful init */
+ memory_listener_unregister(&hdev->memory_listener);
+ QLIST_REMOVE(hdev, entry);
+ }
+ if (hdev->migration_blocker) {
+ migrate_del_blocker(hdev->migration_blocker);
+ error_free(hdev->migration_blocker);
+ }
+ g_free(hdev->mem);
+ g_free(hdev->mem_sections);
+ if (hdev->vhost_ops) {
+ hdev->vhost_ops->vhost_backend_cleanup(hdev);
+ }
+ assert(!hdev->log);
+
+ memset(hdev, 0, sizeof(struct vhost_dev));
+}
+
+/* Stop processing guest IO notifications in qemu.
+ * Start processing them in the vhost backend.
+ */
+int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
+{
+ BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+ int i, r, e;
+
+ /* We will pass the notifiers to the kernel, make sure that QEMU
+ * doesn't interfere.
+ */
+ r = virtio_device_grab_ioeventfd(vdev);
+ if (r < 0) {
+ error_report("binding does not support host notifiers");
+ goto fail;
+ }
+
+ for (i = 0; i < hdev->nvqs; ++i) {
+ r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
+ true);
+ if (r < 0) {
+ error_report("vhost VQ %d notifier binding failed: %d", i, -r);
+ goto fail_vq;
+ }
+ }
+
+ return 0;
+fail_vq:
+ while (--i >= 0) {
+ e = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
+ false);
+ if (e < 0) {
+ error_report("vhost VQ %d notifier cleanup error: %d", i, -r);
+ }
+ assert (e >= 0);
+ virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
+ }
+ virtio_device_release_ioeventfd(vdev);
+fail:
+ return r;
+}
+
+/* Stop processing guest IO notifications in vhost.
+ * Start processing them in qemu.
+ * This might actually run the qemu handlers right away,
+ * so virtio in qemu must be completely set up when this is called.
+ */
+void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
+{
+ BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+ int i, r;
+
+ for (i = 0; i < hdev->nvqs; ++i) {
+ r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
+ false);
+ if (r < 0) {
+ error_report("vhost VQ %d notifier cleanup failed: %d", i, -r);
+ }
+    assert(r >= 0);
+ virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
+ }
+ virtio_device_release_ioeventfd(vdev);
+}
+
+/* Test and clear event pending status.
+ * Should be called after unmask to avoid losing events.
+ */
+bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
+{
+ struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
+ assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
+ return event_notifier_test_and_clear(&vq->masked_notifier);
+}
+
+/* Mask/unmask events from this vq. */
+void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
+ bool mask)
+{
+ struct VirtQueue *vvq = virtio_get_queue(vdev, n);
+ int r, index = n - hdev->vq_index;
+ struct vhost_vring_file file;
+
+ /* should only be called after backend is connected */
+ assert(hdev->vhost_ops);
+
+ if (mask) {
+ assert(vdev->use_guest_notifier_mask);
+ file.fd = event_notifier_get_wfd(&hdev->vqs[index].masked_notifier);
+ } else {
+ file.fd = event_notifier_get_wfd(virtio_queue_get_guest_notifier(vvq));
+ }
+
+ file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
+ r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
+ if (r < 0) {
+ VHOST_OPS_DEBUG(r, "vhost_set_vring_call failed");
+ }
+}
+
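+/*
+ * Filter @features, clearing every bit listed in @feature_bits that the
+ * backend did not advertise.
+ */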
+uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
+ uint64_t features)
+{
+ const int *bit = feature_bits;
+ while (*bit != VHOST_INVALID_FEATURE_BIT) {
+ uint64_t bit_mask = (1ULL << *bit);
+ if (!(hdev->features & bit_mask)) {
+ features &= ~bit_mask;
+ }
+ bit++;
+ }
+ return features;
+}
+
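+/* Latch into acked_features each bit of @feature_bits set in @features. */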
+void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
+ uint64_t features)
+{
+ const int *bit = feature_bits;
+ while (*bit != VHOST_INVALID_FEATURE_BIT) {
+ uint64_t bit_mask = (1ULL << *bit);
+ if (features & bit_mask) {
+ hdev->acked_features |= bit_mask;
+ }
+ bit++;
+ }
+}
+
+int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config,
+ uint32_t config_len, Error **errp)
+{
+ assert(hdev->vhost_ops);
+
+ if (hdev->vhost_ops->vhost_get_config) {
+ return hdev->vhost_ops->vhost_get_config(hdev, config, config_len,
+ errp);
+ }
+
+ error_setg(errp, "vhost_get_config not implemented");
+ return -ENOSYS;
+}
+
+int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data,
+ uint32_t offset, uint32_t size, uint32_t flags)
+{
+ assert(hdev->vhost_ops);
+
+ if (hdev->vhost_ops->vhost_set_config) {
+ return hdev->vhost_ops->vhost_set_config(hdev, data, offset,
+ size, flags);
+ }
+
+ return -ENOSYS;
+}
+
+void vhost_dev_set_config_notifier(struct vhost_dev *hdev,
+ const VhostDevConfigOps *ops)
+{
+ hdev->config_ops = ops;
+}
+
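+/* Release the memfd backing the inflight region, if one was allocated. */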
+void vhost_dev_free_inflight(struct vhost_inflight *inflight)
+{
+ if (inflight && inflight->addr) {
+ qemu_memfd_free(inflight->addr, inflight->size, inflight->fd);
+ inflight->addr = NULL;
+ inflight->fd = -1;
+ }
+}
+
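+/*
+ * (Re)allocate the inflight region as a sealed memfd of @new_size bytes,
+ * freeing any previous allocation first.
+ */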
+static int vhost_dev_resize_inflight(struct vhost_inflight *inflight,
+ uint64_t new_size)
+{
+ Error *err = NULL;
+ int fd = -1;
+ void *addr = qemu_memfd_alloc("vhost-inflight", new_size,
+ F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
+ &fd, &err);
+
+ if (err) {
+ error_report_err(err);
+ return -ENOMEM;
+ }
+
+ vhost_dev_free_inflight(inflight);
+ inflight->offset = 0;
+ inflight->addr = addr;
+ inflight->fd = fd;
+ inflight->size = new_size;
+
+ return 0;
+}
+
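+/* Serialize the inflight region (size, queue size, contents) for migration. */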
+void vhost_dev_save_inflight(struct vhost_inflight *inflight, QEMUFile *f)
+{
+ if (inflight->addr) {
+ qemu_put_be64(f, inflight->size);
+ qemu_put_be16(f, inflight->queue_size);
+ qemu_put_buffer(f, inflight->addr, inflight->size);
+ } else {
+ qemu_put_be64(f, 0);
+ }
+}
+
+int vhost_dev_load_inflight(struct vhost_inflight *inflight, QEMUFile *f)
+{
+ uint64_t size;
+
+ size = qemu_get_be64(f);
+ if (!size) {
+ return 0;
+ }
+
+ if (inflight->size != size) {
+ int ret = vhost_dev_resize_inflight(inflight, size);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+ inflight->queue_size = qemu_get_be16(f);
+
+ qemu_get_buffer(f, inflight->addr, size);
+
+ return 0;
+}
+
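+/*
+ * Renegotiate features with the backend before an inflight region is
+ * retrieved or restored, so that the backend sees the virtqueue layout
+ * the device will actually use.
+ */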
+int vhost_dev_prepare_inflight(struct vhost_dev *hdev, VirtIODevice *vdev)
+{
+ int r;
+
+ if (hdev->vhost_ops->vhost_get_inflight_fd == NULL ||
+ hdev->vhost_ops->vhost_set_inflight_fd == NULL) {
+ return 0;
+ }
+
+ hdev->vdev = vdev;
+
+ r = vhost_dev_set_features(hdev, hdev->log_enabled);
+ if (r < 0) {
+ VHOST_OPS_DEBUG(r, "vhost_dev_prepare_inflight failed");
+ return r;
+ }
+
+ return 0;
+}
+
+int vhost_dev_set_inflight(struct vhost_dev *dev,
+ struct vhost_inflight *inflight)
+{
+ int r;
+
+ if (dev->vhost_ops->vhost_set_inflight_fd && inflight->addr) {
+ r = dev->vhost_ops->vhost_set_inflight_fd(dev, inflight);
+ if (r) {
+ VHOST_OPS_DEBUG(r, "vhost_set_inflight_fd failed");
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
+ struct vhost_inflight *inflight)
+{
+ int r;
+
+ if (dev->vhost_ops->vhost_get_inflight_fd) {
+ r = dev->vhost_ops->vhost_get_inflight_fd(dev, queue_size, inflight);
+ if (r) {
+ VHOST_OPS_DEBUG(r, "vhost_get_inflight_fd failed");
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable)
+{
+ if (!hdev->vhost_ops->vhost_set_vring_enable) {
+ return 0;
+ }
+
+ /*
+ * For vhost-user devices, if VHOST_USER_F_PROTOCOL_FEATURES has not
+ * been negotiated, the rings start directly in the enabled state, and
+ * .vhost_set_vring_enable callback will fail since
+ * VHOST_USER_SET_VRING_ENABLE is not supported.
+ */
+ if (hdev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER &&
+ !virtio_has_feature(hdev->backend_features,
+ VHOST_USER_F_PROTOCOL_FEATURES)) {
+ return 0;
+ }
+
+ return hdev->vhost_ops->vhost_set_vring_enable(hdev, enable);
+}
+
+/* Host notifiers must be enabled at this point. */
+int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
+{
+ int i, r;
+
+ /* should only be called after backend is connected */
+ assert(hdev->vhost_ops);
+
+ trace_vhost_dev_start(hdev, vdev->name, vrings);
+
+ vdev->vhost_started = true;
+ hdev->started = true;
+ hdev->vdev = vdev;
+
+ r = vhost_dev_set_features(hdev, hdev->log_enabled);
+ if (r < 0) {
+ goto fail_features;
+ }
+
+ if (vhost_dev_has_iommu(hdev)) {
+ memory_listener_register(&hdev->iommu_listener, vdev->dma_as);
+ }
+
+ r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
+ if (r < 0) {
+ VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
+ goto fail_mem;
+ }
+ for (i = 0; i < hdev->nvqs; ++i) {
+ r = vhost_virtqueue_start(hdev,
+ vdev,
+ hdev->vqs + i,
+ hdev->vq_index + i);
+ if (r < 0) {
+ goto fail_vq;
+ }
+ }
+
+ if (hdev->log_enabled) {
+ uint64_t log_base;
+
+ hdev->log_size = vhost_get_log_size(hdev);
+ hdev->log = vhost_log_get(hdev->log_size,
+ vhost_dev_log_is_shared(hdev));
+ log_base = (uintptr_t)hdev->log->log;
+ r = hdev->vhost_ops->vhost_set_log_base(hdev,
+ hdev->log_size ? log_base : 0,
+ hdev->log);
+ if (r < 0) {
+ VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
+ goto fail_log;
+ }
+ }
+ if (vrings) {
+ r = vhost_dev_set_vring_enable(hdev, true);
+ if (r) {
+ goto fail_log;
+ }
+ }
+ if (hdev->vhost_ops->vhost_dev_start) {
+ r = hdev->vhost_ops->vhost_dev_start(hdev, true);
+ if (r) {
+ goto fail_start;
+ }
+ }
+ if (vhost_dev_has_iommu(hdev) &&
+ hdev->vhost_ops->vhost_set_iotlb_callback) {
+ hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);
+
+        /* Update used ring information for IOTLB to work correctly;
+         * the vhost-kernel code requires this. */
+ for (i = 0; i < hdev->nvqs; ++i) {
+ struct vhost_virtqueue *vq = hdev->vqs + i;
+ vhost_device_iotlb_miss(hdev, vq->used_phys, true);
+ }
+ }
+ return 0;
+fail_start:
+ if (vrings) {
+ vhost_dev_set_vring_enable(hdev, false);
+ }
+fail_log:
+ vhost_log_put(hdev, false);
+fail_vq:
+ while (--i >= 0) {
+ vhost_virtqueue_stop(hdev,
+ vdev,
+ hdev->vqs + i,
+ hdev->vq_index + i);
+ }
+
+fail_mem:
+fail_features:
+ vdev->vhost_started = false;
+ hdev->started = false;
+ return r;
+}
+
+/* Host notifiers must be enabled at this point. */
+void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
+{
+ int i;
+
+ /* should only be called after backend is connected */
+ assert(hdev->vhost_ops);
+
+ trace_vhost_dev_stop(hdev, vdev->name, vrings);
+
+ if (hdev->vhost_ops->vhost_dev_start) {
+ hdev->vhost_ops->vhost_dev_start(hdev, false);
+ }
+ if (vrings) {
+ vhost_dev_set_vring_enable(hdev, false);
+ }
+ for (i = 0; i < hdev->nvqs; ++i) {
+ vhost_virtqueue_stop(hdev,
+ vdev,
+ hdev->vqs + i,
+ hdev->vq_index + i);
+ }
+
+ if (vhost_dev_has_iommu(hdev)) {
+ if (hdev->vhost_ops->vhost_set_iotlb_callback) {
+ hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
+ }
+ memory_listener_unregister(&hdev->iommu_listener);
+ }
+ vhost_log_put(hdev, true);
+ hdev->started = false;
+ vdev->vhost_started = false;
+ hdev->vdev = NULL;
+}
+
+int vhost_net_set_backend(struct vhost_dev *hdev,
+ struct vhost_vring_file *file)
+{
+ if (hdev->vhost_ops->vhost_net_set_backend) {
+ return hdev->vhost_ops->vhost_net_set_backend(hdev, file);
+ }
+
+ return -ENOSYS;
+}
diff --git a/hw/virtio/virtio-9p-pci.c b/hw/virtio/virtio-9p-pci.c
new file mode 100644
index 00000000..94c14f0b
--- /dev/null
+++ b/hw/virtio/virtio-9p-pci.c
@@ -0,0 +1,91 @@
+/*
+ * Virtio 9p PCI Bindings
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/virtio/virtio-pci.h"
+#include "hw/9pfs/virtio-9p.h"
+#include "hw/qdev-properties.h"
+#include "qemu/module.h"
+#include "qom/object.h"
+
+/*
+ * virtio-9p-pci: This extends VirtioPCIProxy.
+ */
+
+#define TYPE_VIRTIO_9P_PCI "virtio-9p-pci-base"
+typedef struct V9fsPCIState V9fsPCIState;
+DECLARE_INSTANCE_CHECKER(V9fsPCIState, VIRTIO_9P_PCI,
+ TYPE_VIRTIO_9P_PCI)
+
+struct V9fsPCIState {
+ VirtIOPCIProxy parent_obj;
+ V9fsVirtioState vdev;
+};
+
+static void virtio_9p_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+ V9fsPCIState *dev = VIRTIO_9P_PCI(vpci_dev);
+ DeviceState *vdev = DEVICE(&dev->vdev);
+
+ qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
+}
+
+static Property virtio_9p_pci_properties[] = {
+ DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags,
+ VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true),
+ DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 2),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void virtio_9p_pci_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+ VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+
+ k->realize = virtio_9p_pci_realize;
+ pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+ pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_9P;
+ pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
+ pcidev_k->class_id = 0x2;
+ set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+ device_class_set_props(dc, virtio_9p_pci_properties);
+}
+
+static void virtio_9p_pci_instance_init(Object *obj)
+{
+ V9fsPCIState *dev = VIRTIO_9P_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VIRTIO_9P);
+}
+
+static const VirtioPCIDeviceTypeInfo virtio_9p_pci_info = {
+ .base_name = TYPE_VIRTIO_9P_PCI,
+ .generic_name = "virtio-9p-pci",
+ .transitional_name = "virtio-9p-pci-transitional",
+ .non_transitional_name = "virtio-9p-pci-non-transitional",
+ .instance_size = sizeof(V9fsPCIState),
+ .instance_init = virtio_9p_pci_instance_init,
+ .class_init = virtio_9p_pci_class_init,
+};
+
+static void virtio_9p_pci_register(void)
+{
+ virtio_pci_types_register(&virtio_9p_pci_info);
+}
+
+type_init(virtio_9p_pci_register)
diff --git a/hw/virtio/virtio-balloon-pci.c b/hw/virtio/virtio-balloon-pci.c
new file mode 100644
index 00000000..ce2645ba
--- /dev/null
+++ b/hw/virtio/virtio-balloon-pci.c
@@ -0,0 +1,88 @@
+/*
+ * Virtio balloon PCI Bindings
+ *
+ * Copyright IBM, Corp. 2007
+ * Copyright (c) 2009 CodeSourcery
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Paul Brook <paul@codesourcery.com>
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/virtio/virtio-pci.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/virtio-balloon.h"
+#include "qapi/error.h"
+#include "qemu/module.h"
+#include "qom/object.h"
+
+typedef struct VirtIOBalloonPCI VirtIOBalloonPCI;
+
+/*
+ * virtio-balloon-pci: This extends VirtioPCIProxy.
+ */
+#define TYPE_VIRTIO_BALLOON_PCI "virtio-balloon-pci-base"
+DECLARE_INSTANCE_CHECKER(VirtIOBalloonPCI, VIRTIO_BALLOON_PCI,
+ TYPE_VIRTIO_BALLOON_PCI)
+
+struct VirtIOBalloonPCI {
+ VirtIOPCIProxy parent_obj;
+ VirtIOBalloon vdev;
+};
+
+static void virtio_balloon_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+ VirtIOBalloonPCI *dev = VIRTIO_BALLOON_PCI(vpci_dev);
+ DeviceState *vdev = DEVICE(&dev->vdev);
+
+ vpci_dev->class_code = PCI_CLASS_OTHERS;
+ qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
+}
+
+static void virtio_balloon_pci_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+ PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+ k->realize = virtio_balloon_pci_realize;
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+ pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+ pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_BALLOON;
+ pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
+ pcidev_k->class_id = PCI_CLASS_OTHERS;
+}
+
+static void virtio_balloon_pci_instance_init(Object *obj)
+{
+ VirtIOBalloonPCI *dev = VIRTIO_BALLOON_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VIRTIO_BALLOON);
+ object_property_add_alias(obj, "guest-stats", OBJECT(&dev->vdev),
+ "guest-stats");
+ object_property_add_alias(obj, "guest-stats-polling-interval",
+ OBJECT(&dev->vdev),
+ "guest-stats-polling-interval");
+}
+
+static const VirtioPCIDeviceTypeInfo virtio_balloon_pci_info = {
+ .base_name = TYPE_VIRTIO_BALLOON_PCI,
+ .generic_name = "virtio-balloon-pci",
+ .transitional_name = "virtio-balloon-pci-transitional",
+ .non_transitional_name = "virtio-balloon-pci-non-transitional",
+ .instance_size = sizeof(VirtIOBalloonPCI),
+ .instance_init = virtio_balloon_pci_instance_init,
+ .class_init = virtio_balloon_pci_class_init,
+};
+
+static void virtio_balloon_pci_register(void)
+{
+ virtio_pci_types_register(&virtio_balloon_pci_info);
+}
+
+type_init(virtio_balloon_pci_register)
diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
new file mode 100644
index 00000000..73ac5eb6
--- /dev/null
+++ b/hw/virtio/virtio-balloon.c
@@ -0,0 +1,1079 @@
+/*
+ * Virtio Balloon Device
+ *
+ * Copyright IBM, Corp. 2008
+ * Copyright (C) 2011 Red Hat, Inc.
+ * Copyright (C) 2011 Amit Shah <amit.shah@redhat.com>
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/iov.h"
+#include "qemu/module.h"
+#include "qemu/timer.h"
+#include "qemu/madvise.h"
+#include "hw/virtio/virtio.h"
+#include "hw/mem/pc-dimm.h"
+#include "hw/qdev-properties.h"
+#include "hw/boards.h"
+#include "sysemu/balloon.h"
+#include "hw/virtio/virtio-balloon.h"
+#include "exec/address-spaces.h"
+#include "qapi/error.h"
+#include "qapi/qapi-events-machine.h"
+#include "qapi/visitor.h"
+#include "trace.h"
+#include "qemu/error-report.h"
+#include "migration/misc.h"
+#include "migration/migration.h"
+
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/virtio-access.h"
+
+#define BALLOON_PAGE_SIZE (1 << VIRTIO_BALLOON_PFN_SHIFT)
+
+typedef struct PartiallyBalloonedPage {
+ ram_addr_t base_gpa;
+ unsigned long *bitmap;
+} PartiallyBalloonedPage;
+
+static void virtio_balloon_pbp_free(PartiallyBalloonedPage *pbp)
+{
+ if (!pbp->bitmap) {
+ return;
+ }
+ g_free(pbp->bitmap);
+ pbp->bitmap = NULL;
+}
+
+static void virtio_balloon_pbp_alloc(PartiallyBalloonedPage *pbp,
+ ram_addr_t base_gpa,
+ long subpages)
+{
+ pbp->base_gpa = base_gpa;
+ pbp->bitmap = bitmap_new(subpages);
+}
+
+static bool virtio_balloon_pbp_matches(PartiallyBalloonedPage *pbp,
+ ram_addr_t base_gpa)
+{
+ return pbp->base_gpa == base_gpa;
+}
+
+static bool virtio_balloon_inhibited(void)
+{
+ /*
+     * Postcopy cannot deal with concurrent discards,
+     * and neither can background snapshots.
+ */
+ return ram_block_discard_is_disabled() || migration_in_incoming_postcopy() ||
+ migration_in_bg_snapshot();
+}
+
+static void balloon_inflate_page(VirtIOBalloon *balloon,
+ MemoryRegion *mr, hwaddr mr_offset,
+ PartiallyBalloonedPage *pbp)
+{
+ void *addr = memory_region_get_ram_ptr(mr) + mr_offset;
+ ram_addr_t rb_offset, rb_aligned_offset, base_gpa;
+ RAMBlock *rb;
+ size_t rb_page_size;
+ int subpages;
+
+ /* XXX is there a better way to get to the RAMBlock than via a
+ * host address? */
+ rb = qemu_ram_block_from_host(addr, false, &rb_offset);
+ rb_page_size = qemu_ram_pagesize(rb);
+
+ if (rb_page_size == BALLOON_PAGE_SIZE) {
+ /* Easy case */
+
+ ram_block_discard_range(rb, rb_offset, rb_page_size);
+ /* We ignore errors from ram_block_discard_range(), because it
+ * has already reported them, and failing to discard a balloon
+ * page is not fatal */
+ return;
+ }
+
+ /* Hard case
+ *
+ * We've put a piece of a larger host page into the balloon - we
+ * need to keep track until we have a whole host page to
+ * discard
+ */
+    warn_report_once("Balloon used with backing page size > 4kiB, "
+                     "this may not be reliable");
+
+ rb_aligned_offset = QEMU_ALIGN_DOWN(rb_offset, rb_page_size);
+ subpages = rb_page_size / BALLOON_PAGE_SIZE;
+ base_gpa = memory_region_get_ram_addr(mr) + mr_offset -
+ (rb_offset - rb_aligned_offset);
+
+ if (pbp->bitmap && !virtio_balloon_pbp_matches(pbp, base_gpa)) {
+ /* We've partially ballooned part of a host page, but now
+ * we're trying to balloon part of a different one. Too hard,
+ * give up on the old partial page */
+ virtio_balloon_pbp_free(pbp);
+ }
+
+ if (!pbp->bitmap) {
+ virtio_balloon_pbp_alloc(pbp, base_gpa, subpages);
+ }
+
+ set_bit((rb_offset - rb_aligned_offset) / BALLOON_PAGE_SIZE,
+ pbp->bitmap);
+
+ if (bitmap_full(pbp->bitmap, subpages)) {
+ /* We've accumulated a full host page, we can actually discard
+ * it now */
+
+ ram_block_discard_range(rb, rb_aligned_offset, rb_page_size);
+ /* We ignore errors from ram_block_discard_range(), because it
+ * has already reported them, and failing to discard a balloon
+ * page is not fatal */
+ virtio_balloon_pbp_free(pbp);
+ }
+}
+
+static void balloon_deflate_page(VirtIOBalloon *balloon,
+ MemoryRegion *mr, hwaddr mr_offset)
+{
+ void *addr = memory_region_get_ram_ptr(mr) + mr_offset;
+ ram_addr_t rb_offset;
+ RAMBlock *rb;
+ size_t rb_page_size;
+ void *host_addr;
+ int ret;
+
+ /* XXX is there a better way to get to the RAMBlock than via a
+ * host address? */
+ rb = qemu_ram_block_from_host(addr, false, &rb_offset);
+ rb_page_size = qemu_ram_pagesize(rb);
+
+ host_addr = (void *)((uintptr_t)addr & ~(rb_page_size - 1));
+
+ /* When a page is deflated, we hint the whole host page it lives
+ * on, since we can't do anything smaller */
+ ret = qemu_madvise(host_addr, rb_page_size, QEMU_MADV_WILLNEED);
+ if (ret != 0) {
+ warn_report("Couldn't MADV_WILLNEED on balloon deflate: %s",
+ strerror(errno));
+ /* Otherwise ignore, failing to page hint shouldn't be fatal */
+ }
+}
+
+static const char *balloon_stat_names[] = {
+ [VIRTIO_BALLOON_S_SWAP_IN] = "stat-swap-in",
+ [VIRTIO_BALLOON_S_SWAP_OUT] = "stat-swap-out",
+ [VIRTIO_BALLOON_S_MAJFLT] = "stat-major-faults",
+ [VIRTIO_BALLOON_S_MINFLT] = "stat-minor-faults",
+ [VIRTIO_BALLOON_S_MEMFREE] = "stat-free-memory",
+ [VIRTIO_BALLOON_S_MEMTOT] = "stat-total-memory",
+ [VIRTIO_BALLOON_S_AVAIL] = "stat-available-memory",
+ [VIRTIO_BALLOON_S_CACHES] = "stat-disk-caches",
+ [VIRTIO_BALLOON_S_HTLB_PGALLOC] = "stat-htlb-pgalloc",
+ [VIRTIO_BALLOON_S_HTLB_PGFAIL] = "stat-htlb-pgfail",
+ [VIRTIO_BALLOON_S_NR] = NULL
+};
+
+/*
+ * reset_stats - Mark all items in the stats array as unset
+ *
+ * This function needs to be called at device initialization and before
+ * updating to a set of newly-generated stats. This will ensure that no
+ * stale values stick around in case the guest reports a subset of the supported
+ * statistics.
+ */
+static inline void reset_stats(VirtIOBalloon *dev)
+{
+ int i;
+    for (i = 0; i < VIRTIO_BALLOON_S_NR; i++) {
+        dev->stats[i] = -1;
+    }
+}
+
+static bool balloon_stats_supported(const VirtIOBalloon *s)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(s);
+ return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ);
+}
+
+static bool balloon_stats_enabled(const VirtIOBalloon *s)
+{
+ return s->stats_poll_interval > 0;
+}
+
+static void balloon_stats_destroy_timer(VirtIOBalloon *s)
+{
+ if (balloon_stats_enabled(s)) {
+ timer_free(s->stats_timer);
+ s->stats_timer = NULL;
+ s->stats_poll_interval = 0;
+ }
+}
+
+static void balloon_stats_change_timer(VirtIOBalloon *s, int64_t secs)
+{
+ timer_mod(s->stats_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + secs * 1000);
+}
+
+static void balloon_stats_poll_cb(void *opaque)
+{
+ VirtIOBalloon *s = opaque;
+ VirtIODevice *vdev = VIRTIO_DEVICE(s);
+
+ if (s->stats_vq_elem == NULL || !balloon_stats_supported(s)) {
+ /* re-schedule */
+ balloon_stats_change_timer(s, s->stats_poll_interval);
+ return;
+ }
+
+ virtqueue_push(s->svq, s->stats_vq_elem, 0);
+ virtio_notify(vdev, s->svq);
+ g_free(s->stats_vq_elem);
+ s->stats_vq_elem = NULL;
+}
+
+static void balloon_stats_get_all(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ Error *err = NULL;
+ VirtIOBalloon *s = VIRTIO_BALLOON(obj);
+ int i;
+
+ if (!visit_start_struct(v, name, NULL, 0, &err)) {
+ goto out;
+ }
+ if (!visit_type_int(v, "last-update", &s->stats_last_update, &err)) {
+ goto out_end;
+ }
+
+ if (!visit_start_struct(v, "stats", NULL, 0, &err)) {
+ goto out_end;
+ }
+ for (i = 0; i < VIRTIO_BALLOON_S_NR; i++) {
+ if (!visit_type_uint64(v, balloon_stat_names[i], &s->stats[i], &err)) {
+ goto out_nested;
+ }
+ }
+ visit_check_struct(v, &err);
+out_nested:
+ visit_end_struct(v, NULL);
+
+ if (!err) {
+ visit_check_struct(v, &err);
+ }
+out_end:
+ visit_end_struct(v, NULL);
+out:
+ error_propagate(errp, err);
+}
+
+static void balloon_stats_get_poll_interval(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ VirtIOBalloon *s = VIRTIO_BALLOON(obj);
+ visit_type_int(v, name, &s->stats_poll_interval, errp);
+}
+
+static void balloon_stats_set_poll_interval(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ VirtIOBalloon *s = VIRTIO_BALLOON(obj);
+ int64_t value;
+
+ if (!visit_type_int(v, name, &value, errp)) {
+ return;
+ }
+
+ if (value < 0) {
+ error_setg(errp, "timer value must be greater than zero");
+ return;
+ }
+
+ if (value > UINT32_MAX) {
+ error_setg(errp, "timer value is too big");
+ return;
+ }
+
+ if (value == s->stats_poll_interval) {
+ return;
+ }
+
+ if (value == 0) {
+ /* timer=0 disables the timer */
+ balloon_stats_destroy_timer(s);
+ return;
+ }
+
+ if (balloon_stats_enabled(s)) {
+ /* timer interval change */
+ s->stats_poll_interval = value;
+ balloon_stats_change_timer(s, value);
+ return;
+ }
+
+ /* create a new timer */
+ g_assert(s->stats_timer == NULL);
+ s->stats_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, balloon_stats_poll_cb, s);
+ s->stats_poll_interval = value;
+ balloon_stats_change_timer(s, 0);
+}
+
+static void virtio_balloon_handle_report(VirtIODevice *vdev, VirtQueue *vq)
+{
+ VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
+ VirtQueueElement *elem;
+
+ while ((elem = virtqueue_pop(vq, sizeof(VirtQueueElement)))) {
+ unsigned int i;
+
+ /*
+ * When we discard the page it has the effect of removing the page
+ * from the hypervisor itself and causing it to be zeroed when it
+ * is returned to us. So we must not discard the page if it is
+ * accessible by another device or process, or if the guest is
+ * expecting it to retain a non-zero value.
+ */
+ if (virtio_balloon_inhibited() || dev->poison_val) {
+ goto skip_element;
+ }
+
+ for (i = 0; i < elem->in_num; i++) {
+ void *addr = elem->in_sg[i].iov_base;
+ size_t size = elem->in_sg[i].iov_len;
+ ram_addr_t ram_offset;
+ RAMBlock *rb;
+
+ /*
+ * There is no need to check the memory section to see if
+ * it is ram/readonly/romd like there is for handle_output
+ * below. If the region is not meant to be written to then
+ * address_space_map will have allocated a bounce buffer
+ * and it will be freed in address_space_unmap and trigger
+             * an unassigned_mem_write before failing to copy over the
+ * buffer. If more than one bad descriptor is provided it
+ * will return NULL after the first bounce buffer and fail
+ * to map any resources.
+ */
+ rb = qemu_ram_block_from_host(addr, false, &ram_offset);
+ if (!rb) {
+ trace_virtio_balloon_bad_addr(elem->in_addr[i]);
+ continue;
+ }
+
+ /*
+ * For now we will simply ignore unaligned memory regions, or
+ * regions that overrun the end of the RAMBlock.
+ */
+ if (!QEMU_IS_ALIGNED(ram_offset | size, qemu_ram_pagesize(rb)) ||
+ (ram_offset + size) > qemu_ram_get_used_length(rb)) {
+ continue;
+ }
+
+ ram_block_discard_range(rb, ram_offset, size);
+ }
+
+skip_element:
+ virtqueue_push(vq, elem, 0);
+ virtio_notify(vdev, vq);
+ g_free(elem);
+ }
+}
+
+static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
+{
+ VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
+ VirtQueueElement *elem;
+ MemoryRegionSection section;
+
+ for (;;) {
+ PartiallyBalloonedPage pbp = {};
+ size_t offset = 0;
+ uint32_t pfn;
+
+ elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
+ if (!elem) {
+ break;
+ }
+
+ while (iov_to_buf(elem->out_sg, elem->out_num, offset, &pfn, 4) == 4) {
+ unsigned int p = virtio_ldl_p(vdev, &pfn);
+ hwaddr pa;
+
+ pa = (hwaddr) p << VIRTIO_BALLOON_PFN_SHIFT;
+ offset += 4;
+
+ section = memory_region_find(get_system_memory(), pa,
+ BALLOON_PAGE_SIZE);
+ if (!section.mr) {
+ trace_virtio_balloon_bad_addr(pa);
+ continue;
+ }
+ if (!memory_region_is_ram(section.mr) ||
+ memory_region_is_rom(section.mr) ||
+ memory_region_is_romd(section.mr)) {
+ trace_virtio_balloon_bad_addr(pa);
+ memory_region_unref(section.mr);
+ continue;
+ }
+
+ trace_virtio_balloon_handle_output(memory_region_name(section.mr),
+ pa);
+ if (!virtio_balloon_inhibited()) {
+ if (vq == s->ivq) {
+ balloon_inflate_page(s, section.mr,
+ section.offset_within_region, &pbp);
+ } else if (vq == s->dvq) {
+ balloon_deflate_page(s, section.mr, section.offset_within_region);
+ } else {
+ g_assert_not_reached();
+ }
+ }
+ memory_region_unref(section.mr);
+ }
+
+ virtqueue_push(vq, elem, 0);
+ virtio_notify(vdev, vq);
+ g_free(elem);
+ virtio_balloon_pbp_free(&pbp);
+ }
+}
+
+static void virtio_balloon_receive_stats(VirtIODevice *vdev, VirtQueue *vq)
+{
+ VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
+ VirtQueueElement *elem;
+ VirtIOBalloonStat stat;
+ size_t offset = 0;
+
+ elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
+ if (!elem) {
+ goto out;
+ }
+
+ if (s->stats_vq_elem != NULL) {
+ /* This should never happen if the driver follows the spec. */
+ virtqueue_push(vq, s->stats_vq_elem, 0);
+ virtio_notify(vdev, vq);
+ g_free(s->stats_vq_elem);
+ }
+
+ s->stats_vq_elem = elem;
+
+ /* Initialize the stats to get rid of any stale values. This is only
+ * needed to handle the case where a guest supports fewer stats than it
+     * used to (i.e. it has booted into an old kernel).
+ */
+ reset_stats(s);
+
+ while (iov_to_buf(elem->out_sg, elem->out_num, offset, &stat, sizeof(stat))
+ == sizeof(stat)) {
+ uint16_t tag = virtio_tswap16(vdev, stat.tag);
+ uint64_t val = virtio_tswap64(vdev, stat.val);
+
+ offset += sizeof(stat);
+        if (tag < VIRTIO_BALLOON_S_NR) {
+            s->stats[tag] = val;
+        }
+ }
+ s->stats_vq_offset = offset;
+ s->stats_last_update = g_get_real_time() / G_USEC_PER_SEC;
+
+out:
+ if (balloon_stats_enabled(s)) {
+ balloon_stats_change_timer(s, s->stats_poll_interval);
+ }
+}
+
+static void virtio_balloon_handle_free_page_vq(VirtIODevice *vdev,
+ VirtQueue *vq)
+{
+ VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
+ qemu_bh_schedule(s->free_page_bh);
+}
+
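+/*
+ * Pop one element from the free page vq: an outgoing buffer carries a
+ * command id that advances the hinting state machine, while incoming
+ * buffers are forwarded to migration as free page hints. Returns false
+ * if the vq was empty or a malformed command id was received. Called
+ * with free_page_lock held.
+ */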
+static bool get_free_page_hints(VirtIOBalloon *dev)
+{
+ VirtQueueElement *elem;
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VirtQueue *vq = dev->free_page_vq;
+ bool ret = true;
+ int i;
+
+ while (dev->block_iothread) {
+ qemu_cond_wait(&dev->free_page_cond, &dev->free_page_lock);
+ }
+
+ elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
+ if (!elem) {
+ return false;
+ }
+
+ if (elem->out_num) {
+ uint32_t id;
+ size_t size = iov_to_buf(elem->out_sg, elem->out_num, 0,
+ &id, sizeof(id));
+
+ virtio_tswap32s(vdev, &id);
+ if (unlikely(size != sizeof(id))) {
+ virtio_error(vdev, "received an incorrect cmd id");
+ ret = false;
+ goto out;
+ }
+ if (dev->free_page_hint_status == FREE_PAGE_HINT_S_REQUESTED &&
+ id == dev->free_page_hint_cmd_id) {
+ dev->free_page_hint_status = FREE_PAGE_HINT_S_START;
+ } else if (dev->free_page_hint_status == FREE_PAGE_HINT_S_START) {
+ /*
+ * Stop the optimization only when it has started. This
+ * avoids a stale stop sign for the previous command.
+ */
+ dev->free_page_hint_status = FREE_PAGE_HINT_S_STOP;
+ }
+ }
+
+ if (elem->in_num && dev->free_page_hint_status == FREE_PAGE_HINT_S_START) {
+ for (i = 0; i < elem->in_num; i++) {
+ qemu_guest_free_page_hint(elem->in_sg[i].iov_base,
+ elem->in_sg[i].iov_len);
+ }
+ }
+
+out:
+ virtqueue_push(vq, elem, 0);
+ g_free(elem);
+ return ret;
+}
+
+static void virtio_balloon_get_free_page_hints(void *opaque)
+{
+ VirtIOBalloon *dev = opaque;
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VirtQueue *vq = dev->free_page_vq;
+ bool continue_to_get_hints;
+
+ do {
+ qemu_mutex_lock(&dev->free_page_lock);
+ virtio_queue_set_notification(vq, 0);
+ continue_to_get_hints = get_free_page_hints(dev);
+ qemu_mutex_unlock(&dev->free_page_lock);
+ virtio_notify(vdev, vq);
+ /*
+         * Keep polling the vq while hinting is in progress. Otherwise,
+         * continue only while there are entries on the vq that need to be
+         * given back.
+ */
+ } while (continue_to_get_hints ||
+ dev->free_page_hint_status == FREE_PAGE_HINT_S_START);
+ virtio_queue_set_notification(vq, 1);
+}
+
+static bool virtio_balloon_free_page_support(void *opaque)
+{
+ VirtIOBalloon *s = opaque;
+ VirtIODevice *vdev = VIRTIO_DEVICE(s);
+
+ return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT);
+}
+
+static void virtio_balloon_free_page_start(VirtIOBalloon *s)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(s);
+
+ qemu_mutex_lock(&s->free_page_lock);
+
+ if (s->free_page_hint_cmd_id == UINT_MAX) {
+ s->free_page_hint_cmd_id = VIRTIO_BALLOON_FREE_PAGE_HINT_CMD_ID_MIN;
+ } else {
+ s->free_page_hint_cmd_id++;
+ }
+
+ s->free_page_hint_status = FREE_PAGE_HINT_S_REQUESTED;
+ qemu_mutex_unlock(&s->free_page_lock);
+
+ virtio_notify_config(vdev);
+}
+
+static void virtio_balloon_free_page_stop(VirtIOBalloon *s)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(s);
+
+ if (s->free_page_hint_status != FREE_PAGE_HINT_S_STOP) {
+ /*
+         * The lock also guarantees that virtio_balloon_get_free_page_hints()
+         * exits after free_page_hint_status has been set to
+         * FREE_PAGE_HINT_S_STOP.
+ */
+ qemu_mutex_lock(&s->free_page_lock);
+ /*
+ * The guest isn't done hinting, so send a notification
+ * to the guest to actively stop the hinting.
+ */
+ s->free_page_hint_status = FREE_PAGE_HINT_S_STOP;
+ qemu_mutex_unlock(&s->free_page_lock);
+ virtio_notify_config(vdev);
+ }
+}
+
+static void virtio_balloon_free_page_done(VirtIOBalloon *s)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(s);
+
+ if (s->free_page_hint_status != FREE_PAGE_HINT_S_DONE) {
+ /* See virtio_balloon_free_page_stop() */
+ qemu_mutex_lock(&s->free_page_lock);
+ s->free_page_hint_status = FREE_PAGE_HINT_S_DONE;
+ qemu_mutex_unlock(&s->free_page_lock);
+ virtio_notify_config(vdev);
+ }
+}
+
+static int
+virtio_balloon_free_page_hint_notify(NotifierWithReturn *n, void *data)
+{
+ VirtIOBalloon *dev = container_of(n, VirtIOBalloon, free_page_hint_notify);
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ PrecopyNotifyData *pnd = data;
+
+ if (!virtio_balloon_free_page_support(dev)) {
+ /*
+         * This is an optimization provided to migration, so just return 0
+         * so that the normal migration process is not affected when this
+         * feature is not supported.
+ */
+ return 0;
+ }
+
+ /*
+ * Pages hinted via qemu_guest_free_page_hint() are cleared from the dirty
+     * bitmap and will not get migrated, in particular not even when the
+     * postcopy destination starts using them and requests migration from the
+     * source; the faulting thread would stall until postcopy migration
+     * finishes and all threads are woken up. So don't start free page
+     * hinting if postcopy is possible.
+ */
+ if (migrate_postcopy_ram()) {
+ return 0;
+ }
+
+ switch (pnd->reason) {
+ case PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC:
+ virtio_balloon_free_page_stop(dev);
+ break;
+ case PRECOPY_NOTIFY_AFTER_BITMAP_SYNC:
+ if (vdev->vm_running) {
+ virtio_balloon_free_page_start(dev);
+ break;
+ }
+ /*
+ * Set S_DONE before migrating the vmstate, so the guest will reuse
+ * all hinted pages once running on the destination. Fall through.
+ */
+ case PRECOPY_NOTIFY_CLEANUP:
+ /*
+ * Especially, if something goes wrong during precopy or if migration
+ * is canceled, we have to properly communicate S_DONE to the VM.
+ */
+ virtio_balloon_free_page_done(dev);
+ break;
+ case PRECOPY_NOTIFY_SETUP:
+ case PRECOPY_NOTIFY_COMPLETE:
+ break;
+ default:
+ virtio_error(vdev, "%s: %d reason unknown", __func__, pnd->reason);
+ }
+
+ return 0;
+}
+
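+/*
+ * The config space size depends on the features offered, except for
+ * machine types that keep the QEMU 4.0 quirk of always using the full
+ * structure.
+ */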
+static size_t virtio_balloon_config_size(VirtIOBalloon *s)
+{
+ uint64_t features = s->host_features;
+
+ if (s->qemu_4_0_config_size) {
+ return sizeof(struct virtio_balloon_config);
+ }
+ if (virtio_has_feature(features, VIRTIO_BALLOON_F_PAGE_POISON)) {
+ return sizeof(struct virtio_balloon_config);
+ }
+ if (virtio_has_feature(features, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
+ return offsetof(struct virtio_balloon_config, poison_val);
+ }
+ return offsetof(struct virtio_balloon_config, free_page_hint_cmd_id);
+}
+
+static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data)
+{
+ VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
+ struct virtio_balloon_config config = {};
+
+ config.num_pages = cpu_to_le32(dev->num_pages);
+ config.actual = cpu_to_le32(dev->actual);
+ config.poison_val = cpu_to_le32(dev->poison_val);
+
+ if (dev->free_page_hint_status == FREE_PAGE_HINT_S_REQUESTED) {
+ config.free_page_hint_cmd_id =
+ cpu_to_le32(dev->free_page_hint_cmd_id);
+ } else if (dev->free_page_hint_status == FREE_PAGE_HINT_S_STOP) {
+ config.free_page_hint_cmd_id =
+ cpu_to_le32(VIRTIO_BALLOON_CMD_ID_STOP);
+ } else if (dev->free_page_hint_status == FREE_PAGE_HINT_S_DONE) {
+ config.free_page_hint_cmd_id =
+ cpu_to_le32(VIRTIO_BALLOON_CMD_ID_DONE);
+ }
+
+ trace_virtio_balloon_get_config(config.num_pages, config.actual);
+ memcpy(config_data, &config, virtio_balloon_config_size(dev));
+}
+
+static int build_dimm_list(Object *obj, void *opaque)
+{
+ GSList **list = opaque;
+
+ if (object_dynamic_cast(obj, TYPE_PC_DIMM)) {
+ DeviceState *dev = DEVICE(obj);
+ if (dev->realized) { /* only realized DIMMs matter */
+ *list = g_slist_prepend(*list, dev);
+ }
+ }
+
+ object_child_foreach(obj, build_dimm_list, opaque);
+ return 0;
+}
+
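+/* Base machine RAM plus the size of every realized PC-DIMM. */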
+static ram_addr_t get_current_ram_size(void)
+{
+ GSList *list = NULL, *item;
+ ram_addr_t size = current_machine->ram_size;
+
+ build_dimm_list(qdev_get_machine(), &list);
+ for (item = list; item; item = g_slist_next(item)) {
+ Object *obj = OBJECT(item->data);
+ if (!strcmp(object_get_typename(obj), TYPE_PC_DIMM)) {
+ size += object_property_get_int(obj, PC_DIMM_SIZE_PROP,
+ &error_abort);
+ }
+ }
+ g_slist_free(list);
+
+ return size;
+}
+
+static bool virtio_balloon_page_poison_support(void *opaque)
+{
+ VirtIOBalloon *s = opaque;
+ VirtIODevice *vdev = VIRTIO_DEVICE(s);
+
+ return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON);
+}
+
+static void virtio_balloon_set_config(VirtIODevice *vdev,
+ const uint8_t *config_data)
+{
+ VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
+ struct virtio_balloon_config config;
+ uint32_t oldactual = dev->actual;
+ ram_addr_t vm_ram_size = get_current_ram_size();
+
+ memcpy(&config, config_data, virtio_balloon_config_size(dev));
+ dev->actual = le32_to_cpu(config.actual);
+ if (dev->actual != oldactual) {
+ qapi_event_send_balloon_change(vm_ram_size -
+ ((ram_addr_t) dev->actual << VIRTIO_BALLOON_PFN_SHIFT));
+ }
+ dev->poison_val = 0;
+ if (virtio_balloon_page_poison_support(dev)) {
+ dev->poison_val = le32_to_cpu(config.poison_val);
+ }
+ trace_virtio_balloon_set_config(dev->actual, oldactual);
+}
+
+static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f,
+ Error **errp)
+{
+ VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
+ f |= dev->host_features;
+ virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ);
+
+ return f;
+}
+
+static void virtio_balloon_stat(void *opaque, BalloonInfo *info)
+{
+ VirtIOBalloon *dev = opaque;
+ info->actual = get_current_ram_size() - ((uint64_t) dev->actual <<
+ VIRTIO_BALLOON_PFN_SHIFT);
+}
+
+static void virtio_balloon_to_target(void *opaque, ram_addr_t target)
+{
+ VirtIOBalloon *dev = VIRTIO_BALLOON(opaque);
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ ram_addr_t vm_ram_size = get_current_ram_size();
+
+ if (target > vm_ram_size) {
+ target = vm_ram_size;
+ }
+ if (target) {
+ dev->num_pages = (vm_ram_size - target) >> VIRTIO_BALLOON_PFN_SHIFT;
+ virtio_notify_config(vdev);
+ }
+ trace_virtio_balloon_to_target(target, dev->num_pages);
+}
+
+static int virtio_balloon_post_load_device(void *opaque, int version_id)
+{
+ VirtIOBalloon *s = VIRTIO_BALLOON(opaque);
+
+ if (balloon_stats_enabled(s)) {
+ balloon_stats_change_timer(s, s->stats_poll_interval);
+ }
+ return 0;
+}
+
+static const VMStateDescription vmstate_virtio_balloon_free_page_hint = {
+ .name = "virtio-balloon-device/free-page-report",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = virtio_balloon_free_page_support,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT32(free_page_hint_cmd_id, VirtIOBalloon),
+ VMSTATE_UINT32(free_page_hint_status, VirtIOBalloon),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription vmstate_virtio_balloon_page_poison = {
+ .name = "virtio-balloon-device/page-poison",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = virtio_balloon_page_poison_support,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT32(poison_val, VirtIOBalloon),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription vmstate_virtio_balloon_device = {
+ .name = "virtio-balloon-device",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .post_load = virtio_balloon_post_load_device,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT32(num_pages, VirtIOBalloon),
+ VMSTATE_UINT32(actual, VirtIOBalloon),
+ VMSTATE_END_OF_LIST()
+ },
+ .subsections = (const VMStateDescription * []) {
+ &vmstate_virtio_balloon_free_page_hint,
+ &vmstate_virtio_balloon_page_poison,
+ NULL
+ }
+};
+
+static void virtio_balloon_device_realize(DeviceState *dev, Error **errp)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VirtIOBalloon *s = VIRTIO_BALLOON(dev);
+ int ret;
+
+ virtio_init(vdev, VIRTIO_ID_BALLOON, virtio_balloon_config_size(s));
+
+ ret = qemu_add_balloon_handler(virtio_balloon_to_target,
+ virtio_balloon_stat, s);
+
+ if (ret < 0) {
+ error_setg(errp, "Only one balloon device is supported");
+ virtio_cleanup(vdev);
+ return;
+ }
+
+ if (virtio_has_feature(s->host_features, VIRTIO_BALLOON_F_FREE_PAGE_HINT) &&
+ !s->iothread) {
+ error_setg(errp, "'free-page-hint' requires 'iothread' to be set");
+ virtio_cleanup(vdev);
+ return;
+ }
+
+ s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
+ s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
+ s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats);
+
+ if (virtio_has_feature(s->host_features, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
+ s->free_page_vq = virtio_add_queue(vdev, VIRTQUEUE_MAX_SIZE,
+ virtio_balloon_handle_free_page_vq);
+ precopy_add_notifier(&s->free_page_hint_notify);
+
+ object_ref(OBJECT(s->iothread));
+        s->free_page_bh = aio_bh_new(iothread_get_aio_context(s->iothread),
+                                     virtio_balloon_get_free_page_hints, s);
+ }
+
+ if (virtio_has_feature(s->host_features, VIRTIO_BALLOON_F_REPORTING)) {
+ s->reporting_vq = virtio_add_queue(vdev, 32,
+ virtio_balloon_handle_report);
+ }
+
+ reset_stats(s);
+}
+
+static void virtio_balloon_device_unrealize(DeviceState *dev)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VirtIOBalloon *s = VIRTIO_BALLOON(dev);
+
+ if (s->free_page_bh) {
+ qemu_bh_delete(s->free_page_bh);
+ object_unref(OBJECT(s->iothread));
+ virtio_balloon_free_page_stop(s);
+ precopy_remove_notifier(&s->free_page_hint_notify);
+ }
+ balloon_stats_destroy_timer(s);
+ qemu_remove_balloon_handler(s);
+
+ virtio_delete_queue(s->ivq);
+ virtio_delete_queue(s->dvq);
+ virtio_delete_queue(s->svq);
+ if (s->free_page_vq) {
+ virtio_delete_queue(s->free_page_vq);
+ }
+ if (s->reporting_vq) {
+ virtio_delete_queue(s->reporting_vq);
+ }
+ virtio_cleanup(vdev);
+}
+
+static void virtio_balloon_device_reset(VirtIODevice *vdev)
+{
+ VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
+
+ if (virtio_balloon_free_page_support(s)) {
+ virtio_balloon_free_page_stop(s);
+ }
+
+ if (s->stats_vq_elem != NULL) {
+ virtqueue_unpop(s->svq, s->stats_vq_elem, 0);
+ g_free(s->stats_vq_elem);
+ s->stats_vq_elem = NULL;
+ }
+
+ s->poison_val = 0;
+}
+
+static void virtio_balloon_set_status(VirtIODevice *vdev, uint8_t status)
+{
+ VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
+
+ if (!s->stats_vq_elem && vdev->vm_running &&
+ (status & VIRTIO_CONFIG_S_DRIVER_OK) && virtqueue_rewind(s->svq, 1)) {
+        /* Poll the stats queue for the element we discarded while the VM
+         * was stopped. */
+ virtio_balloon_receive_stats(vdev, s->svq);
+ }
+
+ if (virtio_balloon_free_page_support(s)) {
+ /*
+ * The VM is woken up and the iothread was blocked, so signal it to
+ * continue.
+ */
+ if (vdev->vm_running && s->block_iothread) {
+ qemu_mutex_lock(&s->free_page_lock);
+ s->block_iothread = false;
+ qemu_cond_signal(&s->free_page_cond);
+ qemu_mutex_unlock(&s->free_page_lock);
+ }
+
+ /* The VM is stopped, block the iothread. */
+ if (!vdev->vm_running) {
+ qemu_mutex_lock(&s->free_page_lock);
+ s->block_iothread = true;
+ qemu_mutex_unlock(&s->free_page_lock);
+ }
+ }
+}
+
+static void virtio_balloon_instance_init(Object *obj)
+{
+ VirtIOBalloon *s = VIRTIO_BALLOON(obj);
+
+ qemu_mutex_init(&s->free_page_lock);
+ qemu_cond_init(&s->free_page_cond);
+ s->free_page_hint_cmd_id = VIRTIO_BALLOON_FREE_PAGE_HINT_CMD_ID_MIN;
+ s->free_page_hint_notify.notify = virtio_balloon_free_page_hint_notify;
+
+ object_property_add(obj, "guest-stats", "guest statistics",
+ balloon_stats_get_all, NULL, NULL, NULL);
+
+ object_property_add(obj, "guest-stats-polling-interval", "int",
+ balloon_stats_get_poll_interval,
+ balloon_stats_set_poll_interval,
+ NULL, NULL);
+}
+
+static const VMStateDescription vmstate_virtio_balloon = {
+ .name = "virtio-balloon",
+ .minimum_version_id = 1,
+ .version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_VIRTIO_DEVICE,
+ VMSTATE_END_OF_LIST()
+ },
+};
+
+static Property virtio_balloon_properties[] = {
+ DEFINE_PROP_BIT("deflate-on-oom", VirtIOBalloon, host_features,
+ VIRTIO_BALLOON_F_DEFLATE_ON_OOM, false),
+ DEFINE_PROP_BIT("free-page-hint", VirtIOBalloon, host_features,
+ VIRTIO_BALLOON_F_FREE_PAGE_HINT, false),
+ DEFINE_PROP_BIT("page-poison", VirtIOBalloon, host_features,
+ VIRTIO_BALLOON_F_PAGE_POISON, true),
+ DEFINE_PROP_BIT("free-page-reporting", VirtIOBalloon, host_features,
+ VIRTIO_BALLOON_F_REPORTING, false),
+ /* QEMU 4.0 accidentally changed the config size even when free-page-hint
+ * is disabled, resulting in QEMU 3.1 migration incompatibility. This
+ * property retains this quirk for QEMU 4.1 machine types.
+ */
+ DEFINE_PROP_BOOL("qemu-4-0-config-size", VirtIOBalloon,
+ qemu_4_0_config_size, false),
+ DEFINE_PROP_LINK("iothread", VirtIOBalloon, iothread, TYPE_IOTHREAD,
+ IOThread *),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void virtio_balloon_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+
+ device_class_set_props(dc, virtio_balloon_properties);
+ dc->vmsd = &vmstate_virtio_balloon;
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+ vdc->realize = virtio_balloon_device_realize;
+ vdc->unrealize = virtio_balloon_device_unrealize;
+ vdc->reset = virtio_balloon_device_reset;
+ vdc->get_config = virtio_balloon_get_config;
+ vdc->set_config = virtio_balloon_set_config;
+ vdc->get_features = virtio_balloon_get_features;
+ vdc->set_status = virtio_balloon_set_status;
+ vdc->vmsd = &vmstate_virtio_balloon_device;
+}
+
+static const TypeInfo virtio_balloon_info = {
+ .name = TYPE_VIRTIO_BALLOON,
+ .parent = TYPE_VIRTIO_DEVICE,
+ .instance_size = sizeof(VirtIOBalloon),
+ .instance_init = virtio_balloon_instance_init,
+ .class_init = virtio_balloon_class_init,
+};
+
+static void virtio_register_types(void)
+{
+ type_register_static(&virtio_balloon_info);
+}
+
+type_init(virtio_register_types)
diff --git a/hw/virtio/virtio-blk-pci.c b/hw/virtio/virtio-blk-pci.c
new file mode 100644
index 00000000..9743bee9
--- /dev/null
+++ b/hw/virtio/virtio-blk-pci.c
@@ -0,0 +1,107 @@
+/*
+ * Virtio blk PCI Bindings
+ *
+ * Copyright IBM, Corp. 2007
+ * Copyright (c) 2009 CodeSourcery
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Paul Brook <paul@codesourcery.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/qdev-properties.h"
+#include "hw/virtio/virtio-blk.h"
+#include "hw/virtio/virtio-pci.h"
+#include "qapi/error.h"
+#include "qemu/module.h"
+#include "qom/object.h"
+
+typedef struct VirtIOBlkPCI VirtIOBlkPCI;
+
+/*
+ * virtio-blk-pci: This extends VirtioPCIProxy.
+ */
+#define TYPE_VIRTIO_BLK_PCI "virtio-blk-pci-base"
+DECLARE_INSTANCE_CHECKER(VirtIOBlkPCI, VIRTIO_BLK_PCI,
+ TYPE_VIRTIO_BLK_PCI)
+
+struct VirtIOBlkPCI {
+ VirtIOPCIProxy parent_obj;
+ VirtIOBlock vdev;
+};
+
+static Property virtio_blk_pci_properties[] = {
+ DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0),
+ DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags,
+ VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true),
+ DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors,
+ DEV_NVECTORS_UNSPECIFIED),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void virtio_blk_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+ VirtIOBlkPCI *dev = VIRTIO_BLK_PCI(vpci_dev);
+ DeviceState *vdev = DEVICE(&dev->vdev);
+ VirtIOBlkConf *conf = &dev->vdev.conf;
+
+ if (conf->num_queues == VIRTIO_BLK_AUTO_NUM_QUEUES) {
+ conf->num_queues = virtio_pci_optimal_num_queues(0);
+ }
+
+ if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) {
+ vpci_dev->nvectors = conf->num_queues + 1;
+ }
+
+ qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
+}
+
+static void virtio_blk_pci_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+ PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+
+ set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+ device_class_set_props(dc, virtio_blk_pci_properties);
+ k->realize = virtio_blk_pci_realize;
+ pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+ pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_BLOCK;
+ pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
+ pcidev_k->class_id = PCI_CLASS_STORAGE_SCSI;
+}
+
+static void virtio_blk_pci_instance_init(Object *obj)
+{
+ VirtIOBlkPCI *dev = VIRTIO_BLK_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VIRTIO_BLK);
+ object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev),
+ "bootindex");
+}
+
+static const VirtioPCIDeviceTypeInfo virtio_blk_pci_info = {
+ .base_name = TYPE_VIRTIO_BLK_PCI,
+ .generic_name = "virtio-blk-pci",
+ .transitional_name = "virtio-blk-pci-transitional",
+ .non_transitional_name = "virtio-blk-pci-non-transitional",
+ .instance_size = sizeof(VirtIOBlkPCI),
+ .instance_init = virtio_blk_pci_instance_init,
+ .class_init = virtio_blk_pci_class_init,
+};
+
+static void virtio_blk_pci_register(void)
+{
+ virtio_pci_types_register(&virtio_blk_pci_info);
+}
+
+type_init(virtio_blk_pci_register)
diff --git a/hw/virtio/virtio-bus.c b/hw/virtio/virtio-bus.c
new file mode 100644
index 00000000..896feb37
--- /dev/null
+++ b/hw/virtio/virtio-bus.c
@@ -0,0 +1,372 @@
+/*
+ * VirtioBus
+ *
+ * Copyright (C) 2012 : GreenSocs Ltd
+ * http://www.greensocs.com/ , email: info@greensocs.com
+ *
+ * Developed by :
+ * Frederic Konrad <fred.konrad@greensocs.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "qemu/module.h"
+#include "qapi/error.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/virtio.h"
+#include "exec/address-spaces.h"
+
+/* #define DEBUG_VIRTIO_BUS */
+
+#ifdef DEBUG_VIRTIO_BUS
+#define DPRINTF(fmt, ...) \
+do { printf("virtio_bus: " fmt , ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) do { } while (0)
+#endif
+
+/* A VirtIODevice is being plugged */
+void virtio_bus_device_plugged(VirtIODevice *vdev, Error **errp)
+{
+ DeviceState *qdev = DEVICE(vdev);
+ BusState *qbus = BUS(qdev_get_parent_bus(qdev));
+ VirtioBusState *bus = VIRTIO_BUS(qbus);
+ VirtioBusClass *klass = VIRTIO_BUS_GET_CLASS(bus);
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
+ bool has_iommu = virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
+ bool vdev_has_iommu;
+ Error *local_err = NULL;
+
+ DPRINTF("%s: plug device.\n", qbus->name);
+
+ if (klass->pre_plugged != NULL) {
+ klass->pre_plugged(qbus->parent, &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return;
+ }
+ }
+
+ /* Get the features of the plugged device. */
+ assert(vdc->get_features != NULL);
+ vdev->host_features = vdc->get_features(vdev, vdev->host_features,
+ &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return;
+ }
+
+ if (klass->device_plugged != NULL) {
+ klass->device_plugged(qbus->parent, &local_err);
+ }
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return;
+ }
+
+ vdev->dma_as = &address_space_memory;
+ if (has_iommu) {
+ vdev_has_iommu = virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
+ /*
+     * Present IOMMU_PLATFORM to the driver iff iommu_platform=on and
+     * the device is operational. If the driver does not accept
+     * IOMMU_PLATFORM we fail the device.
+ */
+ virtio_add_feature(&vdev->host_features, VIRTIO_F_IOMMU_PLATFORM);
+ if (klass->get_dma_as) {
+ vdev->dma_as = klass->get_dma_as(qbus->parent);
+ if (!vdev_has_iommu && vdev->dma_as != &address_space_memory) {
+ error_setg(errp,
+ "iommu_platform=true is not supported by the device");
+ return;
+ }
+ }
+ }
+}
+
+/* Reset the virtio_bus */
+void virtio_bus_reset(VirtioBusState *bus)
+{
+ VirtIODevice *vdev = virtio_bus_get_device(bus);
+
+ DPRINTF("%s: reset device.\n", BUS(bus)->name);
+ virtio_bus_stop_ioeventfd(bus);
+ if (vdev != NULL) {
+ virtio_reset(vdev);
+ }
+}
+
+/* A VirtIODevice is being unplugged */
+void virtio_bus_device_unplugged(VirtIODevice *vdev)
+{
+ DeviceState *qdev = DEVICE(vdev);
+ BusState *qbus = BUS(qdev_get_parent_bus(qdev));
+ VirtioBusClass *klass = VIRTIO_BUS_GET_CLASS(qbus);
+
+ DPRINTF("%s: remove device.\n", qbus->name);
+
+ if (vdev != NULL) {
+ if (klass->device_unplugged != NULL) {
+ klass->device_unplugged(qbus->parent);
+ }
+ }
+}
+
+/* Get the device id of the plugged device. */
+uint16_t virtio_bus_get_vdev_id(VirtioBusState *bus)
+{
+ VirtIODevice *vdev = virtio_bus_get_device(bus);
+ assert(vdev != NULL);
+ return vdev->device_id;
+}
+
+/* Get the config_len field of the plugged device. */
+size_t virtio_bus_get_vdev_config_len(VirtioBusState *bus)
+{
+ VirtIODevice *vdev = virtio_bus_get_device(bus);
+ assert(vdev != NULL);
+ return vdev->config_len;
+}
+
+/* Get bad features of the plugged device. */
+uint32_t virtio_bus_get_vdev_bad_features(VirtioBusState *bus)
+{
+ VirtIODevice *vdev = virtio_bus_get_device(bus);
+ VirtioDeviceClass *k;
+
+ assert(vdev != NULL);
+ k = VIRTIO_DEVICE_GET_CLASS(vdev);
+ if (k->bad_features != NULL) {
+ return k->bad_features(vdev);
+ } else {
+ return 0;
+ }
+}
+
+/* Get config of the plugged device. */
+void virtio_bus_get_vdev_config(VirtioBusState *bus, uint8_t *config)
+{
+ VirtIODevice *vdev = virtio_bus_get_device(bus);
+ VirtioDeviceClass *k;
+
+ assert(vdev != NULL);
+ k = VIRTIO_DEVICE_GET_CLASS(vdev);
+ if (k->get_config != NULL) {
+ k->get_config(vdev, config);
+ }
+}
+
+/* Set config of the plugged device. */
+void virtio_bus_set_vdev_config(VirtioBusState *bus, uint8_t *config)
+{
+ VirtIODevice *vdev = virtio_bus_get_device(bus);
+ VirtioDeviceClass *k;
+
+ assert(vdev != NULL);
+ k = VIRTIO_DEVICE_GET_CLASS(vdev);
+ if (k->set_config != NULL) {
+ k->set_config(vdev, config);
+ }
+}
+
+/* On success, ioeventfd ownership belongs to the caller. */
+int virtio_bus_grab_ioeventfd(VirtioBusState *bus)
+{
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(bus);
+
+ /* vhost can be used even if ioeventfd=off in the proxy device,
+ * so do not check k->ioeventfd_enabled.
+ */
+ if (!k->ioeventfd_assign) {
+ return -ENOSYS;
+ }
+
+ if (bus->ioeventfd_grabbed == 0 && bus->ioeventfd_started) {
+ virtio_bus_stop_ioeventfd(bus);
+ /* Remember that we need to restart ioeventfd
+ * when ioeventfd_grabbed becomes zero.
+ */
+ bus->ioeventfd_started = true;
+ }
+ bus->ioeventfd_grabbed++;
+ return 0;
+}
+
+void virtio_bus_release_ioeventfd(VirtioBusState *bus)
+{
+ assert(bus->ioeventfd_grabbed != 0);
+ if (--bus->ioeventfd_grabbed == 0 && bus->ioeventfd_started) {
+ /* Force virtio_bus_start_ioeventfd to act. */
+ bus->ioeventfd_started = false;
+ virtio_bus_start_ioeventfd(bus);
+ }
+}
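
/*
 * A minimal caller sketch, assuming a vhost-style backend (hypothetical
 * names, compiled inside the QEMU tree): the grab/release pair above
 * reference-counts ioeventfd ownership so the backend can take over the
 * host notifiers while it runs.
 */
static int my_backend_start(VirtioBusState *bus)
{
    int r = virtio_bus_grab_ioeventfd(bus);  /* pauses generic ioeventfd */
    if (r < 0) {
        return r;               /* transport has no ioeventfd_assign hook */
    }
    /* ... assign backend-owned host notifiers here ... */
    return 0;
}

static void my_backend_stop(VirtioBusState *bus)
{
    /* ... deassign backend-owned host notifiers here ... */
    virtio_bus_release_ioeventfd(bus);  /* may restart generic ioeventfd */
}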
+
+int virtio_bus_start_ioeventfd(VirtioBusState *bus)
+{
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(bus);
+ DeviceState *proxy = DEVICE(BUS(bus)->parent);
+ VirtIODevice *vdev = virtio_bus_get_device(bus);
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
+ int r;
+
+ if (!k->ioeventfd_assign || !k->ioeventfd_enabled(proxy)) {
+ return -ENOSYS;
+ }
+ if (bus->ioeventfd_started) {
+ return 0;
+ }
+
+ /* Only set our notifier if we have ownership. */
+ if (!bus->ioeventfd_grabbed) {
+ r = vdc->start_ioeventfd(vdev);
+ if (r < 0) {
+            error_report("%s: failed. Falling back to userspace (slower).", __func__);
+ return r;
+ }
+ }
+ bus->ioeventfd_started = true;
+ return 0;
+}
+
+void virtio_bus_stop_ioeventfd(VirtioBusState *bus)
+{
+ VirtIODevice *vdev;
+ VirtioDeviceClass *vdc;
+
+ if (!bus->ioeventfd_started) {
+ return;
+ }
+
+ /* Only remove our notifier if we have ownership. */
+ if (!bus->ioeventfd_grabbed) {
+ vdev = virtio_bus_get_device(bus);
+ vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
+ vdc->stop_ioeventfd(vdev);
+ }
+ bus->ioeventfd_started = false;
+}
+
+bool virtio_bus_ioeventfd_enabled(VirtioBusState *bus)
+{
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(bus);
+ DeviceState *proxy = DEVICE(BUS(bus)->parent);
+
+ return k->ioeventfd_assign && k->ioeventfd_enabled(proxy);
+}
+
+/*
+ * This function switches ioeventfd on/off in the device.
+ * The caller must set or clear the handlers for the EventNotifier.
+ */
+int virtio_bus_set_host_notifier(VirtioBusState *bus, int n, bool assign)
+{
+ VirtIODevice *vdev = virtio_bus_get_device(bus);
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(bus);
+ DeviceState *proxy = DEVICE(BUS(bus)->parent);
+ VirtQueue *vq = virtio_get_queue(vdev, n);
+ EventNotifier *notifier = virtio_queue_get_host_notifier(vq);
+ int r = 0;
+
+ if (!k->ioeventfd_assign) {
+ return -ENOSYS;
+ }
+
+ if (assign) {
+ r = event_notifier_init(notifier, 1);
+ if (r < 0) {
+ error_report("%s: unable to init event notifier: %s (%d)",
+ __func__, strerror(-r), r);
+ return r;
+ }
+ r = k->ioeventfd_assign(proxy, notifier, n, true);
+ if (r < 0) {
+ error_report("%s: unable to assign ioeventfd: %d", __func__, r);
+ virtio_bus_cleanup_host_notifier(bus, n);
+ }
+ } else {
+ k->ioeventfd_assign(proxy, notifier, n, false);
+ }
+
+ if (r == 0) {
+ virtio_queue_set_host_notifier_enabled(vq, assign);
+ }
+
+ return r;
+}
+
+void virtio_bus_cleanup_host_notifier(VirtioBusState *bus, int n)
+{
+ VirtIODevice *vdev = virtio_bus_get_device(bus);
+ VirtQueue *vq = virtio_get_queue(vdev, n);
+ EventNotifier *notifier = virtio_queue_get_host_notifier(vq);
+
+    /* Test and clear the notifier after disabling the event,
+     * in case the poll callback didn't have time to run.
+ */
+ virtio_queue_host_notifier_read(notifier);
+ event_notifier_cleanup(notifier);
+}
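
/*
 * A caller sketch for virtio_bus_set_host_notifier() (hypothetical
 * functions, illustrative only): installing and removing the handler is
 * the caller's job, as the comment above notes. event_notifier_set_handler()
 * is assumed to be the usual QEMU main-loop helper.
 */
static int start_vq(VirtioBusState *bus, VirtIODevice *vdev, int n,
                    EventNotifierHandler *handler)
{
    int r = virtio_bus_set_host_notifier(bus, n, true);
    if (r < 0) {
        return r;
    }
    /* assign succeeded: install the caller's handler */
    event_notifier_set_handler(
        virtio_queue_get_host_notifier(virtio_get_queue(vdev, n)), handler);
    return 0;
}

static void stop_vq(VirtioBusState *bus, VirtIODevice *vdev, int n)
{
    /* remove the handler before tearing the notifier down */
    event_notifier_set_handler(
        virtio_queue_get_host_notifier(virtio_get_queue(vdev, n)), NULL);
    virtio_bus_set_host_notifier(bus, n, false);
    virtio_bus_cleanup_host_notifier(bus, n);
}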
+
+static char *virtio_bus_get_dev_path(DeviceState *dev)
+{
+ BusState *bus = qdev_get_parent_bus(dev);
+ DeviceState *proxy = DEVICE(bus->parent);
+ return qdev_get_dev_path(proxy);
+}
+
+static char *virtio_bus_get_fw_dev_path(DeviceState *dev)
+{
+ return NULL;
+}
+
+bool virtio_bus_device_iommu_enabled(VirtIODevice *vdev)
+{
+ DeviceState *qdev = DEVICE(vdev);
+ BusState *qbus = BUS(qdev_get_parent_bus(qdev));
+ VirtioBusState *bus = VIRTIO_BUS(qbus);
+ VirtioBusClass *klass = VIRTIO_BUS_GET_CLASS(bus);
+
+ if (!klass->iommu_enabled) {
+ return false;
+ }
+
+ return klass->iommu_enabled(qbus->parent);
+}
+
+static void virtio_bus_class_init(ObjectClass *klass, void *data)
+{
+ BusClass *bus_class = BUS_CLASS(klass);
+ bus_class->get_dev_path = virtio_bus_get_dev_path;
+ bus_class->get_fw_dev_path = virtio_bus_get_fw_dev_path;
+}
+
+static const TypeInfo virtio_bus_info = {
+ .name = TYPE_VIRTIO_BUS,
+ .parent = TYPE_BUS,
+ .instance_size = sizeof(VirtioBusState),
+ .abstract = true,
+ .class_size = sizeof(VirtioBusClass),
+ .class_init = virtio_bus_class_init
+};
+
+static void virtio_register_types(void)
+{
+ type_register_static(&virtio_bus_info);
+}
+
+type_init(virtio_register_types)
diff --git a/hw/virtio/virtio-crypto-pci.c b/hw/virtio/virtio-crypto-pci.c
new file mode 100644
index 00000000..0783dc2f
--- /dev/null
+++ b/hw/virtio/virtio-crypto-pci.c
@@ -0,0 +1,94 @@
+/*
+ * Virtio crypto device
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ *
+ * Authors:
+ * Gonglei <arei.gonglei@huawei.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version. See the COPYING file in the
+ * top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "hw/pci/pci.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/virtio-pci.h"
+#include "hw/virtio/virtio-crypto.h"
+#include "qapi/error.h"
+#include "qemu/module.h"
+#include "qom/object.h"
+
+typedef struct VirtIOCryptoPCI VirtIOCryptoPCI;
+
+/*
+ * virtio-crypto-pci: This extends VirtioPCIProxy.
+ */
+#define TYPE_VIRTIO_CRYPTO_PCI "virtio-crypto-pci"
+DECLARE_INSTANCE_CHECKER(VirtIOCryptoPCI, VIRTIO_CRYPTO_PCI,
+ TYPE_VIRTIO_CRYPTO_PCI)
+
+struct VirtIOCryptoPCI {
+ VirtIOPCIProxy parent_obj;
+ VirtIOCrypto vdev;
+};
+
+static Property virtio_crypto_pci_properties[] = {
+ DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags,
+ VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true),
+ DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 2),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void virtio_crypto_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+ VirtIOCryptoPCI *vcrypto = VIRTIO_CRYPTO_PCI(vpci_dev);
+ DeviceState *vdev = DEVICE(&vcrypto->vdev);
+
+ if (vcrypto->vdev.conf.cryptodev == NULL) {
+ error_setg(errp, "'cryptodev' parameter expects a valid object");
+ return;
+ }
+
+ virtio_pci_force_virtio_1(vpci_dev);
+ if (!qdev_realize(vdev, BUS(&vpci_dev->bus), errp)) {
+ return;
+ }
+}
+
+static void virtio_crypto_pci_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+ PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+
+ k->realize = virtio_crypto_pci_realize;
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+ device_class_set_props(dc, virtio_crypto_pci_properties);
+ pcidev_k->class_id = PCI_CLASS_OTHERS;
+}
+
+static void virtio_crypto_initfn(Object *obj)
+{
+ VirtIOCryptoPCI *dev = VIRTIO_CRYPTO_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VIRTIO_CRYPTO);
+}
+
+static const VirtioPCIDeviceTypeInfo virtio_crypto_pci_info = {
+ .generic_name = TYPE_VIRTIO_CRYPTO_PCI,
+ .instance_size = sizeof(VirtIOCryptoPCI),
+ .instance_init = virtio_crypto_initfn,
+ .class_init = virtio_crypto_pci_class_init,
+};
+
+static void virtio_crypto_pci_register_types(void)
+{
+ virtio_pci_types_register(&virtio_crypto_pci_info);
+}
+type_init(virtio_crypto_pci_register_types)
diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c
new file mode 100644
index 00000000..97da74e7
--- /dev/null
+++ b/hw/virtio/virtio-crypto.c
@@ -0,0 +1,1250 @@
+/*
+ * Virtio crypto Support
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ *
+ * Authors:
+ * Gonglei <arei.gonglei@huawei.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version. See the COPYING file in the
+ * top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/iov.h"
+#include "qemu/main-loop.h"
+#include "qemu/module.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/virtio-crypto.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/virtio-access.h"
+#include "standard-headers/linux/virtio_ids.h"
+#include "sysemu/cryptodev-vhost.h"
+
+#define VIRTIO_CRYPTO_VM_VERSION 1
+
+typedef struct VirtIOCryptoSessionReq {
+ VirtIODevice *vdev;
+ VirtQueue *vq;
+ VirtQueueElement *elem;
+ CryptoDevBackendSessionInfo info;
+ CryptoDevCompletionFunc cb;
+} VirtIOCryptoSessionReq;
+
+static void virtio_crypto_free_create_session_req(VirtIOCryptoSessionReq *sreq)
+{
+ switch (sreq->info.op_code) {
+ case VIRTIO_CRYPTO_CIPHER_CREATE_SESSION:
+ g_free(sreq->info.u.sym_sess_info.cipher_key);
+ g_free(sreq->info.u.sym_sess_info.auth_key);
+ break;
+
+ case VIRTIO_CRYPTO_AKCIPHER_CREATE_SESSION:
+ g_free(sreq->info.u.asym_sess_info.key);
+ break;
+
+ case VIRTIO_CRYPTO_CIPHER_DESTROY_SESSION:
+ case VIRTIO_CRYPTO_HASH_DESTROY_SESSION:
+ case VIRTIO_CRYPTO_MAC_DESTROY_SESSION:
+ case VIRTIO_CRYPTO_AEAD_DESTROY_SESSION:
+ case VIRTIO_CRYPTO_AKCIPHER_DESTROY_SESSION:
+ break;
+
+ default:
+ error_report("Unknown opcode: %u", sreq->info.op_code);
+ }
+ g_free(sreq);
+}
+
+/*
+ * Translate a virtqueue index into a crypto queue index.
+ * The control virtqueue comes after the data virtqueues,
+ * so the input value doesn't need to be adjusted.
+ */
+static inline int virtio_crypto_vq2q(int queue_index)
+{
+ return queue_index;
+}
+
+static int
+virtio_crypto_cipher_session_helper(VirtIODevice *vdev,
+ CryptoDevBackendSymSessionInfo *info,
+ struct virtio_crypto_cipher_session_para *cipher_para,
+ struct iovec **iov, unsigned int *out_num)
+{
+ VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev);
+ unsigned int num = *out_num;
+
+ info->cipher_alg = ldl_le_p(&cipher_para->algo);
+ info->key_len = ldl_le_p(&cipher_para->keylen);
+ info->direction = ldl_le_p(&cipher_para->op);
+ DPRINTF("cipher_alg=%" PRIu32 ", info->direction=%" PRIu32 "\n",
+ info->cipher_alg, info->direction);
+
+ if (info->key_len > vcrypto->conf.max_cipher_key_len) {
+ error_report("virtio-crypto length of cipher key is too big: %u",
+ info->key_len);
+ return -VIRTIO_CRYPTO_ERR;
+ }
+ /* Get cipher key */
+ if (info->key_len > 0) {
+ size_t s;
+ DPRINTF("keylen=%" PRIu32 "\n", info->key_len);
+
+ info->cipher_key = g_malloc(info->key_len);
+ s = iov_to_buf(*iov, num, 0, info->cipher_key, info->key_len);
+ if (unlikely(s != info->key_len)) {
+ virtio_error(vdev, "virtio-crypto cipher key incorrect");
+ return -EFAULT;
+ }
+ iov_discard_front(iov, &num, info->key_len);
+ *out_num = num;
+ }
+
+ return 0;
+}
+
+static int
+virtio_crypto_create_sym_session(VirtIOCrypto *vcrypto,
+ struct virtio_crypto_sym_create_session_req *sess_req,
+ uint32_t queue_id,
+ uint32_t opcode,
+ struct iovec *iov, unsigned int out_num,
+ VirtIOCryptoSessionReq *sreq)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(vcrypto);
+ CryptoDevBackendSymSessionInfo *sym_info = &sreq->info.u.sym_sess_info;
+ int queue_index;
+ uint32_t op_type;
+ int ret;
+
+ op_type = ldl_le_p(&sess_req->op_type);
+ sreq->info.op_code = opcode;
+
+ sym_info = &sreq->info.u.sym_sess_info;
+ sym_info->op_type = op_type;
+
+ if (op_type == VIRTIO_CRYPTO_SYM_OP_CIPHER) {
+ ret = virtio_crypto_cipher_session_helper(vdev, sym_info,
+ &sess_req->u.cipher.para,
+ &iov, &out_num);
+ if (ret < 0) {
+ return ret;
+ }
+ } else if (op_type == VIRTIO_CRYPTO_SYM_OP_ALGORITHM_CHAINING) {
+ size_t s;
+ /* cipher part */
+ ret = virtio_crypto_cipher_session_helper(vdev, sym_info,
+ &sess_req->u.chain.para.cipher_param,
+ &iov, &out_num);
+ if (ret < 0) {
+ return ret;
+ }
+ /* hash part */
+ sym_info->alg_chain_order = ldl_le_p(
+ &sess_req->u.chain.para.alg_chain_order);
+ sym_info->add_len = ldl_le_p(&sess_req->u.chain.para.aad_len);
+ sym_info->hash_mode = ldl_le_p(&sess_req->u.chain.para.hash_mode);
+ if (sym_info->hash_mode == VIRTIO_CRYPTO_SYM_HASH_MODE_AUTH) {
+ sym_info->hash_alg =
+ ldl_le_p(&sess_req->u.chain.para.u.mac_param.algo);
+ sym_info->auth_key_len = ldl_le_p(
+ &sess_req->u.chain.para.u.mac_param.auth_key_len);
+ sym_info->hash_result_len = ldl_le_p(
+ &sess_req->u.chain.para.u.mac_param.hash_result_len);
+ if (sym_info->auth_key_len > vcrypto->conf.max_auth_key_len) {
+ error_report("virtio-crypto length of auth key is too big: %u",
+ sym_info->auth_key_len);
+ return -VIRTIO_CRYPTO_ERR;
+ }
+ /* get auth key */
+ if (sym_info->auth_key_len > 0) {
+ sym_info->auth_key = g_malloc(sym_info->auth_key_len);
+ s = iov_to_buf(iov, out_num, 0, sym_info->auth_key,
+ sym_info->auth_key_len);
+ if (unlikely(s != sym_info->auth_key_len)) {
+ virtio_error(vdev,
+ "virtio-crypto authenticated key incorrect");
+ return -EFAULT;
+ }
+ iov_discard_front(&iov, &out_num, sym_info->auth_key_len);
+ }
+ } else if (sym_info->hash_mode == VIRTIO_CRYPTO_SYM_HASH_MODE_PLAIN) {
+ sym_info->hash_alg = ldl_le_p(
+ &sess_req->u.chain.para.u.hash_param.algo);
+ sym_info->hash_result_len = ldl_le_p(
+ &sess_req->u.chain.para.u.hash_param.hash_result_len);
+ } else {
+ /* VIRTIO_CRYPTO_SYM_HASH_MODE_NESTED */
+ error_report("unsupported hash mode");
+ return -VIRTIO_CRYPTO_NOTSUPP;
+ }
+ } else {
+ /* VIRTIO_CRYPTO_SYM_OP_NONE */
+ error_report("unsupported cipher op_type: VIRTIO_CRYPTO_SYM_OP_NONE");
+ return -VIRTIO_CRYPTO_NOTSUPP;
+ }
+
+ queue_index = virtio_crypto_vq2q(queue_id);
+ return cryptodev_backend_create_session(vcrypto->cryptodev, &sreq->info,
+ queue_index, sreq->cb, sreq);
+}
+
+static int
+virtio_crypto_create_asym_session(VirtIOCrypto *vcrypto,
+ struct virtio_crypto_akcipher_create_session_req *sess_req,
+ uint32_t queue_id, uint32_t opcode,
+ struct iovec *iov, unsigned int out_num,
+ VirtIOCryptoSessionReq *sreq)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(vcrypto);
+ CryptoDevBackendAsymSessionInfo *asym_info = &sreq->info.u.asym_sess_info;
+ int queue_index;
+ uint32_t algo, keytype, keylen;
+
+ algo = ldl_le_p(&sess_req->para.algo);
+ keytype = ldl_le_p(&sess_req->para.keytype);
+ keylen = ldl_le_p(&sess_req->para.keylen);
+
+ if ((keytype != VIRTIO_CRYPTO_AKCIPHER_KEY_TYPE_PUBLIC)
+ && (keytype != VIRTIO_CRYPTO_AKCIPHER_KEY_TYPE_PRIVATE)) {
+ error_report("unsupported asym keytype: %d", keytype);
+ return -VIRTIO_CRYPTO_NOTSUPP;
+ }
+
+ if (keylen) {
+ asym_info->key = g_malloc(keylen);
+ if (iov_to_buf(iov, out_num, 0, asym_info->key, keylen) != keylen) {
+ virtio_error(vdev, "virtio-crypto asym key incorrect");
+ return -EFAULT;
+ }
+ iov_discard_front(&iov, &out_num, keylen);
+ }
+
+ sreq->info.op_code = opcode;
+ asym_info = &sreq->info.u.asym_sess_info;
+ asym_info->algo = algo;
+ asym_info->keytype = keytype;
+ asym_info->keylen = keylen;
+ switch (asym_info->algo) {
+ case VIRTIO_CRYPTO_AKCIPHER_RSA:
+ asym_info->u.rsa.padding_algo =
+ ldl_le_p(&sess_req->para.u.rsa.padding_algo);
+ asym_info->u.rsa.hash_algo =
+ ldl_le_p(&sess_req->para.u.rsa.hash_algo);
+ break;
+
+ /* TODO DSA&ECDSA handling */
+
+ default:
+ return -VIRTIO_CRYPTO_ERR;
+ }
+
+ queue_index = virtio_crypto_vq2q(queue_id);
+ return cryptodev_backend_create_session(vcrypto->cryptodev, &sreq->info,
+ queue_index, sreq->cb, sreq);
+}
+
+static int
+virtio_crypto_handle_close_session(VirtIOCrypto *vcrypto,
+ struct virtio_crypto_destroy_session_req *close_sess_req,
+ uint32_t queue_id,
+ VirtIOCryptoSessionReq *sreq)
+{
+ uint64_t session_id;
+
+ session_id = ldq_le_p(&close_sess_req->session_id);
+ DPRINTF("close session, id=%" PRIu64 "\n", session_id);
+
+ return cryptodev_backend_close_session(
+ vcrypto->cryptodev, session_id, queue_id, sreq->cb, sreq);
+}
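
/*
 * The create/close calls above are asynchronous: the backend later invokes
 * sreq->cb(sreq, ret), where ret is VIRTIO_CRYPTO_OK or a negated status
 * that the completion handlers below decode. A trivially synchronous
 * backend sketch (made-up name, illustrative only) shows the contract:
 */
static int fake_backend_create_session(CryptoDevBackendSessionInfo *info,
                                       CryptoDevCompletionFunc cb,
                                       void *opaque)
{
    info->session_id = 42;         /* made-up session id */
    cb(opaque, VIRTIO_CRYPTO_OK);  /* report completion immediately */
    return 0;                      /* < 0 would mean "failed synchronously" */
}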
+
+static void virtio_crypto_create_session_completion(void *opaque, int ret)
+{
+ VirtIOCryptoSessionReq *sreq = (VirtIOCryptoSessionReq *)opaque;
+ VirtQueue *vq = sreq->vq;
+ VirtQueueElement *elem = sreq->elem;
+ VirtIODevice *vdev = sreq->vdev;
+ struct virtio_crypto_session_input input;
+ struct iovec *in_iov = elem->in_sg;
+ unsigned in_num = elem->in_num;
+ size_t s;
+
+ memset(&input, 0, sizeof(input));
+    /* Serious error: the virtio crypto device needs to be reset */
+ if (ret == -EFAULT) {
+ virtqueue_detach_element(vq, elem, 0);
+ goto out;
+ } else if (ret == -VIRTIO_CRYPTO_NOTSUPP) {
+ stl_le_p(&input.status, VIRTIO_CRYPTO_NOTSUPP);
+ } else if (ret == -VIRTIO_CRYPTO_KEY_REJECTED) {
+ stl_le_p(&input.status, VIRTIO_CRYPTO_KEY_REJECTED);
+ } else if (ret != VIRTIO_CRYPTO_OK) {
+ stl_le_p(&input.status, VIRTIO_CRYPTO_ERR);
+ } else {
+ /* Set the session id */
+ stq_le_p(&input.session_id, sreq->info.session_id);
+ stl_le_p(&input.status, VIRTIO_CRYPTO_OK);
+ }
+
+ s = iov_from_buf(in_iov, in_num, 0, &input, sizeof(input));
+ if (unlikely(s != sizeof(input))) {
+ virtio_error(vdev, "virtio-crypto input incorrect");
+ virtqueue_detach_element(vq, elem, 0);
+ goto out;
+ }
+ virtqueue_push(vq, elem, sizeof(input));
+ virtio_notify(vdev, vq);
+
+out:
+ g_free(elem);
+ virtio_crypto_free_create_session_req(sreq);
+}
+
+static void virtio_crypto_destroy_session_completion(void *opaque, int ret)
+{
+ VirtIOCryptoSessionReq *sreq = (VirtIOCryptoSessionReq *)opaque;
+ VirtQueue *vq = sreq->vq;
+ VirtQueueElement *elem = sreq->elem;
+ VirtIODevice *vdev = sreq->vdev;
+ struct iovec *in_iov = elem->in_sg;
+ unsigned in_num = elem->in_num;
+ uint8_t status;
+ size_t s;
+
+ if (ret < 0) {
+ status = VIRTIO_CRYPTO_ERR;
+ } else {
+ status = VIRTIO_CRYPTO_OK;
+ }
+ s = iov_from_buf(in_iov, in_num, 0, &status, sizeof(status));
+ if (unlikely(s != sizeof(status))) {
+ virtio_error(vdev, "virtio-crypto status incorrect");
+ virtqueue_detach_element(vq, elem, 0);
+ goto out;
+ }
+ virtqueue_push(vq, elem, sizeof(status));
+ virtio_notify(vdev, vq);
+
+out:
+ g_free(elem);
+ g_free(sreq);
+}
+
+static void virtio_crypto_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
+{
+ VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev);
+ struct virtio_crypto_op_ctrl_req ctrl;
+ VirtQueueElement *elem;
+ VirtIOCryptoSessionReq *sreq;
+ unsigned out_num;
+ unsigned in_num;
+ uint32_t queue_id;
+ uint32_t opcode;
+ struct virtio_crypto_session_input input;
+ size_t s;
+ int ret;
+ struct iovec *out_iov;
+ struct iovec *in_iov;
+
+ for (;;) {
+ g_autofree struct iovec *out_iov_copy = NULL;
+
+ elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
+ if (!elem) {
+ break;
+ }
+ if (elem->out_num < 1 || elem->in_num < 1) {
+ virtio_error(vdev, "virtio-crypto ctrl missing headers");
+ virtqueue_detach_element(vq, elem, 0);
+ g_free(elem);
+ break;
+ }
+
+ out_num = elem->out_num;
+ out_iov_copy = g_memdup2(elem->out_sg, sizeof(out_iov[0]) * out_num);
+ out_iov = out_iov_copy;
+
+ in_num = elem->in_num;
+ in_iov = elem->in_sg;
+
+ if (unlikely(iov_to_buf(out_iov, out_num, 0, &ctrl, sizeof(ctrl))
+ != sizeof(ctrl))) {
+ virtio_error(vdev, "virtio-crypto request ctrl_hdr too short");
+ virtqueue_detach_element(vq, elem, 0);
+ g_free(elem);
+ break;
+ }
+ iov_discard_front(&out_iov, &out_num, sizeof(ctrl));
+
+ opcode = ldl_le_p(&ctrl.header.opcode);
+ queue_id = ldl_le_p(&ctrl.header.queue_id);
+
+ sreq = g_new0(VirtIOCryptoSessionReq, 1);
+ sreq->vdev = vdev;
+ sreq->vq = vq;
+ sreq->elem = elem;
+
+ switch (opcode) {
+ case VIRTIO_CRYPTO_CIPHER_CREATE_SESSION:
+ sreq->cb = virtio_crypto_create_session_completion;
+ ret = virtio_crypto_create_sym_session(vcrypto,
+ &ctrl.u.sym_create_session,
+ queue_id, opcode,
+ out_iov, out_num,
+ sreq);
+ if (ret < 0) {
+ virtio_crypto_create_session_completion(sreq, ret);
+ }
+ break;
+
+ case VIRTIO_CRYPTO_AKCIPHER_CREATE_SESSION:
+ sreq->cb = virtio_crypto_create_session_completion;
+ ret = virtio_crypto_create_asym_session(vcrypto,
+ &ctrl.u.akcipher_create_session,
+ queue_id, opcode,
+ out_iov, out_num,
+ sreq);
+ if (ret < 0) {
+ virtio_crypto_create_session_completion(sreq, ret);
+ }
+ break;
+
+ case VIRTIO_CRYPTO_CIPHER_DESTROY_SESSION:
+ case VIRTIO_CRYPTO_HASH_DESTROY_SESSION:
+ case VIRTIO_CRYPTO_MAC_DESTROY_SESSION:
+ case VIRTIO_CRYPTO_AEAD_DESTROY_SESSION:
+ case VIRTIO_CRYPTO_AKCIPHER_DESTROY_SESSION:
+ sreq->cb = virtio_crypto_destroy_session_completion;
+ ret = virtio_crypto_handle_close_session(vcrypto,
+ &ctrl.u.destroy_session, queue_id,
+ sreq);
+ if (ret < 0) {
+ virtio_crypto_destroy_session_completion(sreq, ret);
+ }
+ break;
+
+ case VIRTIO_CRYPTO_HASH_CREATE_SESSION:
+ case VIRTIO_CRYPTO_MAC_CREATE_SESSION:
+ case VIRTIO_CRYPTO_AEAD_CREATE_SESSION:
+ default:
+ memset(&input, 0, sizeof(input));
+ error_report("virtio-crypto unsupported ctrl opcode: %d", opcode);
+ stl_le_p(&input.status, VIRTIO_CRYPTO_NOTSUPP);
+ s = iov_from_buf(in_iov, in_num, 0, &input, sizeof(input));
+ if (unlikely(s != sizeof(input))) {
+ virtio_error(vdev, "virtio-crypto input incorrect");
+ virtqueue_detach_element(vq, elem, 0);
+ } else {
+ virtqueue_push(vq, elem, sizeof(input));
+ virtio_notify(vdev, vq);
+ }
+ g_free(sreq);
+ g_free(elem);
+
+ break;
+ } /* end switch case */
+
+ } /* end for loop */
+}
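
/*
 * Wire framing handled by the loop above, sketched as a pseudostructure
 * (illustrative; the real buffers arrive as scatter-gather lists): the
 * driver-readable (out) part carries the fixed header plus variable-length
 * key material, and the device writes either a virtio_crypto_session_input
 * (create) or a single status byte (destroy) into the in part.
 */
struct ctrl_request_framing {
    /* out direction, in order: */
    struct virtio_crypto_op_ctrl_req hdr;     /* fixed control header */
    /* ... key material follows in further out buffers ... */
    /* in direction: */
    struct virtio_crypto_session_input resp;  /* session id + status */
};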
+
+static void virtio_crypto_init_request(VirtIOCrypto *vcrypto, VirtQueue *vq,
+ VirtIOCryptoReq *req)
+{
+ req->vcrypto = vcrypto;
+ req->vq = vq;
+ req->in = NULL;
+ req->in_iov = NULL;
+ req->in_num = 0;
+ req->in_len = 0;
+ req->flags = CRYPTODEV_BACKEND_ALG__MAX;
+ memset(&req->op_info, 0x00, sizeof(req->op_info));
+}
+
+static void virtio_crypto_free_request(VirtIOCryptoReq *req)
+{
+ if (!req) {
+ return;
+ }
+
+ if (req->flags == CRYPTODEV_BACKEND_ALG_SYM) {
+ size_t max_len;
+ CryptoDevBackendSymOpInfo *op_info = req->op_info.u.sym_op_info;
+
+ max_len = op_info->iv_len +
+ op_info->aad_len +
+ op_info->src_len +
+ op_info->dst_len +
+ op_info->digest_result_len;
+
+ /* Zeroize and free request data structure */
+ memset(op_info, 0, sizeof(*op_info) + max_len);
+ g_free(op_info);
+ } else if (req->flags == CRYPTODEV_BACKEND_ALG_ASYM) {
+ CryptoDevBackendAsymOpInfo *op_info = req->op_info.u.asym_op_info;
+ if (op_info) {
+ g_free(op_info->src);
+ g_free(op_info->dst);
+ memset(op_info, 0, sizeof(*op_info));
+ g_free(op_info);
+ }
+ }
+
+ g_free(req->in_iov);
+ g_free(req);
+}
+
+static void
+virtio_crypto_sym_input_data_helper(VirtIODevice *vdev,
+ VirtIOCryptoReq *req,
+ uint32_t status,
+ CryptoDevBackendSymOpInfo *sym_op_info)
+{
+ size_t s, len;
+ struct iovec *in_iov = req->in_iov;
+
+ if (status != VIRTIO_CRYPTO_OK) {
+ return;
+ }
+
+ len = sym_op_info->src_len;
+ /* Save the cipher result */
+ s = iov_from_buf(in_iov, req->in_num, 0, sym_op_info->dst, len);
+ if (s != len) {
+ virtio_error(vdev, "virtio-crypto dest data incorrect");
+ return;
+ }
+
+ iov_discard_front(&in_iov, &req->in_num, len);
+
+ if (sym_op_info->op_type ==
+ VIRTIO_CRYPTO_SYM_OP_ALGORITHM_CHAINING) {
+ /* Save the digest result */
+ s = iov_from_buf(in_iov, req->in_num, 0,
+ sym_op_info->digest_result,
+ sym_op_info->digest_result_len);
+ if (s != sym_op_info->digest_result_len) {
+ virtio_error(vdev, "virtio-crypto digest result incorrect");
+ }
+ }
+}
+
+static void
+virtio_crypto_akcipher_input_data_helper(VirtIODevice *vdev,
+ VirtIOCryptoReq *req, int32_t status,
+ CryptoDevBackendAsymOpInfo *asym_op_info)
+{
+ size_t s, len;
+ struct iovec *in_iov = req->in_iov;
+
+ if (status != VIRTIO_CRYPTO_OK) {
+ return;
+ }
+
+ len = asym_op_info->dst_len;
+ if (!len) {
+ return;
+ }
+
+ s = iov_from_buf(in_iov, req->in_num, 0, asym_op_info->dst, len);
+ if (s != len) {
+ virtio_error(vdev, "virtio-crypto asym dest data incorrect");
+ return;
+ }
+
+ iov_discard_front(&in_iov, &req->in_num, len);
+
+    /* For akcipher, dst_len may be changed by the operation */
+ req->in_len = sizeof(struct virtio_crypto_inhdr) + asym_op_info->dst_len;
+}
+
+static void virtio_crypto_req_complete(void *opaque, int ret)
+{
+ VirtIOCryptoReq *req = (VirtIOCryptoReq *)opaque;
+ VirtIOCrypto *vcrypto = req->vcrypto;
+ VirtIODevice *vdev = VIRTIO_DEVICE(vcrypto);
+ uint8_t status = -ret;
+
+ if (req->flags == CRYPTODEV_BACKEND_ALG_SYM) {
+ virtio_crypto_sym_input_data_helper(vdev, req, status,
+ req->op_info.u.sym_op_info);
+ } else if (req->flags == CRYPTODEV_BACKEND_ALG_ASYM) {
+ virtio_crypto_akcipher_input_data_helper(vdev, req, status,
+ req->op_info.u.asym_op_info);
+ }
+ stb_p(&req->in->status, status);
+ virtqueue_push(req->vq, &req->elem, req->in_len);
+ virtio_notify(vdev, req->vq);
+ virtio_crypto_free_request(req);
+}
+
+static VirtIOCryptoReq *
+virtio_crypto_get_request(VirtIOCrypto *s, VirtQueue *vq)
+{
+ VirtIOCryptoReq *req = virtqueue_pop(vq, sizeof(VirtIOCryptoReq));
+
+ if (req) {
+ virtio_crypto_init_request(s, vq, req);
+ }
+ return req;
+}
+
+static CryptoDevBackendSymOpInfo *
+virtio_crypto_sym_op_helper(VirtIODevice *vdev,
+ struct virtio_crypto_cipher_para *cipher_para,
+ struct virtio_crypto_alg_chain_data_para *alg_chain_para,
+ struct iovec *iov, unsigned int out_num)
+{
+ VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev);
+ CryptoDevBackendSymOpInfo *op_info;
+ uint32_t src_len = 0, dst_len = 0;
+ uint32_t iv_len = 0;
+ uint32_t aad_len = 0, hash_result_len = 0;
+ uint32_t hash_start_src_offset = 0, len_to_hash = 0;
+ uint32_t cipher_start_src_offset = 0, len_to_cipher = 0;
+
+ uint64_t max_len, curr_size = 0;
+ size_t s;
+
+ /* Plain cipher */
+ if (cipher_para) {
+ iv_len = ldl_le_p(&cipher_para->iv_len);
+ src_len = ldl_le_p(&cipher_para->src_data_len);
+ dst_len = ldl_le_p(&cipher_para->dst_data_len);
+ } else if (alg_chain_para) { /* Algorithm chain */
+ iv_len = ldl_le_p(&alg_chain_para->iv_len);
+ src_len = ldl_le_p(&alg_chain_para->src_data_len);
+ dst_len = ldl_le_p(&alg_chain_para->dst_data_len);
+
+ aad_len = ldl_le_p(&alg_chain_para->aad_len);
+ hash_result_len = ldl_le_p(&alg_chain_para->hash_result_len);
+ hash_start_src_offset = ldl_le_p(
+ &alg_chain_para->hash_start_src_offset);
+ cipher_start_src_offset = ldl_le_p(
+ &alg_chain_para->cipher_start_src_offset);
+ len_to_cipher = ldl_le_p(&alg_chain_para->len_to_cipher);
+ len_to_hash = ldl_le_p(&alg_chain_para->len_to_hash);
+ } else {
+ return NULL;
+ }
+
+ max_len = (uint64_t)iv_len + aad_len + src_len + dst_len + hash_result_len;
+ if (unlikely(max_len > vcrypto->conf.max_size)) {
+ virtio_error(vdev, "virtio-crypto too big length");
+ return NULL;
+ }
+
+ op_info = g_malloc0(sizeof(CryptoDevBackendSymOpInfo) + max_len);
+ op_info->iv_len = iv_len;
+ op_info->src_len = src_len;
+ op_info->dst_len = dst_len;
+ op_info->aad_len = aad_len;
+ op_info->digest_result_len = hash_result_len;
+ op_info->hash_start_src_offset = hash_start_src_offset;
+ op_info->len_to_hash = len_to_hash;
+ op_info->cipher_start_src_offset = cipher_start_src_offset;
+ op_info->len_to_cipher = len_to_cipher;
+    /* Handle the initialization vector */
+ if (op_info->iv_len > 0) {
+ DPRINTF("iv_len=%" PRIu32 "\n", op_info->iv_len);
+ op_info->iv = op_info->data + curr_size;
+
+ s = iov_to_buf(iov, out_num, 0, op_info->iv, op_info->iv_len);
+ if (unlikely(s != op_info->iv_len)) {
+ virtio_error(vdev, "virtio-crypto iv incorrect");
+ goto err;
+ }
+ iov_discard_front(&iov, &out_num, op_info->iv_len);
+ curr_size += op_info->iv_len;
+ }
+
+    /* Handle additional authentication data, if present */
+ if (op_info->aad_len > 0) {
+ DPRINTF("aad_len=%" PRIu32 "\n", op_info->aad_len);
+ op_info->aad_data = op_info->data + curr_size;
+
+ s = iov_to_buf(iov, out_num, 0, op_info->aad_data, op_info->aad_len);
+ if (unlikely(s != op_info->aad_len)) {
+ virtio_error(vdev, "virtio-crypto additional auth data incorrect");
+ goto err;
+ }
+ iov_discard_front(&iov, &out_num, op_info->aad_len);
+
+ curr_size += op_info->aad_len;
+ }
+
+ /* Handle the source data */
+ if (op_info->src_len > 0) {
+ DPRINTF("src_len=%" PRIu32 "\n", op_info->src_len);
+ op_info->src = op_info->data + curr_size;
+
+ s = iov_to_buf(iov, out_num, 0, op_info->src, op_info->src_len);
+ if (unlikely(s != op_info->src_len)) {
+ virtio_error(vdev, "virtio-crypto source data incorrect");
+ goto err;
+ }
+ iov_discard_front(&iov, &out_num, op_info->src_len);
+
+ curr_size += op_info->src_len;
+ }
+
+ /* Handle the destination data */
+ op_info->dst = op_info->data + curr_size;
+ curr_size += op_info->dst_len;
+
+ DPRINTF("dst_len=%" PRIu32 "\n", op_info->dst_len);
+
+ /* Handle the hash digest result */
+ if (hash_result_len > 0) {
+ DPRINTF("hash_result_len=%" PRIu32 "\n", hash_result_len);
+ op_info->digest_result = op_info->data + curr_size;
+ }
+
+ return op_info;
+
+err:
+ g_free(op_info);
+ return NULL;
+}
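
/*
 * virtio_crypto_sym_op_helper() above packs every buffer into a single
 * allocation trailing op_info, in the order it fills them: iv, aad, src,
 * dst, digest_result. A hypothetical helper (not in the patch) computing
 * the offset of each region inside op_info->data:
 */
static size_t sym_region_offset(const CryptoDevBackendSymOpInfo *op,
                                int region)
{
    /* region: 0=iv, 1=aad, 2=src, 3=dst, 4=digest_result */
    size_t off = 0;

    if (region > 0) {
        off += op->iv_len;
    }
    if (region > 1) {
        off += op->aad_len;
    }
    if (region > 2) {
        off += op->src_len;
    }
    if (region > 3) {
        off += op->dst_len;
    }
    return off;
}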
+
+static int
+virtio_crypto_handle_sym_req(VirtIOCrypto *vcrypto,
+ struct virtio_crypto_sym_data_req *req,
+ CryptoDevBackendOpInfo *op_info,
+ struct iovec *iov, unsigned int out_num)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(vcrypto);
+ CryptoDevBackendSymOpInfo *sym_op_info;
+ uint32_t op_type;
+
+ op_type = ldl_le_p(&req->op_type);
+ if (op_type == VIRTIO_CRYPTO_SYM_OP_CIPHER) {
+ sym_op_info = virtio_crypto_sym_op_helper(vdev, &req->u.cipher.para,
+ NULL, iov, out_num);
+ if (!sym_op_info) {
+ return -EFAULT;
+ }
+ } else if (op_type == VIRTIO_CRYPTO_SYM_OP_ALGORITHM_CHAINING) {
+ sym_op_info = virtio_crypto_sym_op_helper(vdev, NULL,
+ &req->u.chain.para,
+ iov, out_num);
+ if (!sym_op_info) {
+ return -EFAULT;
+ }
+ } else {
+ /* VIRTIO_CRYPTO_SYM_OP_NONE */
+ error_report("virtio-crypto unsupported cipher type");
+ return -VIRTIO_CRYPTO_NOTSUPP;
+ }
+
+ sym_op_info->op_type = op_type;
+ op_info->u.sym_op_info = sym_op_info;
+
+ return 0;
+}
+
+static int
+virtio_crypto_handle_asym_req(VirtIOCrypto *vcrypto,
+ struct virtio_crypto_akcipher_data_req *req,
+ CryptoDevBackendOpInfo *op_info,
+ struct iovec *iov, unsigned int out_num)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(vcrypto);
+ CryptoDevBackendAsymOpInfo *asym_op_info;
+ uint32_t src_len;
+ uint32_t dst_len;
+ uint32_t len;
+ uint8_t *src = NULL;
+ uint8_t *dst = NULL;
+
+ asym_op_info = g_new0(CryptoDevBackendAsymOpInfo, 1);
+ src_len = ldl_le_p(&req->para.src_data_len);
+ dst_len = ldl_le_p(&req->para.dst_data_len);
+
+ if (src_len > 0) {
+ src = g_malloc0(src_len);
+ len = iov_to_buf(iov, out_num, 0, src, src_len);
+ if (unlikely(len != src_len)) {
+ virtio_error(vdev, "virtio-crypto asym src data incorrect"
+ "expected %u, actual %u", src_len, len);
+ goto err;
+ }
+
+ iov_discard_front(&iov, &out_num, src_len);
+ }
+
+ if (dst_len > 0) {
+ dst = g_malloc0(dst_len);
+
+ if (op_info->op_code == VIRTIO_CRYPTO_AKCIPHER_VERIFY) {
+ len = iov_to_buf(iov, out_num, 0, dst, dst_len);
+ if (unlikely(len != dst_len)) {
+ virtio_error(vdev, "virtio-crypto asym dst data incorrect"
+ "expected %u, actual %u", dst_len, len);
+ goto err;
+ }
+
+ iov_discard_front(&iov, &out_num, dst_len);
+ }
+ }
+
+ asym_op_info->src_len = src_len;
+ asym_op_info->dst_len = dst_len;
+ asym_op_info->src = src;
+ asym_op_info->dst = dst;
+ op_info->u.asym_op_info = asym_op_info;
+
+ return 0;
+
+ err:
+ g_free(asym_op_info);
+ g_free(src);
+ g_free(dst);
+
+ return -EFAULT;
+}
+
+static int
+virtio_crypto_handle_request(VirtIOCryptoReq *request)
+{
+ VirtIOCrypto *vcrypto = request->vcrypto;
+ VirtIODevice *vdev = VIRTIO_DEVICE(vcrypto);
+ VirtQueueElement *elem = &request->elem;
+ int queue_index = virtio_crypto_vq2q(virtio_get_queue_index(request->vq));
+ struct virtio_crypto_op_data_req req;
+ int ret;
+ g_autofree struct iovec *in_iov_copy = NULL;
+ g_autofree struct iovec *out_iov_copy = NULL;
+ struct iovec *in_iov;
+ struct iovec *out_iov;
+ unsigned in_num;
+ unsigned out_num;
+ uint32_t opcode;
+ CryptoDevBackendOpInfo *op_info = &request->op_info;
+
+ if (elem->out_num < 1 || elem->in_num < 1) {
+ virtio_error(vdev, "virtio-crypto dataq missing headers");
+ return -1;
+ }
+
+ out_num = elem->out_num;
+ out_iov_copy = g_memdup2(elem->out_sg, sizeof(out_iov[0]) * out_num);
+ out_iov = out_iov_copy;
+
+ in_num = elem->in_num;
+ in_iov_copy = g_memdup2(elem->in_sg, sizeof(in_iov[0]) * in_num);
+ in_iov = in_iov_copy;
+
+ if (unlikely(iov_to_buf(out_iov, out_num, 0, &req, sizeof(req))
+ != sizeof(req))) {
+ virtio_error(vdev, "virtio-crypto request outhdr too short");
+ return -1;
+ }
+ iov_discard_front(&out_iov, &out_num, sizeof(req));
+
+ if (in_iov[in_num - 1].iov_len <
+ sizeof(struct virtio_crypto_inhdr)) {
+ virtio_error(vdev, "virtio-crypto request inhdr too short");
+ return -1;
+ }
+ /* We always touch the last byte, so just see how big in_iov is. */
+ request->in_len = iov_size(in_iov, in_num);
+ request->in = (void *)in_iov[in_num - 1].iov_base
+ + in_iov[in_num - 1].iov_len
+ - sizeof(struct virtio_crypto_inhdr);
+ iov_discard_back(in_iov, &in_num, sizeof(struct virtio_crypto_inhdr));
+
+ /*
+     * The length of the operation result, including dest_data
+     * and digest_result if present.
+ */
+ request->in_num = in_num;
+ request->in_iov = in_iov;
+    /* ownership moves to the request; freed in virtio_crypto_free_request */
+ in_iov_copy = NULL;
+
+ opcode = ldl_le_p(&req.header.opcode);
+ op_info->session_id = ldq_le_p(&req.header.session_id);
+ op_info->op_code = opcode;
+
+ switch (opcode) {
+ case VIRTIO_CRYPTO_CIPHER_ENCRYPT:
+ case VIRTIO_CRYPTO_CIPHER_DECRYPT:
+ op_info->algtype = request->flags = CRYPTODEV_BACKEND_ALG_SYM;
+ ret = virtio_crypto_handle_sym_req(vcrypto,
+ &req.u.sym_req, op_info,
+ out_iov, out_num);
+ goto check_result;
+
+ case VIRTIO_CRYPTO_AKCIPHER_ENCRYPT:
+ case VIRTIO_CRYPTO_AKCIPHER_DECRYPT:
+ case VIRTIO_CRYPTO_AKCIPHER_SIGN:
+ case VIRTIO_CRYPTO_AKCIPHER_VERIFY:
+ op_info->algtype = request->flags = CRYPTODEV_BACKEND_ALG_ASYM;
+ ret = virtio_crypto_handle_asym_req(vcrypto,
+ &req.u.akcipher_req, op_info,
+ out_iov, out_num);
+
+check_result:
+        /* Serious error: the virtio crypto device needs to be reset */
+ if (ret == -EFAULT) {
+ return -1;
+ } else if (ret == -VIRTIO_CRYPTO_NOTSUPP) {
+ virtio_crypto_req_complete(request, -VIRTIO_CRYPTO_NOTSUPP);
+ } else {
+ ret = cryptodev_backend_crypto_operation(vcrypto->cryptodev,
+ request, queue_index,
+ virtio_crypto_req_complete,
+ request);
+ if (ret < 0) {
+ virtio_crypto_req_complete(request, ret);
+ }
+ }
+ break;
+
+ case VIRTIO_CRYPTO_HASH:
+ case VIRTIO_CRYPTO_MAC:
+ case VIRTIO_CRYPTO_AEAD_ENCRYPT:
+ case VIRTIO_CRYPTO_AEAD_DECRYPT:
+ default:
+ error_report("virtio-crypto unsupported dataq opcode: %u",
+ opcode);
+ virtio_crypto_req_complete(request, -VIRTIO_CRYPTO_NOTSUPP);
+ }
+
+ return 0;
+}
+
+static void virtio_crypto_handle_dataq(VirtIODevice *vdev, VirtQueue *vq)
+{
+ VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev);
+ VirtIOCryptoReq *req;
+
+ while ((req = virtio_crypto_get_request(vcrypto, vq))) {
+ if (virtio_crypto_handle_request(req) < 0) {
+ virtqueue_detach_element(req->vq, &req->elem, 0);
+ virtio_crypto_free_request(req);
+ break;
+ }
+ }
+}
+
+static void virtio_crypto_dataq_bh(void *opaque)
+{
+ VirtIOCryptoQueue *q = opaque;
+ VirtIOCrypto *vcrypto = q->vcrypto;
+ VirtIODevice *vdev = VIRTIO_DEVICE(vcrypto);
+
+    /* This happens when the device was stopped but the BH wasn't. */
+ if (!vdev->vm_running) {
+ return;
+ }
+
+    /* Just in case the driver is not ready any more */
+ if (unlikely(!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK))) {
+ return;
+ }
+
+ for (;;) {
+ virtio_crypto_handle_dataq(vdev, q->dataq);
+ virtio_queue_set_notification(q->dataq, 1);
+
+ /* Are we done or did the guest add more buffers? */
+ if (virtio_queue_empty(q->dataq)) {
+ break;
+ }
+
+ virtio_queue_set_notification(q->dataq, 0);
+ }
+}
+
+static void
+virtio_crypto_handle_dataq_bh(VirtIODevice *vdev, VirtQueue *vq)
+{
+ VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev);
+ VirtIOCryptoQueue *q =
+ &vcrypto->vqs[virtio_crypto_vq2q(virtio_get_queue_index(vq))];
+
+    /* This happens when the device was stopped but the VCPU wasn't. */
+ if (!vdev->vm_running) {
+ return;
+ }
+ virtio_queue_set_notification(vq, 0);
+ qemu_bh_schedule(q->dataq_bh);
+}
+
+static uint64_t virtio_crypto_get_features(VirtIODevice *vdev,
+ uint64_t features,
+ Error **errp)
+{
+ return features;
+}
+
+static void virtio_crypto_reset(VirtIODevice *vdev)
+{
+ VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev);
+ /* multiqueue is disabled by default */
+ vcrypto->curr_queues = 1;
+ if (!cryptodev_backend_is_ready(vcrypto->cryptodev)) {
+ vcrypto->status &= ~VIRTIO_CRYPTO_S_HW_READY;
+ } else {
+ vcrypto->status |= VIRTIO_CRYPTO_S_HW_READY;
+ }
+}
+
+static void virtio_crypto_init_config(VirtIODevice *vdev)
+{
+ VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev);
+
+ vcrypto->conf.crypto_services =
+ vcrypto->conf.cryptodev->conf.crypto_services;
+ vcrypto->conf.cipher_algo_l =
+ vcrypto->conf.cryptodev->conf.cipher_algo_l;
+ vcrypto->conf.cipher_algo_h =
+ vcrypto->conf.cryptodev->conf.cipher_algo_h;
+ vcrypto->conf.hash_algo = vcrypto->conf.cryptodev->conf.hash_algo;
+ vcrypto->conf.mac_algo_l = vcrypto->conf.cryptodev->conf.mac_algo_l;
+ vcrypto->conf.mac_algo_h = vcrypto->conf.cryptodev->conf.mac_algo_h;
+ vcrypto->conf.aead_algo = vcrypto->conf.cryptodev->conf.aead_algo;
+ vcrypto->conf.akcipher_algo = vcrypto->conf.cryptodev->conf.akcipher_algo;
+ vcrypto->conf.max_cipher_key_len =
+ vcrypto->conf.cryptodev->conf.max_cipher_key_len;
+ vcrypto->conf.max_auth_key_len =
+ vcrypto->conf.cryptodev->conf.max_auth_key_len;
+ vcrypto->conf.max_size = vcrypto->conf.cryptodev->conf.max_size;
+}
+
+static void virtio_crypto_device_realize(DeviceState *dev, Error **errp)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(dev);
+ int i;
+
+ vcrypto->cryptodev = vcrypto->conf.cryptodev;
+ if (vcrypto->cryptodev == NULL) {
+ error_setg(errp, "'cryptodev' parameter expects a valid object");
+ return;
+ } else if (cryptodev_backend_is_used(vcrypto->cryptodev)) {
+ error_setg(errp, "can't use already used cryptodev backend: %s",
+ object_get_canonical_path_component(OBJECT(vcrypto->conf.cryptodev)));
+ return;
+ }
+
+ vcrypto->max_queues = MAX(vcrypto->cryptodev->conf.peers.queues, 1);
+ if (vcrypto->max_queues + 1 > VIRTIO_QUEUE_MAX) {
+ error_setg(errp, "Invalid number of queues (= %" PRIu32 "), "
+ "must be a positive integer less than %d.",
+ vcrypto->max_queues, VIRTIO_QUEUE_MAX);
+ return;
+ }
+
+ virtio_init(vdev, VIRTIO_ID_CRYPTO, vcrypto->config_size);
+ vcrypto->curr_queues = 1;
+ vcrypto->vqs = g_new0(VirtIOCryptoQueue, vcrypto->max_queues);
+ for (i = 0; i < vcrypto->max_queues; i++) {
+ vcrypto->vqs[i].dataq =
+ virtio_add_queue(vdev, 1024, virtio_crypto_handle_dataq_bh);
+ vcrypto->vqs[i].dataq_bh =
+ qemu_bh_new(virtio_crypto_dataq_bh, &vcrypto->vqs[i]);
+ vcrypto->vqs[i].vcrypto = vcrypto;
+ }
+
+ vcrypto->ctrl_vq = virtio_add_queue(vdev, 1024, virtio_crypto_handle_ctrl);
+ if (!cryptodev_backend_is_ready(vcrypto->cryptodev)) {
+ vcrypto->status &= ~VIRTIO_CRYPTO_S_HW_READY;
+ } else {
+ vcrypto->status |= VIRTIO_CRYPTO_S_HW_READY;
+ }
+
+ virtio_crypto_init_config(vdev);
+ cryptodev_backend_set_used(vcrypto->cryptodev, true);
+}
+
+static void virtio_crypto_device_unrealize(DeviceState *dev)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(dev);
+ VirtIOCryptoQueue *q;
+ int i, max_queues;
+
+ max_queues = vcrypto->multiqueue ? vcrypto->max_queues : 1;
+ for (i = 0; i < max_queues; i++) {
+ virtio_delete_queue(vcrypto->vqs[i].dataq);
+ q = &vcrypto->vqs[i];
+ qemu_bh_delete(q->dataq_bh);
+ }
+
+ g_free(vcrypto->vqs);
+ virtio_delete_queue(vcrypto->ctrl_vq);
+
+ virtio_cleanup(vdev);
+ cryptodev_backend_set_used(vcrypto->cryptodev, false);
+}
+
+static const VMStateDescription vmstate_virtio_crypto = {
+ .name = "virtio-crypto",
+ .unmigratable = 1,
+ .minimum_version_id = VIRTIO_CRYPTO_VM_VERSION,
+ .version_id = VIRTIO_CRYPTO_VM_VERSION,
+ .fields = (VMStateField[]) {
+ VMSTATE_VIRTIO_DEVICE,
+ VMSTATE_END_OF_LIST()
+ },
+};
+
+static Property virtio_crypto_properties[] = {
+ DEFINE_PROP_LINK("cryptodev", VirtIOCrypto, conf.cryptodev,
+ TYPE_CRYPTODEV_BACKEND, CryptoDevBackend *),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void virtio_crypto_get_config(VirtIODevice *vdev, uint8_t *config)
+{
+ VirtIOCrypto *c = VIRTIO_CRYPTO(vdev);
+ struct virtio_crypto_config crypto_cfg = {};
+
+ /*
+     * The virtio-crypto device conforms to VIRTIO 1.0, whose config space
+     * is always little-endian, so we can use LE accessors directly.
+ */
+ stl_le_p(&crypto_cfg.status, c->status);
+ stl_le_p(&crypto_cfg.max_dataqueues, c->max_queues);
+ stl_le_p(&crypto_cfg.crypto_services, c->conf.crypto_services);
+ stl_le_p(&crypto_cfg.cipher_algo_l, c->conf.cipher_algo_l);
+ stl_le_p(&crypto_cfg.cipher_algo_h, c->conf.cipher_algo_h);
+ stl_le_p(&crypto_cfg.hash_algo, c->conf.hash_algo);
+ stl_le_p(&crypto_cfg.mac_algo_l, c->conf.mac_algo_l);
+ stl_le_p(&crypto_cfg.mac_algo_h, c->conf.mac_algo_h);
+ stl_le_p(&crypto_cfg.aead_algo, c->conf.aead_algo);
+ stl_le_p(&crypto_cfg.max_cipher_key_len, c->conf.max_cipher_key_len);
+ stl_le_p(&crypto_cfg.max_auth_key_len, c->conf.max_auth_key_len);
+ stq_le_p(&crypto_cfg.max_size, c->conf.max_size);
+ stl_le_p(&crypto_cfg.akcipher_algo, c->conf.akcipher_algo);
+
+ memcpy(config, &crypto_cfg, c->config_size);
+}
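
/*
 * The stl_le_p()/stq_le_p() helpers used above store values little-endian
 * regardless of host byte order, as the VIRTIO 1.0 config space requires.
 * A standalone equivalent of the 32-bit store (illustrative only):
 */
static inline void store_le32(uint8_t *p, uint32_t v)
{
    p[0] = (uint8_t)v;
    p[1] = (uint8_t)(v >> 8);
    p[2] = (uint8_t)(v >> 16);
    p[3] = (uint8_t)(v >> 24);
}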
+
+static bool virtio_crypto_started(VirtIOCrypto *c, uint8_t status)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(c);
+ return (status & VIRTIO_CONFIG_S_DRIVER_OK) &&
+ (c->status & VIRTIO_CRYPTO_S_HW_READY) && vdev->vm_running;
+}
+
+static void virtio_crypto_vhost_status(VirtIOCrypto *c, uint8_t status)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(c);
+ int queues = c->multiqueue ? c->max_queues : 1;
+ CryptoDevBackend *b = c->cryptodev;
+ CryptoDevBackendClient *cc = b->conf.peers.ccs[0];
+
+ if (!cryptodev_get_vhost(cc, b, 0)) {
+ return;
+ }
+
+ if ((virtio_crypto_started(c, status)) == !!c->vhost_started) {
+ return;
+ }
+
+ if (!c->vhost_started) {
+ int r;
+
+ c->vhost_started = 1;
+ r = cryptodev_vhost_start(vdev, queues);
+ if (r < 0) {
+ error_report("unable to start vhost crypto: %d: "
+ "falling back on userspace virtio", -r);
+ c->vhost_started = 0;
+ }
+ } else {
+ cryptodev_vhost_stop(vdev, queues);
+ c->vhost_started = 0;
+ }
+}
+
+static void virtio_crypto_set_status(VirtIODevice *vdev, uint8_t status)
+{
+ VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev);
+
+ virtio_crypto_vhost_status(vcrypto, status);
+}
+
+static void virtio_crypto_guest_notifier_mask(VirtIODevice *vdev, int idx,
+ bool mask)
+{
+ VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev);
+ int queue = virtio_crypto_vq2q(idx);
+
+ assert(vcrypto->vhost_started);
+
+ cryptodev_vhost_virtqueue_mask(vdev, queue, idx, mask);
+}
+
+static bool virtio_crypto_guest_notifier_pending(VirtIODevice *vdev, int idx)
+{
+ VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev);
+ int queue = virtio_crypto_vq2q(idx);
+
+ assert(vcrypto->vhost_started);
+
+ return cryptodev_vhost_virtqueue_pending(vdev, queue, idx);
+}
+
+static struct vhost_dev *virtio_crypto_get_vhost(VirtIODevice *vdev)
+{
+ VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev);
+ CryptoDevBackend *b = vcrypto->cryptodev;
+ CryptoDevBackendClient *cc = b->conf.peers.ccs[0];
+ CryptoDevBackendVhost *vhost_crypto = cryptodev_get_vhost(cc, b, 0);
+ return &vhost_crypto->dev;
+}
+
+static void virtio_crypto_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+
+ device_class_set_props(dc, virtio_crypto_properties);
+ dc->vmsd = &vmstate_virtio_crypto;
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+ vdc->realize = virtio_crypto_device_realize;
+ vdc->unrealize = virtio_crypto_device_unrealize;
+ vdc->get_config = virtio_crypto_get_config;
+ vdc->get_features = virtio_crypto_get_features;
+ vdc->reset = virtio_crypto_reset;
+ vdc->set_status = virtio_crypto_set_status;
+ vdc->guest_notifier_mask = virtio_crypto_guest_notifier_mask;
+ vdc->guest_notifier_pending = virtio_crypto_guest_notifier_pending;
+ vdc->get_vhost = virtio_crypto_get_vhost;
+}
+
+static void virtio_crypto_instance_init(Object *obj)
+{
+ VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(obj);
+
+ /*
+ * The default config_size is sizeof(struct virtio_crypto_config).
+     * Can be overridden with virtio_crypto_set_config_size.
+ */
+ vcrypto->config_size = sizeof(struct virtio_crypto_config);
+}
+
+static const TypeInfo virtio_crypto_info = {
+ .name = TYPE_VIRTIO_CRYPTO,
+ .parent = TYPE_VIRTIO_DEVICE,
+ .instance_size = sizeof(VirtIOCrypto),
+ .instance_init = virtio_crypto_instance_init,
+ .class_init = virtio_crypto_class_init,
+};
+
+static void virtio_register_types(void)
+{
+ type_register_static(&virtio_crypto_info);
+}
+
+type_init(virtio_register_types)
diff --git a/hw/virtio/virtio-input-host-pci.c b/hw/virtio/virtio-input-host-pci.c
new file mode 100644
index 00000000..cf8a9cf9
--- /dev/null
+++ b/hw/virtio/virtio-input-host-pci.c
@@ -0,0 +1,47 @@
+/*
+ * Virtio input host PCI Bindings
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version. See the COPYING file in the
+ * top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/virtio/virtio-pci.h"
+#include "hw/virtio/virtio-input.h"
+#include "qemu/module.h"
+#include "qom/object.h"
+
+typedef struct VirtIOInputHostPCI VirtIOInputHostPCI;
+
+#define TYPE_VIRTIO_INPUT_HOST_PCI "virtio-input-host-pci"
+DECLARE_INSTANCE_CHECKER(VirtIOInputHostPCI, VIRTIO_INPUT_HOST_PCI,
+ TYPE_VIRTIO_INPUT_HOST_PCI)
+
+struct VirtIOInputHostPCI {
+ VirtIOPCIProxy parent_obj;
+ VirtIOInputHost vdev;
+};
+
+static void virtio_host_initfn(Object *obj)
+{
+ VirtIOInputHostPCI *dev = VIRTIO_INPUT_HOST_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VIRTIO_INPUT_HOST);
+}
+
+static const VirtioPCIDeviceTypeInfo virtio_input_host_pci_info = {
+ .generic_name = TYPE_VIRTIO_INPUT_HOST_PCI,
+ .parent = TYPE_VIRTIO_INPUT_PCI,
+ .instance_size = sizeof(VirtIOInputHostPCI),
+ .instance_init = virtio_host_initfn,
+};
+
+static void virtio_input_host_pci_register(void)
+{
+ virtio_pci_types_register(&virtio_input_host_pci_info);
+}
+
+type_init(virtio_input_host_pci_register)
diff --git a/hw/virtio/virtio-input-pci.c b/hw/virtio/virtio-input-pci.c
new file mode 100644
index 00000000..a9d09923
--- /dev/null
+++ b/hw/virtio/virtio-input-pci.c
@@ -0,0 +1,155 @@
+/*
+ * Virtio input PCI Bindings
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version. See the COPYING file in the
+ * top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/virtio/virtio-pci.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/virtio-input.h"
+#include "qemu/module.h"
+#include "qom/object.h"
+
+
+/*
+ * virtio-input-pci: This extends VirtioPCIProxy.
+ */
+OBJECT_DECLARE_SIMPLE_TYPE(VirtIOInputPCI, VIRTIO_INPUT_PCI)
+
+struct VirtIOInputPCI {
+ VirtIOPCIProxy parent_obj;
+ VirtIOInput vdev;
+};
+
+#define TYPE_VIRTIO_INPUT_HID_PCI "virtio-input-hid-pci"
+#define TYPE_VIRTIO_KEYBOARD_PCI "virtio-keyboard-pci"
+#define TYPE_VIRTIO_MOUSE_PCI "virtio-mouse-pci"
+#define TYPE_VIRTIO_TABLET_PCI "virtio-tablet-pci"
+OBJECT_DECLARE_SIMPLE_TYPE(VirtIOInputHIDPCI, VIRTIO_INPUT_HID_PCI)
+
+struct VirtIOInputHIDPCI {
+ VirtIOPCIProxy parent_obj;
+ VirtIOInputHID vdev;
+};
+
+static Property virtio_input_pci_properties[] = {
+ DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 2),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void virtio_input_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+ VirtIOInputPCI *vinput = VIRTIO_INPUT_PCI(vpci_dev);
+ DeviceState *vdev = DEVICE(&vinput->vdev);
+
+ virtio_pci_force_virtio_1(vpci_dev);
+ qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
+}
+
+static void virtio_input_pci_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+ PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+
+ device_class_set_props(dc, virtio_input_pci_properties);
+ k->realize = virtio_input_pci_realize;
+ set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
+
+ pcidev_k->class_id = PCI_CLASS_INPUT_OTHER;
+}
+
+static void virtio_input_hid_kbd_pci_class_init(ObjectClass *klass, void *data)
+{
+ PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+
+ pcidev_k->class_id = PCI_CLASS_INPUT_KEYBOARD;
+}
+
+static void virtio_input_hid_mouse_pci_class_init(ObjectClass *klass,
+ void *data)
+{
+ PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+
+ pcidev_k->class_id = PCI_CLASS_INPUT_MOUSE;
+}
+
+static void virtio_keyboard_initfn(Object *obj)
+{
+ VirtIOInputHIDPCI *dev = VIRTIO_INPUT_HID_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VIRTIO_KEYBOARD);
+}
+
+static void virtio_mouse_initfn(Object *obj)
+{
+ VirtIOInputHIDPCI *dev = VIRTIO_INPUT_HID_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VIRTIO_MOUSE);
+}
+
+static void virtio_tablet_initfn(Object *obj)
+{
+ VirtIOInputHIDPCI *dev = VIRTIO_INPUT_HID_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VIRTIO_TABLET);
+}
+
+static const TypeInfo virtio_input_pci_info = {
+ .name = TYPE_VIRTIO_INPUT_PCI,
+ .parent = TYPE_VIRTIO_PCI,
+ .instance_size = sizeof(VirtIOInputPCI),
+ .class_init = virtio_input_pci_class_init,
+ .abstract = true,
+};
+
+static const TypeInfo virtio_input_hid_pci_info = {
+ .name = TYPE_VIRTIO_INPUT_HID_PCI,
+ .parent = TYPE_VIRTIO_INPUT_PCI,
+ .instance_size = sizeof(VirtIOInputHIDPCI),
+ .abstract = true,
+};
+
+static const VirtioPCIDeviceTypeInfo virtio_keyboard_pci_info = {
+ .generic_name = TYPE_VIRTIO_KEYBOARD_PCI,
+ .parent = TYPE_VIRTIO_INPUT_HID_PCI,
+ .class_init = virtio_input_hid_kbd_pci_class_init,
+ .instance_size = sizeof(VirtIOInputHIDPCI),
+ .instance_init = virtio_keyboard_initfn,
+};
+
+static const VirtioPCIDeviceTypeInfo virtio_mouse_pci_info = {
+ .generic_name = TYPE_VIRTIO_MOUSE_PCI,
+ .parent = TYPE_VIRTIO_INPUT_HID_PCI,
+ .class_init = virtio_input_hid_mouse_pci_class_init,
+ .instance_size = sizeof(VirtIOInputHIDPCI),
+ .instance_init = virtio_mouse_initfn,
+};
+
+static const VirtioPCIDeviceTypeInfo virtio_tablet_pci_info = {
+ .generic_name = TYPE_VIRTIO_TABLET_PCI,
+ .parent = TYPE_VIRTIO_INPUT_HID_PCI,
+ .instance_size = sizeof(VirtIOInputHIDPCI),
+ .instance_init = virtio_tablet_initfn,
+};
+
+static void virtio_pci_input_register(void)
+{
+ /* Base types: */
+ type_register_static(&virtio_input_pci_info);
+ type_register_static(&virtio_input_hid_pci_info);
+
+ /* Implementations: */
+ virtio_pci_types_register(&virtio_keyboard_pci_info);
+ virtio_pci_types_register(&virtio_mouse_pci_info);
+ virtio_pci_types_register(&virtio_tablet_pci_info);
+}
+
+type_init(virtio_pci_input_register)
diff --git a/hw/virtio/virtio-iommu-pci.c b/hw/virtio/virtio-iommu-pci.c
new file mode 100644
index 00000000..7ef2f9dc
--- /dev/null
+++ b/hw/virtio/virtio-iommu-pci.c
@@ -0,0 +1,112 @@
+/*
+ * Virtio IOMMU PCI Bindings
+ *
+ * Copyright (c) 2019 Red Hat, Inc.
+ * Written by Eric Auger
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 or
+ * (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/virtio/virtio-pci.h"
+#include "hw/virtio/virtio-iommu.h"
+#include "hw/qdev-properties.h"
+#include "hw/qdev-properties-system.h"
+#include "qapi/error.h"
+#include "hw/boards.h"
+#include "hw/pci/pci_bus.h"
+#include "qom/object.h"
+
+typedef struct VirtIOIOMMUPCI VirtIOIOMMUPCI;
+
+/*
+ * virtio-iommu-pci: This extends VirtioPCIProxy.
+ *
+ */
+DECLARE_INSTANCE_CHECKER(VirtIOIOMMUPCI, VIRTIO_IOMMU_PCI,
+ TYPE_VIRTIO_IOMMU_PCI)
+
+struct VirtIOIOMMUPCI {
+ VirtIOPCIProxy parent_obj;
+ VirtIOIOMMU vdev;
+};
+
+static Property virtio_iommu_pci_properties[] = {
+ DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0),
+ DEFINE_PROP_ARRAY("reserved-regions", VirtIOIOMMUPCI,
+ vdev.nb_reserved_regions, vdev.reserved_regions,
+ qdev_prop_reserved_region, ReservedRegion),
+ DEFINE_PROP_END_OF_LIST(),
+};
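+
+/*
+ * For example (array-property syntax can vary between QEMU versions), an
+ * MSI doorbell window could be declared as a reserved region with:
+ *   -device virtio-iommu-pci,len-reserved-regions=1,\
+ *           reserved-regions[0]=0xfee00000:0xfeefffff:1
+ * where each entry is <low>:<high>:<type> and type 1 is
+ * VIRTIO_IOMMU_RESV_MEM_T_MSI (0 is VIRTIO_IOMMU_RESV_MEM_T_RESERVED).
+ */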
+
+static void virtio_iommu_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+ VirtIOIOMMUPCI *dev = VIRTIO_IOMMU_PCI(vpci_dev);
+ PCIBus *pbus = pci_get_bus(&vpci_dev->pci_dev);
+ DeviceState *vdev = DEVICE(&dev->vdev);
+ VirtIOIOMMU *s = VIRTIO_IOMMU(vdev);
+
+ if (!qdev_get_machine_hotplug_handler(DEVICE(vpci_dev))) {
+ error_setg(errp, "Check your machine implements a hotplug handler "
+ "for the virtio-iommu-pci device");
+ return;
+ }
+ for (int i = 0; i < s->nb_reserved_regions; i++) {
+ if (s->reserved_regions[i].type != VIRTIO_IOMMU_RESV_MEM_T_RESERVED &&
+ s->reserved_regions[i].type != VIRTIO_IOMMU_RESV_MEM_T_MSI) {
+ error_setg(errp, "reserved region %d has an invalid type", i);
+ error_append_hint(errp, "Valid values are 0 and 1\n");
+ return;
+ }
+ }
+ if (!pci_bus_is_root(pbus)) {
+ error_setg(errp, "virtio-iommu-pci must be plugged on the root bus");
+ return;
+ }
+
+ object_property_set_link(OBJECT(dev), "primary-bus",
+ OBJECT(pbus), &error_abort);
+
+ virtio_pci_force_virtio_1(vpci_dev);
+ qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
+}
+
+static void virtio_iommu_pci_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+ PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+ k->realize = virtio_iommu_pci_realize;
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+ device_class_set_props(dc, virtio_iommu_pci_properties);
+ pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
+ pcidev_k->class_id = PCI_CLASS_OTHERS;
+ dc->hotpluggable = false;
+}
+
+static void virtio_iommu_pci_instance_init(Object *obj)
+{
+ VirtIOIOMMUPCI *dev = VIRTIO_IOMMU_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VIRTIO_IOMMU);
+}
+
+static const VirtioPCIDeviceTypeInfo virtio_iommu_pci_info = {
+ .generic_name = TYPE_VIRTIO_IOMMU_PCI,
+ .instance_size = sizeof(VirtIOIOMMUPCI),
+ .instance_init = virtio_iommu_pci_instance_init,
+ .class_init = virtio_iommu_pci_class_init,
+};
+
+static void virtio_iommu_pci_register(void)
+{
+ virtio_pci_types_register(&virtio_iommu_pci_info);
+}
+
+type_init(virtio_iommu_pci_register)
+
+
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
new file mode 100644
index 00000000..62e07ec2
--- /dev/null
+++ b/hw/virtio/virtio-iommu.c
@@ -0,0 +1,1425 @@
+/*
+ * virtio-iommu device
+ *
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2 or later, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "qemu/iov.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/virtio.h"
+#include "sysemu/kvm.h"
+#include "sysemu/reset.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+
+#include "standard-headers/linux/virtio_ids.h"
+
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/virtio-access.h"
+#include "hw/virtio/virtio-iommu.h"
+#include "hw/pci/pci_bus.h"
+#include "hw/pci/pci.h"
+
+/* Default virtqueue size and probe buffer size */
+#define VIOMMU_DEFAULT_QUEUE_SIZE 256
+#define VIOMMU_PROBE_SIZE 512
+
+typedef struct VirtIOIOMMUDomain {
+ uint32_t id;
+ bool bypass;
+ GTree *mappings;
+ QLIST_HEAD(, VirtIOIOMMUEndpoint) endpoint_list;
+} VirtIOIOMMUDomain;
+
+typedef struct VirtIOIOMMUEndpoint {
+ uint32_t id;
+ VirtIOIOMMUDomain *domain;
+ IOMMUMemoryRegion *iommu_mr;
+ QLIST_ENTRY(VirtIOIOMMUEndpoint) next;
+} VirtIOIOMMUEndpoint;
+
+typedef struct VirtIOIOMMUInterval {
+ uint64_t low;
+ uint64_t high;
+} VirtIOIOMMUInterval;
+
+typedef struct VirtIOIOMMUMapping {
+ uint64_t phys_addr;
+ uint32_t flags;
+} VirtIOIOMMUMapping;
+
+static inline uint16_t virtio_iommu_get_bdf(IOMMUDevice *dev)
+{
+ return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn);
+}
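+
+/*
+ * For example, a device at 00:02.0 on bus 0 has devfn = (2 << 3) | 0 = 0x10,
+ * so its stream ID (BDF) is (0 << 8) | 0x10 = 0x0010; the same slot on bus 1
+ * yields 0x0110. This sid is the key into the s->endpoints tree.
+ */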
+
+static bool virtio_iommu_device_bypassed(IOMMUDevice *sdev)
+{
+ uint32_t sid;
+ bool bypassed;
+ VirtIOIOMMU *s = sdev->viommu;
+ VirtIOIOMMUEndpoint *ep;
+
+ sid = virtio_iommu_get_bdf(sdev);
+
+ qemu_rec_mutex_lock(&s->mutex);
+ /* need to check bypass before system reset */
+ if (!s->endpoints) {
+ bypassed = s->config.bypass;
+ goto unlock;
+ }
+
+ ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid));
+ if (!ep || !ep->domain) {
+ bypassed = s->config.bypass;
+ } else {
+ bypassed = ep->domain->bypass;
+ }
+
+unlock:
+ qemu_rec_mutex_unlock(&s->mutex);
+ return bypassed;
+}
+
+/* Return whether the device is using IOMMU translation. */
+static bool virtio_iommu_switch_address_space(IOMMUDevice *sdev)
+{
+ bool use_remapping;
+
+ assert(sdev);
+
+ use_remapping = !virtio_iommu_device_bypassed(sdev);
+
+ trace_virtio_iommu_switch_address_space(pci_bus_num(sdev->bus),
+ PCI_SLOT(sdev->devfn),
+ PCI_FUNC(sdev->devfn),
+ use_remapping);
+
+ /* Turn off first then on the other */
+ if (use_remapping) {
+ memory_region_set_enabled(&sdev->bypass_mr, false);
+ memory_region_set_enabled(MEMORY_REGION(&sdev->iommu_mr), true);
+ } else {
+ memory_region_set_enabled(MEMORY_REGION(&sdev->iommu_mr), false);
+ memory_region_set_enabled(&sdev->bypass_mr, true);
+ }
+
+ return use_remapping;
+}
+
+static void virtio_iommu_switch_address_space_all(VirtIOIOMMU *s)
+{
+ GHashTableIter iter;
+ IOMMUPciBus *iommu_pci_bus;
+ int i;
+
+ g_hash_table_iter_init(&iter, s->as_by_busptr);
+ while (g_hash_table_iter_next(&iter, NULL, (void **)&iommu_pci_bus)) {
+ for (i = 0; i < PCI_DEVFN_MAX; i++) {
+ if (!iommu_pci_bus->pbdev[i]) {
+ continue;
+ }
+ virtio_iommu_switch_address_space(iommu_pci_bus->pbdev[i]);
+ }
+ }
+}
+
+/**
+ * The bus number is used for lookup when SID based operations occur.
+ * In that case we lazily populate the IOMMUPciBus array from the bus hash
+ * table. At the time the IOMMUPciBus is created (iommu_find_add_as), the bus
+ * numbers may not always be initialized yet.
+ */
+static IOMMUPciBus *iommu_find_iommu_pcibus(VirtIOIOMMU *s, uint8_t bus_num)
+{
+ IOMMUPciBus *iommu_pci_bus = s->iommu_pcibus_by_bus_num[bus_num];
+
+ if (!iommu_pci_bus) {
+ GHashTableIter iter;
+
+ g_hash_table_iter_init(&iter, s->as_by_busptr);
+ while (g_hash_table_iter_next(&iter, NULL, (void **)&iommu_pci_bus)) {
+ if (pci_bus_num(iommu_pci_bus->bus) == bus_num) {
+ s->iommu_pcibus_by_bus_num[bus_num] = iommu_pci_bus;
+ return iommu_pci_bus;
+ }
+ }
+ return NULL;
+ }
+ return iommu_pci_bus;
+}
+
+static IOMMUMemoryRegion *virtio_iommu_mr(VirtIOIOMMU *s, uint32_t sid)
+{
+ uint8_t bus_n, devfn;
+ IOMMUPciBus *iommu_pci_bus;
+ IOMMUDevice *dev;
+
+ bus_n = PCI_BUS_NUM(sid);
+ iommu_pci_bus = iommu_find_iommu_pcibus(s, bus_n);
+ if (iommu_pci_bus) {
+ devfn = sid & (PCI_DEVFN_MAX - 1);
+ dev = iommu_pci_bus->pbdev[devfn];
+ if (dev) {
+ return &dev->iommu_mr;
+ }
+ }
+ return NULL;
+}
+
+static gint interval_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
+{
+ VirtIOIOMMUInterval *inta = (VirtIOIOMMUInterval *)a;
+ VirtIOIOMMUInterval *intb = (VirtIOIOMMUInterval *)b;
+
+ if (inta->high < intb->low) {
+ return -1;
+ } else if (intb->high < inta->low) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
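+
+/*
+ * Any two overlapping intervals compare as equal, so a g_tree_lookup() with
+ * a narrow key finds the mapping containing it: the key
+ * { .low = 0x1234, .high = 0x1235 } matches a stored interval
+ * [0x1000, 0x1fff]. virtio_iommu_translate() below relies on this to
+ * resolve a single address against the mapping tree.
+ */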
+
+static void virtio_iommu_notify_map_unmap(IOMMUMemoryRegion *mr,
+ IOMMUTLBEvent *event,
+ hwaddr virt_start, hwaddr virt_end)
+{
+ uint64_t delta = virt_end - virt_start;
+
+ event->entry.iova = virt_start;
+ event->entry.addr_mask = delta;
+
+ if (delta == UINT64_MAX) {
+ memory_region_notify_iommu(mr, 0, *event);
+ }
+
+ while (virt_start != virt_end + 1) {
+ uint64_t mask = dma_aligned_pow2_mask(virt_start, virt_end, 64);
+
+ event->entry.addr_mask = mask;
+ event->entry.iova = virt_start;
+ memory_region_notify_iommu(mr, 0, *event);
+ virt_start += mask + 1;
+ if (event->entry.perm != IOMMU_NONE) {
+ event->entry.translated_addr += mask + 1;
+ }
+ }
+}
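+
+/*
+ * For example, notifying [0x1000, 0x2fff] (8 KiB starting at a 4 KiB-aligned
+ * address) is split into two naturally aligned chunks:
+ * dma_aligned_pow2_mask() yields 0xfff for [0x1000, 0x1fff] (0x1000 is only
+ * 4 KiB-aligned) and 0xfff again for [0x2000, 0x2fff]; for MAP events,
+ * translated_addr advances by mask + 1 between chunks.
+ */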
+
+static void virtio_iommu_notify_map(IOMMUMemoryRegion *mr, hwaddr virt_start,
+ hwaddr virt_end, hwaddr paddr,
+ uint32_t flags)
+{
+ IOMMUTLBEvent event;
+ IOMMUAccessFlags perm = IOMMU_ACCESS_FLAG(flags & VIRTIO_IOMMU_MAP_F_READ,
+ flags & VIRTIO_IOMMU_MAP_F_WRITE);
+
+ if (!(mr->iommu_notify_flags & IOMMU_NOTIFIER_MAP) ||
+ (flags & VIRTIO_IOMMU_MAP_F_MMIO) || !perm) {
+ return;
+ }
+
+ trace_virtio_iommu_notify_map(mr->parent_obj.name, virt_start, virt_end,
+ paddr, perm);
+
+ event.type = IOMMU_NOTIFIER_MAP;
+ event.entry.target_as = &address_space_memory;
+ event.entry.perm = perm;
+ event.entry.translated_addr = paddr;
+
+ virtio_iommu_notify_map_unmap(mr, &event, virt_start, virt_end);
+}
+
+static void virtio_iommu_notify_unmap(IOMMUMemoryRegion *mr, hwaddr virt_start,
+ hwaddr virt_end)
+{
+ IOMMUTLBEvent event;
+
+ if (!(mr->iommu_notify_flags & IOMMU_NOTIFIER_UNMAP)) {
+ return;
+ }
+
+ trace_virtio_iommu_notify_unmap(mr->parent_obj.name, virt_start, virt_end);
+
+ event.type = IOMMU_NOTIFIER_UNMAP;
+ event.entry.target_as = &address_space_memory;
+ event.entry.perm = IOMMU_NONE;
+ event.entry.translated_addr = 0;
+
+ virtio_iommu_notify_map_unmap(mr, &event, virt_start, virt_end);
+}
+
+static gboolean virtio_iommu_notify_unmap_cb(gpointer key, gpointer value,
+ gpointer data)
+{
+ VirtIOIOMMUInterval *interval = (VirtIOIOMMUInterval *) key;
+ IOMMUMemoryRegion *mr = (IOMMUMemoryRegion *) data;
+
+ virtio_iommu_notify_unmap(mr, interval->low, interval->high);
+
+ return false;
+}
+
+static gboolean virtio_iommu_notify_map_cb(gpointer key, gpointer value,
+ gpointer data)
+{
+ VirtIOIOMMUMapping *mapping = (VirtIOIOMMUMapping *) value;
+ VirtIOIOMMUInterval *interval = (VirtIOIOMMUInterval *) key;
+ IOMMUMemoryRegion *mr = (IOMMUMemoryRegion *) data;
+
+ virtio_iommu_notify_map(mr, interval->low, interval->high,
+ mapping->phys_addr, mapping->flags);
+
+ return false;
+}
+
+static void virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep)
+{
+ VirtIOIOMMUDomain *domain = ep->domain;
+ IOMMUDevice *sdev = container_of(ep->iommu_mr, IOMMUDevice, iommu_mr);
+
+ if (!ep->domain) {
+ return;
+ }
+ g_tree_foreach(domain->mappings, virtio_iommu_notify_unmap_cb,
+ ep->iommu_mr);
+ QLIST_REMOVE(ep, next);
+ ep->domain = NULL;
+ virtio_iommu_switch_address_space(sdev);
+}
+
+static VirtIOIOMMUEndpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s,
+ uint32_t ep_id)
+{
+ VirtIOIOMMUEndpoint *ep;
+ IOMMUMemoryRegion *mr;
+
+ ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(ep_id));
+ if (ep) {
+ return ep;
+ }
+ mr = virtio_iommu_mr(s, ep_id);
+ if (!mr) {
+ return NULL;
+ }
+ ep = g_malloc0(sizeof(*ep));
+ ep->id = ep_id;
+ ep->iommu_mr = mr;
+ trace_virtio_iommu_get_endpoint(ep_id);
+ g_tree_insert(s->endpoints, GUINT_TO_POINTER(ep_id), ep);
+ return ep;
+}
+
+static void virtio_iommu_put_endpoint(gpointer data)
+{
+ VirtIOIOMMUEndpoint *ep = (VirtIOIOMMUEndpoint *)data;
+
+ if (ep->domain) {
+ virtio_iommu_detach_endpoint_from_domain(ep);
+ }
+
+ trace_virtio_iommu_put_endpoint(ep->id);
+ g_free(ep);
+}
+
+static VirtIOIOMMUDomain *virtio_iommu_get_domain(VirtIOIOMMU *s,
+ uint32_t domain_id,
+ bool bypass)
+{
+ VirtIOIOMMUDomain *domain;
+
+ domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id));
+ if (domain) {
+ if (domain->bypass != bypass) {
+ return NULL;
+ }
+ return domain;
+ }
+ domain = g_malloc0(sizeof(*domain));
+ domain->id = domain_id;
+ domain->mappings = g_tree_new_full((GCompareDataFunc)interval_cmp,
+ NULL, (GDestroyNotify)g_free,
+ (GDestroyNotify)g_free);
+ domain->bypass = bypass;
+ g_tree_insert(s->domains, GUINT_TO_POINTER(domain_id), domain);
+ QLIST_INIT(&domain->endpoint_list);
+ trace_virtio_iommu_get_domain(domain_id);
+ return domain;
+}
+
+static void virtio_iommu_put_domain(gpointer data)
+{
+ VirtIOIOMMUDomain *domain = (VirtIOIOMMUDomain *)data;
+ VirtIOIOMMUEndpoint *iter, *tmp;
+
+ QLIST_FOREACH_SAFE(iter, &domain->endpoint_list, next, tmp) {
+ virtio_iommu_detach_endpoint_from_domain(iter);
+ }
+ g_tree_destroy(domain->mappings);
+ trace_virtio_iommu_put_domain(domain->id);
+ g_free(domain);
+}
+
+static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque,
+ int devfn)
+{
+ VirtIOIOMMU *s = opaque;
+ IOMMUPciBus *sbus = g_hash_table_lookup(s->as_by_busptr, bus);
+ static uint32_t mr_index;
+ IOMMUDevice *sdev;
+
+ if (!sbus) {
+ sbus = g_malloc0(sizeof(IOMMUPciBus) +
+ sizeof(IOMMUDevice *) * PCI_DEVFN_MAX);
+ sbus->bus = bus;
+ g_hash_table_insert(s->as_by_busptr, bus, sbus);
+ }
+
+ sdev = sbus->pbdev[devfn];
+ if (!sdev) {
+ char *name = g_strdup_printf("%s-%d-%d",
+ TYPE_VIRTIO_IOMMU_MEMORY_REGION,
+ mr_index++, devfn);
+ sdev = sbus->pbdev[devfn] = g_new0(IOMMUDevice, 1);
+
+ sdev->viommu = s;
+ sdev->bus = bus;
+ sdev->devfn = devfn;
+
+ trace_virtio_iommu_init_iommu_mr(name);
+
+ memory_region_init(&sdev->root, OBJECT(s), name, UINT64_MAX);
+ address_space_init(&sdev->as, &sdev->root, TYPE_VIRTIO_IOMMU);
+
+ /*
+ * Build the IOMMU-disabled container with aliases to the
+ * shared MRs. Aliasing to a shared memory region helps the
+ * memory API detect identical FlatViews, so devices can share
+ * the same FlatView when in bypass mode (either because the
+ * virtio-iommu driver is not configured or with "iommu=pt").
+ * This greatly reduces the total number of FlatViews in the
+ * system, so the VM runs faster.
+ */
+ memory_region_init_alias(&sdev->bypass_mr, OBJECT(s),
+ "system", get_system_memory(), 0,
+ memory_region_size(get_system_memory()));
+
+ memory_region_init_iommu(&sdev->iommu_mr, sizeof(sdev->iommu_mr),
+ TYPE_VIRTIO_IOMMU_MEMORY_REGION,
+ OBJECT(s), name,
+ UINT64_MAX);
+
+ /*
+ * Hook both containers under the root container; we switch
+ * between the IOMMU and bypass MRs by enabling/disabling the
+ * corresponding sub-containers.
+ */
+ memory_region_add_subregion_overlap(&sdev->root, 0,
+ MEMORY_REGION(&sdev->iommu_mr),
+ 0);
+ memory_region_add_subregion_overlap(&sdev->root, 0,
+ &sdev->bypass_mr, 0);
+
+ virtio_iommu_switch_address_space(sdev);
+ g_free(name);
+ }
+ return &sdev->as;
+}
+
+static int virtio_iommu_attach(VirtIOIOMMU *s,
+ struct virtio_iommu_req_attach *req)
+{
+ uint32_t domain_id = le32_to_cpu(req->domain);
+ uint32_t ep_id = le32_to_cpu(req->endpoint);
+ uint32_t flags = le32_to_cpu(req->flags);
+ VirtIOIOMMUDomain *domain;
+ VirtIOIOMMUEndpoint *ep;
+ IOMMUDevice *sdev;
+
+ trace_virtio_iommu_attach(domain_id, ep_id);
+
+ if (flags & ~VIRTIO_IOMMU_ATTACH_F_BYPASS) {
+ return VIRTIO_IOMMU_S_INVAL;
+ }
+
+ ep = virtio_iommu_get_endpoint(s, ep_id);
+ if (!ep) {
+ return VIRTIO_IOMMU_S_NOENT;
+ }
+
+ if (ep->domain) {
+ VirtIOIOMMUDomain *previous_domain = ep->domain;
+ /*
+ * The device is already attached to a domain;
+ * detach it first.
+ */
+ virtio_iommu_detach_endpoint_from_domain(ep);
+ if (QLIST_EMPTY(&previous_domain->endpoint_list)) {
+ g_tree_remove(s->domains, GUINT_TO_POINTER(previous_domain->id));
+ }
+ }
+
+ domain = virtio_iommu_get_domain(s, domain_id,
+ flags & VIRTIO_IOMMU_ATTACH_F_BYPASS);
+ if (!domain) {
+ /* Incompatible bypass flag */
+ return VIRTIO_IOMMU_S_INVAL;
+ }
+ QLIST_INSERT_HEAD(&domain->endpoint_list, ep, next);
+
+ ep->domain = domain;
+ sdev = container_of(ep->iommu_mr, IOMMUDevice, iommu_mr);
+ virtio_iommu_switch_address_space(sdev);
+
+ /* Replay domain mappings on the associated memory region */
+ g_tree_foreach(domain->mappings, virtio_iommu_notify_map_cb,
+ ep->iommu_mr);
+
+ return VIRTIO_IOMMU_S_OK;
+}
+
+static int virtio_iommu_detach(VirtIOIOMMU *s,
+ struct virtio_iommu_req_detach *req)
+{
+ uint32_t domain_id = le32_to_cpu(req->domain);
+ uint32_t ep_id = le32_to_cpu(req->endpoint);
+ VirtIOIOMMUDomain *domain;
+ VirtIOIOMMUEndpoint *ep;
+
+ trace_virtio_iommu_detach(domain_id, ep_id);
+
+ ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(ep_id));
+ if (!ep) {
+ return VIRTIO_IOMMU_S_NOENT;
+ }
+
+ domain = ep->domain;
+
+ if (!domain || domain->id != domain_id) {
+ return VIRTIO_IOMMU_S_INVAL;
+ }
+
+ virtio_iommu_detach_endpoint_from_domain(ep);
+
+ if (QLIST_EMPTY(&domain->endpoint_list)) {
+ g_tree_remove(s->domains, GUINT_TO_POINTER(domain->id));
+ }
+ return VIRTIO_IOMMU_S_OK;
+}
+
+static int virtio_iommu_map(VirtIOIOMMU *s,
+ struct virtio_iommu_req_map *req)
+{
+ uint32_t domain_id = le32_to_cpu(req->domain);
+ uint64_t phys_start = le64_to_cpu(req->phys_start);
+ uint64_t virt_start = le64_to_cpu(req->virt_start);
+ uint64_t virt_end = le64_to_cpu(req->virt_end);
+ uint32_t flags = le32_to_cpu(req->flags);
+ VirtIOIOMMUDomain *domain;
+ VirtIOIOMMUInterval *interval;
+ VirtIOIOMMUMapping *mapping;
+ VirtIOIOMMUEndpoint *ep;
+
+ if (flags & ~VIRTIO_IOMMU_MAP_F_MASK) {
+ return VIRTIO_IOMMU_S_INVAL;
+ }
+
+ domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id));
+ if (!domain) {
+ return VIRTIO_IOMMU_S_NOENT;
+ }
+
+ if (domain->bypass) {
+ return VIRTIO_IOMMU_S_INVAL;
+ }
+
+ interval = g_malloc0(sizeof(*interval));
+
+ interval->low = virt_start;
+ interval->high = virt_end;
+
+ mapping = g_tree_lookup(domain->mappings, (gpointer)interval);
+ if (mapping) {
+ g_free(interval);
+ return VIRTIO_IOMMU_S_INVAL;
+ }
+
+ trace_virtio_iommu_map(domain_id, virt_start, virt_end, phys_start, flags);
+
+ mapping = g_malloc0(sizeof(*mapping));
+ mapping->phys_addr = phys_start;
+ mapping->flags = flags;
+
+ g_tree_insert(domain->mappings, interval, mapping);
+
+ QLIST_FOREACH(ep, &domain->endpoint_list, next) {
+ virtio_iommu_notify_map(ep->iommu_mr, virt_start, virt_end, phys_start,
+ flags);
+ }
+
+ return VIRTIO_IOMMU_S_OK;
+}
+
+static int virtio_iommu_unmap(VirtIOIOMMU *s,
+ struct virtio_iommu_req_unmap *req)
+{
+ uint32_t domain_id = le32_to_cpu(req->domain);
+ uint64_t virt_start = le64_to_cpu(req->virt_start);
+ uint64_t virt_end = le64_to_cpu(req->virt_end);
+ VirtIOIOMMUMapping *iter_val;
+ VirtIOIOMMUInterval interval, *iter_key;
+ VirtIOIOMMUDomain *domain;
+ VirtIOIOMMUEndpoint *ep;
+ int ret = VIRTIO_IOMMU_S_OK;
+
+ trace_virtio_iommu_unmap(domain_id, virt_start, virt_end);
+
+ domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id));
+ if (!domain) {
+ return VIRTIO_IOMMU_S_NOENT;
+ }
+
+ if (domain->bypass) {
+ return VIRTIO_IOMMU_S_INVAL;
+ }
+
+ interval.low = virt_start;
+ interval.high = virt_end;
+
+ while (g_tree_lookup_extended(domain->mappings, &interval,
+ (void **)&iter_key, (void **)&iter_val)) {
+ uint64_t current_low = iter_key->low;
+ uint64_t current_high = iter_key->high;
+
+ if (interval.low <= current_low && interval.high >= current_high) {
+ QLIST_FOREACH(ep, &domain->endpoint_list, next) {
+ virtio_iommu_notify_unmap(ep->iommu_mr, current_low,
+ current_high);
+ }
+ g_tree_remove(domain->mappings, iter_key);
+ trace_virtio_iommu_unmap_done(domain_id, current_low, current_high);
+ } else {
+ ret = VIRTIO_IOMMU_S_RANGE;
+ break;
+ }
+ }
+ return ret;
+}
+
+static ssize_t virtio_iommu_fill_resv_mem_prop(VirtIOIOMMU *s, uint32_t ep,
+ uint8_t *buf, size_t free)
+{
+ struct virtio_iommu_probe_resv_mem prop = {};
+ size_t size = sizeof(prop), length = size - sizeof(prop.head), total;
+ int i;
+
+ total = size * s->nb_reserved_regions;
+
+ if (total > free) {
+ return -ENOSPC;
+ }
+
+ for (i = 0; i < s->nb_reserved_regions; i++) {
+ unsigned subtype = s->reserved_regions[i].type;
+
+ assert(subtype == VIRTIO_IOMMU_RESV_MEM_T_RESERVED ||
+ subtype == VIRTIO_IOMMU_RESV_MEM_T_MSI);
+ prop.head.type = cpu_to_le16(VIRTIO_IOMMU_PROBE_T_RESV_MEM);
+ prop.head.length = cpu_to_le16(length);
+ prop.subtype = subtype;
+ prop.start = cpu_to_le64(s->reserved_regions[i].low);
+ prop.end = cpu_to_le64(s->reserved_regions[i].high);
+
+ memcpy(buf, &prop, size);
+
+ trace_virtio_iommu_fill_resv_property(ep, prop.subtype,
+ prop.start, prop.end);
+ buf += size;
+ }
+ return total;
+}
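+
+/*
+ * With a single MSI region [0xfee00000, 0xfeefffff], the buffer receives one
+ * property: head.type = VIRTIO_IOMMU_PROBE_T_RESV_MEM, head.length =
+ * sizeof(prop) - sizeof(prop.head), subtype = VIRTIO_IOMMU_RESV_MEM_T_MSI,
+ * start/end as above (little-endian), and the function returns sizeof(prop).
+ */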
+
+/**
+ * virtio_iommu_probe - Fill the probe request buffer with
+ * the properties the device is able to return
+ */
+static int virtio_iommu_probe(VirtIOIOMMU *s,
+ struct virtio_iommu_req_probe *req,
+ uint8_t *buf)
+{
+ uint32_t ep_id = le32_to_cpu(req->endpoint);
+ size_t free = VIOMMU_PROBE_SIZE;
+ ssize_t count;
+
+ if (!virtio_iommu_mr(s, ep_id)) {
+ return VIRTIO_IOMMU_S_NOENT;
+ }
+
+ count = virtio_iommu_fill_resv_mem_prop(s, ep_id, buf, free);
+ if (count < 0) {
+ return VIRTIO_IOMMU_S_INVAL;
+ }
+ buf += count;
+ free -= count;
+
+ return VIRTIO_IOMMU_S_OK;
+}
+
+static int virtio_iommu_iov_to_req(struct iovec *iov,
+ unsigned int iov_cnt,
+ void *req, size_t payload_sz)
+{
+ size_t sz = iov_to_buf(iov, iov_cnt, 0, req, payload_sz);
+
+ if (unlikely(sz != payload_sz)) {
+ return VIRTIO_IOMMU_S_INVAL;
+ }
+ return 0;
+}
+
+#define virtio_iommu_handle_req(__req) \
+static int virtio_iommu_handle_ ## __req(VirtIOIOMMU *s, \
+ struct iovec *iov, \
+ unsigned int iov_cnt) \
+{ \
+ struct virtio_iommu_req_ ## __req req; \
+ int ret = virtio_iommu_iov_to_req(iov, iov_cnt, &req, \
+ sizeof(req) - sizeof(struct virtio_iommu_req_tail));\
+ \
+ return ret ? ret : virtio_iommu_ ## __req(s, &req); \
+}
+
+virtio_iommu_handle_req(attach)
+virtio_iommu_handle_req(detach)
+virtio_iommu_handle_req(map)
+virtio_iommu_handle_req(unmap)
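+
+/*
+ * Each invocation above expands to a small wrapper; for "attach" it is
+ * roughly:
+ *
+ *   static int virtio_iommu_handle_attach(VirtIOIOMMU *s, struct iovec *iov,
+ *                                         unsigned int iov_cnt)
+ *   {
+ *       struct virtio_iommu_req_attach req;
+ *       int ret = virtio_iommu_iov_to_req(iov, iov_cnt, &req,
+ *                       sizeof(req) - sizeof(struct virtio_iommu_req_tail));
+ *
+ *       return ret ? ret : virtio_iommu_attach(s, &req);
+ *   }
+ */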
+
+static int virtio_iommu_handle_probe(VirtIOIOMMU *s,
+ struct iovec *iov,
+ unsigned int iov_cnt,
+ uint8_t *buf)
+{
+ struct virtio_iommu_req_probe req;
+ int ret = virtio_iommu_iov_to_req(iov, iov_cnt, &req, sizeof(req));
+
+ return ret ? ret : virtio_iommu_probe(s, &req, buf);
+}
+
+static void virtio_iommu_handle_command(VirtIODevice *vdev, VirtQueue *vq)
+{
+ VirtIOIOMMU *s = VIRTIO_IOMMU(vdev);
+ struct virtio_iommu_req_head head;
+ struct virtio_iommu_req_tail tail = {};
+ size_t output_size = sizeof(tail), sz;
+ VirtQueueElement *elem;
+ unsigned int iov_cnt;
+ struct iovec *iov;
+ void *buf = NULL;
+
+ for (;;) {
+ elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
+ if (!elem) {
+ return;
+ }
+
+ if (iov_size(elem->in_sg, elem->in_num) < sizeof(tail) ||
+ iov_size(elem->out_sg, elem->out_num) < sizeof(head)) {
+ virtio_error(vdev, "virtio-iommu bad head/tail size");
+ virtqueue_detach_element(vq, elem, 0);
+ g_free(elem);
+ break;
+ }
+
+ iov_cnt = elem->out_num;
+ iov = elem->out_sg;
+ sz = iov_to_buf(iov, iov_cnt, 0, &head, sizeof(head));
+ if (unlikely(sz != sizeof(head))) {
+ tail.status = VIRTIO_IOMMU_S_DEVERR;
+ goto out;
+ }
+ qemu_rec_mutex_lock(&s->mutex);
+ switch (head.type) {
+ case VIRTIO_IOMMU_T_ATTACH:
+ tail.status = virtio_iommu_handle_attach(s, iov, iov_cnt);
+ break;
+ case VIRTIO_IOMMU_T_DETACH:
+ tail.status = virtio_iommu_handle_detach(s, iov, iov_cnt);
+ break;
+ case VIRTIO_IOMMU_T_MAP:
+ tail.status = virtio_iommu_handle_map(s, iov, iov_cnt);
+ break;
+ case VIRTIO_IOMMU_T_UNMAP:
+ tail.status = virtio_iommu_handle_unmap(s, iov, iov_cnt);
+ break;
+ case VIRTIO_IOMMU_T_PROBE:
+ {
+ struct virtio_iommu_req_tail *ptail;
+
+ output_size = s->config.probe_size + sizeof(tail);
+ buf = g_malloc0(output_size);
+
+ ptail = (struct virtio_iommu_req_tail *)
+ (buf + s->config.probe_size);
+ ptail->status = virtio_iommu_handle_probe(s, iov, iov_cnt, buf);
+ break;
+ }
+ default:
+ tail.status = VIRTIO_IOMMU_S_UNSUPP;
+ }
+ qemu_rec_mutex_unlock(&s->mutex);
+
+out:
+ sz = iov_from_buf(elem->in_sg, elem->in_num, 0,
+ buf ? buf : &tail, output_size);
+ assert(sz == output_size);
+
+ virtqueue_push(vq, elem, sz);
+ virtio_notify(vdev, vq);
+ g_free(elem);
+ g_free(buf);
+ buf = NULL;
+ }
+}
+
+static void virtio_iommu_report_fault(VirtIOIOMMU *viommu, uint8_t reason,
+ int flags, uint32_t endpoint,
+ uint64_t address)
+{
+ VirtIODevice *vdev = &viommu->parent_obj;
+ VirtQueue *vq = viommu->event_vq;
+ struct virtio_iommu_fault fault;
+ VirtQueueElement *elem;
+ size_t sz;
+
+ memset(&fault, 0, sizeof(fault));
+ fault.reason = reason;
+ fault.flags = cpu_to_le32(flags);
+ fault.endpoint = cpu_to_le32(endpoint);
+ fault.address = cpu_to_le64(address);
+
+ elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
+
+ if (!elem) {
+ error_report_once(
+ "no buffer available in event queue to report event");
+ return;
+ }
+
+ if (iov_size(elem->in_sg, elem->in_num) < sizeof(fault)) {
+ virtio_error(vdev, "error buffer of wrong size");
+ virtqueue_detach_element(vq, elem, 0);
+ g_free(elem);
+ return;
+ }
+
+ sz = iov_from_buf(elem->in_sg, elem->in_num, 0,
+ &fault, sizeof(fault));
+ assert(sz == sizeof(fault));
+
+ trace_virtio_iommu_report_fault(reason, flags, endpoint, address);
+ virtqueue_push(vq, elem, sz);
+ virtio_notify(vdev, vq);
+ g_free(elem);
+}
+
+static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr,
+ IOMMUAccessFlags flag,
+ int iommu_idx)
+{
+ IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr);
+ VirtIOIOMMUInterval interval, *mapping_key;
+ VirtIOIOMMUMapping *mapping_value;
+ VirtIOIOMMU *s = sdev->viommu;
+ bool read_fault, write_fault;
+ VirtIOIOMMUEndpoint *ep;
+ uint32_t sid, flags;
+ bool bypass_allowed;
+ bool found;
+ int i;
+
+ interval.low = addr;
+ interval.high = addr + 1;
+
+ IOMMUTLBEntry entry = {
+ .target_as = &address_space_memory,
+ .iova = addr,
+ .translated_addr = addr,
+ .addr_mask = (1 << ctz32(s->config.page_size_mask)) - 1,
+ .perm = IOMMU_NONE,
+ };
+
+ bypass_allowed = s->config.bypass;
+
+ sid = virtio_iommu_get_bdf(sdev);
+
+ trace_virtio_iommu_translate(mr->parent_obj.name, sid, addr, flag);
+ qemu_rec_mutex_lock(&s->mutex);
+
+ ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid));
+
+ if (!ep) {
+ if (!bypass_allowed) {
+ error_report_once("%s sid=%d is not known!!", __func__, sid);
+ virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_UNKNOWN,
+ VIRTIO_IOMMU_FAULT_F_ADDRESS,
+ sid, addr);
+ } else {
+ entry.perm = flag;
+ }
+ goto unlock;
+ }
+
+ for (i = 0; i < s->nb_reserved_regions; i++) {
+ ReservedRegion *reg = &s->reserved_regions[i];
+
+ if (addr >= reg->low && addr <= reg->high) {
+ switch (reg->type) {
+ case VIRTIO_IOMMU_RESV_MEM_T_MSI:
+ entry.perm = flag;
+ break;
+ case VIRTIO_IOMMU_RESV_MEM_T_RESERVED:
+ default:
+ virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING,
+ VIRTIO_IOMMU_FAULT_F_ADDRESS,
+ sid, addr);
+ break;
+ }
+ goto unlock;
+ }
+ }
+
+ if (!ep->domain) {
+ if (!bypass_allowed) {
+ error_report_once("%s %02x:%02x.%01x not attached to any domain",
+ __func__, PCI_BUS_NUM(sid),
+ PCI_SLOT(sid), PCI_FUNC(sid));
+ virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_DOMAIN,
+ VIRTIO_IOMMU_FAULT_F_ADDRESS,
+ sid, addr);
+ } else {
+ entry.perm = flag;
+ }
+ goto unlock;
+ } else if (ep->domain->bypass) {
+ entry.perm = flag;
+ goto unlock;
+ }
+
+ found = g_tree_lookup_extended(ep->domain->mappings, (gpointer)(&interval),
+ (void **)&mapping_key,
+ (void **)&mapping_value);
+ if (!found) {
+ error_report_once("%s no mapping for 0x%"PRIx64" for sid=%d",
+ __func__, addr, sid);
+ virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING,
+ VIRTIO_IOMMU_FAULT_F_ADDRESS,
+ sid, addr);
+ goto unlock;
+ }
+
+ read_fault = (flag & IOMMU_RO) &&
+ !(mapping_value->flags & VIRTIO_IOMMU_MAP_F_READ);
+ write_fault = (flag & IOMMU_WO) &&
+ !(mapping_value->flags & VIRTIO_IOMMU_MAP_F_WRITE);
+
+ flags = read_fault ? VIRTIO_IOMMU_FAULT_F_READ : 0;
+ flags |= write_fault ? VIRTIO_IOMMU_FAULT_F_WRITE : 0;
+ if (flags) {
+ error_report_once("%s permission error on 0x%"PRIx64"(%d): allowed=%d",
+ __func__, addr, flag, mapping_value->flags);
+ virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING,
+ flags | VIRTIO_IOMMU_FAULT_F_ADDRESS,
+ sid, addr);
+ goto unlock;
+ }
+ entry.translated_addr = addr - mapping_key->low + mapping_value->phys_addr;
+ entry.perm = flag;
+ trace_virtio_iommu_translate_out(addr, entry.translated_addr, sid);
+
+unlock:
+ qemu_rec_mutex_unlock(&s->mutex);
+ return entry;
+}
+
+static void virtio_iommu_get_config(VirtIODevice *vdev, uint8_t *config_data)
+{
+ VirtIOIOMMU *dev = VIRTIO_IOMMU(vdev);
+ struct virtio_iommu_config *dev_config = &dev->config;
+ struct virtio_iommu_config *out_config = (void *)config_data;
+
+ out_config->page_size_mask = cpu_to_le64(dev_config->page_size_mask);
+ out_config->input_range.start = cpu_to_le64(dev_config->input_range.start);
+ out_config->input_range.end = cpu_to_le64(dev_config->input_range.end);
+ out_config->domain_range.start = cpu_to_le32(dev_config->domain_range.start);
+ out_config->domain_range.end = cpu_to_le32(dev_config->domain_range.end);
+ out_config->probe_size = cpu_to_le32(dev_config->probe_size);
+ out_config->bypass = dev_config->bypass;
+
+ trace_virtio_iommu_get_config(dev_config->page_size_mask,
+ dev_config->input_range.start,
+ dev_config->input_range.end,
+ dev_config->domain_range.start,
+ dev_config->domain_range.end,
+ dev_config->probe_size,
+ dev_config->bypass);
+}
+
+static void virtio_iommu_set_config(VirtIODevice *vdev,
+ const uint8_t *config_data)
+{
+ VirtIOIOMMU *dev = VIRTIO_IOMMU(vdev);
+ struct virtio_iommu_config *dev_config = &dev->config;
+ const struct virtio_iommu_config *in_config = (void *)config_data;
+
+ if (in_config->bypass != dev_config->bypass) {
+ if (!virtio_vdev_has_feature(vdev, VIRTIO_IOMMU_F_BYPASS_CONFIG)) {
+ virtio_error(vdev, "cannot set config.bypass");
+ return;
+ } else if (in_config->bypass != 0 && in_config->bypass != 1) {
+ virtio_error(vdev, "invalid config.bypass value '%u'",
+ in_config->bypass);
+ return;
+ }
+ dev_config->bypass = in_config->bypass;
+ virtio_iommu_switch_address_space_all(dev);
+ }
+
+ trace_virtio_iommu_set_config(in_config->bypass);
+}
+
+static uint64_t virtio_iommu_get_features(VirtIODevice *vdev, uint64_t f,
+ Error **errp)
+{
+ VirtIOIOMMU *dev = VIRTIO_IOMMU(vdev);
+
+ f |= dev->features;
+ trace_virtio_iommu_get_features(f);
+ return f;
+}
+
+static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
+{
+ guint ua = GPOINTER_TO_UINT(a);
+ guint ub = GPOINTER_TO_UINT(b);
+ return (ua > ub) - (ua < ub);
+}
+
+static gboolean virtio_iommu_remap(gpointer key, gpointer value, gpointer data)
+{
+ VirtIOIOMMUMapping *mapping = (VirtIOIOMMUMapping *) value;
+ VirtIOIOMMUInterval *interval = (VirtIOIOMMUInterval *) key;
+ IOMMUMemoryRegion *mr = (IOMMUMemoryRegion *) data;
+
+ trace_virtio_iommu_remap(mr->parent_obj.name, interval->low, interval->high,
+ mapping->phys_addr);
+ virtio_iommu_notify_map(mr, interval->low, interval->high,
+ mapping->phys_addr, mapping->flags);
+ return false;
+}
+
+static void virtio_iommu_replay(IOMMUMemoryRegion *mr, IOMMUNotifier *n)
+{
+ IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr);
+ VirtIOIOMMU *s = sdev->viommu;
+ uint32_t sid;
+ VirtIOIOMMUEndpoint *ep;
+
+ sid = virtio_iommu_get_bdf(sdev);
+
+ qemu_rec_mutex_lock(&s->mutex);
+
+ if (!s->endpoints) {
+ goto unlock;
+ }
+
+ ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid));
+ if (!ep || !ep->domain) {
+ goto unlock;
+ }
+
+ g_tree_foreach(ep->domain->mappings, virtio_iommu_remap, mr);
+
+unlock:
+ qemu_rec_mutex_unlock(&s->mutex);
+}
+
+static int virtio_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu_mr,
+ IOMMUNotifierFlag old,
+ IOMMUNotifierFlag new,
+ Error **errp)
+{
+ if (new & IOMMU_NOTIFIER_DEVIOTLB_UNMAP) {
+ error_setg(errp, "Virtio-iommu does not support dev-iotlb yet");
+ return -EINVAL;
+ }
+
+ if (old == IOMMU_NOTIFIER_NONE) {
+ trace_virtio_iommu_notify_flag_add(iommu_mr->parent_obj.name);
+ } else if (new == IOMMU_NOTIFIER_NONE) {
+ trace_virtio_iommu_notify_flag_del(iommu_mr->parent_obj.name);
+ }
+ return 0;
+}
+
+/*
+ * The default mask (TARGET_PAGE_MASK) is the smallest supported guest granule,
+ * for example 0xfffffffffffff000. When an assigned device has page size
+ * restrictions due to the hardware IOMMU configuration, apply this restriction
+ * to the mask.
+ */
+static int virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr,
+ uint64_t new_mask,
+ Error **errp)
+{
+ IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr);
+ VirtIOIOMMU *s = sdev->viommu;
+ uint64_t cur_mask = s->config.page_size_mask;
+
+ trace_virtio_iommu_set_page_size_mask(mr->parent_obj.name, cur_mask,
+ new_mask);
+
+ if ((cur_mask & new_mask) == 0) {
+ error_setg(errp, "virtio-iommu page mask 0x%"PRIx64
+ " is incompatible with mask 0x%"PRIx64, cur_mask, new_mask);
+ return -1;
+ }
+
+ /*
+ * After the machine is finalized, we can't change the mask anymore. If by
+ * chance the hotplugged device supports the same granule, we can still
+ * accept it. Having a different mask is possible, but the guest will use
+ * sub-optimal block sizes, so warn about it.
+ */
+ if (phase_check(PHASE_MACHINE_READY)) {
+ int new_granule = ctz64(new_mask);
+ int cur_granule = ctz64(cur_mask);
+
+ if (new_granule != cur_granule) {
+ error_setg(errp, "virtio-iommu page mask 0x%"PRIx64
+ " is incompatible with mask 0x%"PRIx64, cur_mask,
+ new_mask);
+ return -1;
+ } else if (new_mask != cur_mask) {
+ warn_report("virtio-iommu page mask 0x%"PRIx64
+ " does not match 0x%"PRIx64, cur_mask, new_mask);
+ }
+ return 0;
+ }
+
+ s->config.page_size_mask &= new_mask;
+ return 0;
+}
+
+static void virtio_iommu_system_reset(void *opaque)
+{
+ VirtIOIOMMU *s = opaque;
+
+ trace_virtio_iommu_system_reset();
+
+ /*
+ * config.bypass is sticky across device reset, but should be restored on
+ * system reset
+ */
+ s->config.bypass = s->boot_bypass;
+ virtio_iommu_switch_address_space_all(s);
+}
+
+static void virtio_iommu_device_realize(DeviceState *dev, Error **errp)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VirtIOIOMMU *s = VIRTIO_IOMMU(dev);
+
+ virtio_init(vdev, VIRTIO_ID_IOMMU, sizeof(struct virtio_iommu_config));
+
+ memset(s->iommu_pcibus_by_bus_num, 0, sizeof(s->iommu_pcibus_by_bus_num));
+
+ s->req_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE,
+ virtio_iommu_handle_command);
+ s->event_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE, NULL);
+
+ /*
+ * config.bypass is needed to get initial address space early, such as
+ * in vfio realize
+ */
+ s->config.bypass = s->boot_bypass;
+ s->config.page_size_mask = TARGET_PAGE_MASK;
+ s->config.input_range.end = UINT64_MAX;
+ s->config.domain_range.end = UINT32_MAX;
+ s->config.probe_size = VIOMMU_PROBE_SIZE;
+
+ virtio_add_feature(&s->features, VIRTIO_RING_F_EVENT_IDX);
+ virtio_add_feature(&s->features, VIRTIO_RING_F_INDIRECT_DESC);
+ virtio_add_feature(&s->features, VIRTIO_F_VERSION_1);
+ virtio_add_feature(&s->features, VIRTIO_IOMMU_F_INPUT_RANGE);
+ virtio_add_feature(&s->features, VIRTIO_IOMMU_F_DOMAIN_RANGE);
+ virtio_add_feature(&s->features, VIRTIO_IOMMU_F_MAP_UNMAP);
+ virtio_add_feature(&s->features, VIRTIO_IOMMU_F_MMIO);
+ virtio_add_feature(&s->features, VIRTIO_IOMMU_F_PROBE);
+ virtio_add_feature(&s->features, VIRTIO_IOMMU_F_BYPASS_CONFIG);
+
+ qemu_rec_mutex_init(&s->mutex);
+
+ s->as_by_busptr = g_hash_table_new_full(NULL, NULL, NULL, g_free);
+
+ if (s->primary_bus) {
+ pci_setup_iommu(s->primary_bus, virtio_iommu_find_add_as, s);
+ } else {
+ error_setg(errp, "VIRTIO-IOMMU is not attached to any PCI bus!");
+ }
+
+ qemu_register_reset(virtio_iommu_system_reset, s);
+}
+
+static void virtio_iommu_device_unrealize(DeviceState *dev)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VirtIOIOMMU *s = VIRTIO_IOMMU(dev);
+
+ qemu_unregister_reset(virtio_iommu_system_reset, s);
+
+ g_hash_table_destroy(s->as_by_busptr);
+ if (s->domains) {
+ g_tree_destroy(s->domains);
+ }
+ if (s->endpoints) {
+ g_tree_destroy(s->endpoints);
+ }
+
+ qemu_rec_mutex_destroy(&s->mutex);
+
+ virtio_delete_queue(s->req_vq);
+ virtio_delete_queue(s->event_vq);
+ virtio_cleanup(vdev);
+}
+
+static void virtio_iommu_device_reset(VirtIODevice *vdev)
+{
+ VirtIOIOMMU *s = VIRTIO_IOMMU(vdev);
+
+ trace_virtio_iommu_device_reset();
+
+ if (s->domains) {
+ g_tree_destroy(s->domains);
+ }
+ if (s->endpoints) {
+ g_tree_destroy(s->endpoints);
+ }
+ s->domains = g_tree_new_full((GCompareDataFunc)int_cmp,
+ NULL, NULL, virtio_iommu_put_domain);
+ s->endpoints = g_tree_new_full((GCompareDataFunc)int_cmp,
+ NULL, NULL, virtio_iommu_put_endpoint);
+}
+
+static void virtio_iommu_set_status(VirtIODevice *vdev, uint8_t status)
+{
+ trace_virtio_iommu_device_status(status);
+}
+
+static void virtio_iommu_instance_init(Object *obj)
+{
+}
+
+#define VMSTATE_INTERVAL \
+{ \
+ .name = "interval", \
+ .version_id = 1, \
+ .minimum_version_id = 1, \
+ .fields = (VMStateField[]) { \
+ VMSTATE_UINT64(low, VirtIOIOMMUInterval), \
+ VMSTATE_UINT64(high, VirtIOIOMMUInterval), \
+ VMSTATE_END_OF_LIST() \
+ } \
+}
+
+#define VMSTATE_MAPPING \
+{ \
+ .name = "mapping", \
+ .version_id = 1, \
+ .minimum_version_id = 1, \
+ .fields = (VMStateField[]) { \
+ VMSTATE_UINT64(phys_addr, VirtIOIOMMUMapping),\
+ VMSTATE_UINT32(flags, VirtIOIOMMUMapping), \
+ VMSTATE_END_OF_LIST() \
+ }, \
+}
+
+static const VMStateDescription vmstate_interval_mapping[2] = {
+ VMSTATE_MAPPING, /* value */
+ VMSTATE_INTERVAL /* key */
+};
+
+static int domain_preload(void *opaque)
+{
+ VirtIOIOMMUDomain *domain = opaque;
+
+ domain->mappings = g_tree_new_full((GCompareDataFunc)interval_cmp,
+ NULL, g_free, g_free);
+ return 0;
+}
+
+static const VMStateDescription vmstate_endpoint = {
+ .name = "endpoint",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT32(id, VirtIOIOMMUEndpoint),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription vmstate_domain = {
+ .name = "domain",
+ .version_id = 2,
+ .minimum_version_id = 2,
+ .pre_load = domain_preload,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT32(id, VirtIOIOMMUDomain),
+ VMSTATE_GTREE_V(mappings, VirtIOIOMMUDomain, 1,
+ vmstate_interval_mapping,
+ VirtIOIOMMUInterval, VirtIOIOMMUMapping),
+ VMSTATE_QLIST_V(endpoint_list, VirtIOIOMMUDomain, 1,
+ vmstate_endpoint, VirtIOIOMMUEndpoint, next),
+ VMSTATE_BOOL_V(bypass, VirtIOIOMMUDomain, 2),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static gboolean reconstruct_endpoints(gpointer key, gpointer value,
+ gpointer data)
+{
+ VirtIOIOMMU *s = (VirtIOIOMMU *)data;
+ VirtIOIOMMUDomain *d = (VirtIOIOMMUDomain *)value;
+ VirtIOIOMMUEndpoint *iter;
+ IOMMUMemoryRegion *mr;
+
+ QLIST_FOREACH(iter, &d->endpoint_list, next) {
+ mr = virtio_iommu_mr(s, iter->id);
+ assert(mr);
+
+ iter->domain = d;
+ iter->iommu_mr = mr;
+ g_tree_insert(s->endpoints, GUINT_TO_POINTER(iter->id), iter);
+ }
+ return false; /* continue the domain traversal */
+}
+
+static int iommu_post_load(void *opaque, int version_id)
+{
+ VirtIOIOMMU *s = opaque;
+
+ g_tree_foreach(s->domains, reconstruct_endpoints, s);
+
+ /*
+ * Memory regions are dynamically turned on/off depending on
+ * 'config.bypass' and attached domain type if there is. After
+ * migration, we need to make sure the memory regions are
+ * still correct.
+ */
+ virtio_iommu_switch_address_space_all(s);
+ return 0;
+}
+
+static const VMStateDescription vmstate_virtio_iommu_device = {
+ .name = "virtio-iommu-device",
+ .minimum_version_id = 2,
+ .version_id = 2,
+ .post_load = iommu_post_load,
+ .fields = (VMStateField[]) {
+ VMSTATE_GTREE_DIRECT_KEY_V(domains, VirtIOIOMMU, 2,
+ &vmstate_domain, VirtIOIOMMUDomain),
+ VMSTATE_UINT8_V(config.bypass, VirtIOIOMMU, 2),
+ VMSTATE_END_OF_LIST()
+ },
+};
+
+static const VMStateDescription vmstate_virtio_iommu = {
+ .name = "virtio-iommu",
+ .minimum_version_id = 2,
+ .priority = MIG_PRI_IOMMU,
+ .version_id = 2,
+ .fields = (VMStateField[]) {
+ VMSTATE_VIRTIO_DEVICE,
+ VMSTATE_END_OF_LIST()
+ },
+};
+
+static Property virtio_iommu_properties[] = {
+ DEFINE_PROP_LINK("primary-bus", VirtIOIOMMU, primary_bus, "PCI", PCIBus *),
+ DEFINE_PROP_BOOL("boot-bypass", VirtIOIOMMU, boot_bypass, true),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void virtio_iommu_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+
+ device_class_set_props(dc, virtio_iommu_properties);
+ dc->vmsd = &vmstate_virtio_iommu;
+
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+ vdc->realize = virtio_iommu_device_realize;
+ vdc->unrealize = virtio_iommu_device_unrealize;
+ vdc->reset = virtio_iommu_device_reset;
+ vdc->get_config = virtio_iommu_get_config;
+ vdc->set_config = virtio_iommu_set_config;
+ vdc->get_features = virtio_iommu_get_features;
+ vdc->set_status = virtio_iommu_set_status;
+ vdc->vmsd = &vmstate_virtio_iommu_device;
+}
+
+static void virtio_iommu_memory_region_class_init(ObjectClass *klass,
+ void *data)
+{
+ IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
+
+ imrc->translate = virtio_iommu_translate;
+ imrc->replay = virtio_iommu_replay;
+ imrc->notify_flag_changed = virtio_iommu_notify_flag_changed;
+ imrc->iommu_set_page_size_mask = virtio_iommu_set_page_size_mask;
+}
+
+static const TypeInfo virtio_iommu_info = {
+ .name = TYPE_VIRTIO_IOMMU,
+ .parent = TYPE_VIRTIO_DEVICE,
+ .instance_size = sizeof(VirtIOIOMMU),
+ .instance_init = virtio_iommu_instance_init,
+ .class_init = virtio_iommu_class_init,
+};
+
+static const TypeInfo virtio_iommu_memory_region_info = {
+ .parent = TYPE_IOMMU_MEMORY_REGION,
+ .name = TYPE_VIRTIO_IOMMU_MEMORY_REGION,
+ .class_init = virtio_iommu_memory_region_class_init,
+};
+
+static void virtio_register_types(void)
+{
+ type_register_static(&virtio_iommu_info);
+ type_register_static(&virtio_iommu_memory_region_info);
+}
+
+type_init(virtio_register_types)
diff --git a/hw/virtio/virtio-mem-pci.c b/hw/virtio/virtio-mem-pci.c
new file mode 100644
index 00000000..5c5c1e3a
--- /dev/null
+++ b/hw/virtio/virtio-mem-pci.c
@@ -0,0 +1,161 @@
+/*
+ * Virtio MEM PCI device
+ *
+ * Copyright (C) 2020 Red Hat, Inc.
+ *
+ * Authors:
+ * David Hildenbrand <david@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "virtio-mem-pci.h"
+#include "hw/mem/memory-device.h"
+#include "qapi/error.h"
+#include "qapi/qapi-events-machine.h"
+#include "qapi/qapi-events-misc.h"
+
+static void virtio_mem_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+ VirtIOMEMPCI *mem_pci = VIRTIO_MEM_PCI(vpci_dev);
+ DeviceState *vdev = DEVICE(&mem_pci->vdev);
+
+ virtio_pci_force_virtio_1(vpci_dev);
+ qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
+}
+
+static void virtio_mem_pci_set_addr(MemoryDeviceState *md, uint64_t addr,
+ Error **errp)
+{
+ object_property_set_uint(OBJECT(md), VIRTIO_MEM_ADDR_PROP, addr, errp);
+}
+
+static uint64_t virtio_mem_pci_get_addr(const MemoryDeviceState *md)
+{
+ return object_property_get_uint(OBJECT(md), VIRTIO_MEM_ADDR_PROP,
+ &error_abort);
+}
+
+static MemoryRegion *virtio_mem_pci_get_memory_region(MemoryDeviceState *md,
+ Error **errp)
+{
+ VirtIOMEMPCI *pci_mem = VIRTIO_MEM_PCI(md);
+ VirtIOMEM *vmem = VIRTIO_MEM(&pci_mem->vdev);
+ VirtIOMEMClass *vmc = VIRTIO_MEM_GET_CLASS(vmem);
+
+ return vmc->get_memory_region(vmem, errp);
+}
+
+static uint64_t virtio_mem_pci_get_plugged_size(const MemoryDeviceState *md,
+ Error **errp)
+{
+ return object_property_get_uint(OBJECT(md), VIRTIO_MEM_SIZE_PROP,
+ errp);
+}
+
+static void virtio_mem_pci_fill_device_info(const MemoryDeviceState *md,
+ MemoryDeviceInfo *info)
+{
+ VirtioMEMDeviceInfo *vi = g_new0(VirtioMEMDeviceInfo, 1);
+ VirtIOMEMPCI *pci_mem = VIRTIO_MEM_PCI(md);
+ VirtIOMEM *vmem = VIRTIO_MEM(&pci_mem->vdev);
+ VirtIOMEMClass *vpc = VIRTIO_MEM_GET_CLASS(vmem);
+ DeviceState *dev = DEVICE(md);
+
+ if (dev->id) {
+ vi->has_id = true;
+ vi->id = g_strdup(dev->id);
+ }
+
+ /* let the real device handle everything else */
+ vpc->fill_device_info(vmem, vi);
+
+ info->u.virtio_mem.data = vi;
+ info->type = MEMORY_DEVICE_INFO_KIND_VIRTIO_MEM;
+}
+
+static uint64_t virtio_mem_pci_get_min_alignment(const MemoryDeviceState *md)
+{
+ return object_property_get_uint(OBJECT(md), VIRTIO_MEM_BLOCK_SIZE_PROP,
+ &error_abort);
+}
+
+static void virtio_mem_pci_size_change_notify(Notifier *notifier, void *data)
+{
+ VirtIOMEMPCI *pci_mem = container_of(notifier, VirtIOMEMPCI,
+ size_change_notifier);
+ DeviceState *dev = DEVICE(pci_mem);
+ char *qom_path = object_get_canonical_path(OBJECT(dev));
+ const uint64_t * const size_p = data;
+
+ qapi_event_send_memory_device_size_change(!!dev->id, dev->id, *size_p,
+ qom_path);
+ g_free(qom_path);
+}
+
+static void virtio_mem_pci_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+ PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+ MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(klass);
+
+ k->realize = virtio_mem_pci_realize;
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+ pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
+ pcidev_k->class_id = PCI_CLASS_OTHERS;
+
+ mdc->get_addr = virtio_mem_pci_get_addr;
+ mdc->set_addr = virtio_mem_pci_set_addr;
+ mdc->get_plugged_size = virtio_mem_pci_get_plugged_size;
+ mdc->get_memory_region = virtio_mem_pci_get_memory_region;
+ mdc->fill_device_info = virtio_mem_pci_fill_device_info;
+ mdc->get_min_alignment = virtio_mem_pci_get_min_alignment;
+}
+
+static void virtio_mem_pci_instance_init(Object *obj)
+{
+ VirtIOMEMPCI *dev = VIRTIO_MEM_PCI(obj);
+ VirtIOMEMClass *vmc;
+ VirtIOMEM *vmem;
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VIRTIO_MEM);
+
+ dev->size_change_notifier.notify = virtio_mem_pci_size_change_notify;
+ vmem = VIRTIO_MEM(&dev->vdev);
+ vmc = VIRTIO_MEM_GET_CLASS(vmem);
+ /*
+ * We never remove the notifier again, as we expect both devices to
+ * disappear at the same time.
+ */
+ vmc->add_size_change_notifier(vmem, &dev->size_change_notifier);
+
+ object_property_add_alias(obj, VIRTIO_MEM_BLOCK_SIZE_PROP,
+ OBJECT(&dev->vdev), VIRTIO_MEM_BLOCK_SIZE_PROP);
+ object_property_add_alias(obj, VIRTIO_MEM_SIZE_PROP, OBJECT(&dev->vdev),
+ VIRTIO_MEM_SIZE_PROP);
+ object_property_add_alias(obj, VIRTIO_MEM_REQUESTED_SIZE_PROP,
+ OBJECT(&dev->vdev),
+ VIRTIO_MEM_REQUESTED_SIZE_PROP);
+}
+
+static const VirtioPCIDeviceTypeInfo virtio_mem_pci_info = {
+ .base_name = TYPE_VIRTIO_MEM_PCI,
+ .generic_name = "virtio-mem-pci",
+ .instance_size = sizeof(VirtIOMEMPCI),
+ .instance_init = virtio_mem_pci_instance_init,
+ .class_init = virtio_mem_pci_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { TYPE_MEMORY_DEVICE },
+ { }
+ },
+};
+
+static void virtio_mem_pci_register_types(void)
+{
+ virtio_pci_types_register(&virtio_mem_pci_info);
+}
+type_init(virtio_mem_pci_register_types)
diff --git a/hw/virtio/virtio-mem-pci.h b/hw/virtio/virtio-mem-pci.h
new file mode 100644
index 00000000..e636e1a4
--- /dev/null
+++ b/hw/virtio/virtio-mem-pci.h
@@ -0,0 +1,35 @@
+/*
+ * Virtio MEM PCI device
+ *
+ * Copyright (C) 2020 Red Hat, Inc.
+ *
+ * Authors:
+ * David Hildenbrand <david@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_VIRTIO_MEM_PCI_H
+#define QEMU_VIRTIO_MEM_PCI_H
+
+#include "hw/virtio/virtio-pci.h"
+#include "hw/virtio/virtio-mem.h"
+#include "qom/object.h"
+
+typedef struct VirtIOMEMPCI VirtIOMEMPCI;
+
+/*
+ * virtio-mem-pci: This extends VirtioPCIProxy.
+ */
+#define TYPE_VIRTIO_MEM_PCI "virtio-mem-pci-base"
+DECLARE_INSTANCE_CHECKER(VirtIOMEMPCI, VIRTIO_MEM_PCI,
+ TYPE_VIRTIO_MEM_PCI)
+
+struct VirtIOMEMPCI {
+ VirtIOPCIProxy parent_obj;
+ VirtIOMEM vdev;
+ Notifier size_change_notifier;
+};
+
+#endif /* QEMU_VIRTIO_MEM_PCI_H */
diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c
new file mode 100644
index 00000000..ed170def
--- /dev/null
+++ b/hw/virtio/virtio-mem.c
@@ -0,0 +1,1386 @@
+/*
+ * Virtio MEM device
+ *
+ * Copyright (C) 2020 Red Hat, Inc.
+ *
+ * Authors:
+ * David Hildenbrand <david@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/iov.h"
+#include "qemu/cutils.h"
+#include "qemu/error-report.h"
+#include "qemu/units.h"
+#include "sysemu/numa.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/reset.h"
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/virtio-access.h"
+#include "hw/virtio/virtio-mem.h"
+#include "qapi/error.h"
+#include "qapi/visitor.h"
+#include "exec/ram_addr.h"
+#include "migration/misc.h"
+#include "hw/boards.h"
+#include "hw/qdev-properties.h"
+#include CONFIG_DEVICES
+#include "trace.h"
+
+/*
+ * We only had legacy x86 guests that did not support
+ * VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE. Other targets don't have legacy guests.
+ */
+#if defined(TARGET_X86_64) || defined(TARGET_I386)
+#define VIRTIO_MEM_HAS_LEGACY_GUESTS
+#endif
+
+/*
+ * Let's not allow blocks smaller than 1 MiB, for example, to keep the tracking
+ * bitmap small.
+ */
+#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)(1 * MiB))
+
+static uint32_t virtio_mem_default_thp_size(void)
+{
+ uint32_t default_thp_size = VIRTIO_MEM_MIN_BLOCK_SIZE;
+
+#if defined(__x86_64__) || defined(__arm__) || defined(__powerpc64__)
+ default_thp_size = 2 * MiB;
+#elif defined(__aarch64__)
+ if (qemu_real_host_page_size() == 4 * KiB) {
+ default_thp_size = 2 * MiB;
+ } else if (qemu_real_host_page_size() == 16 * KiB) {
+ default_thp_size = 32 * MiB;
+ } else if (qemu_real_host_page_size() == 64 * KiB) {
+ default_thp_size = 512 * MiB;
+ }
+#endif
+
+ return default_thp_size;
+}
+
+/*
+ * We want to have a reasonable default block size such that
+ * 1. We avoid splitting THPs when unplugging memory, which degrades
+ * performance.
+ * 2. We avoid placing THPs for plugged blocks that also cover unplugged
+ * blocks.
+ *
+ * The actual THP size might differ between Linux kernels, so we try to probe
+ * it. In the future (if we ever run into issues regarding 2.), we might want
+ * to disable THP in case we fail to properly probe the THP size, or if the
+ * block size is configured smaller than the THP size.
+ */
+static uint32_t thp_size;
+
+#define HPAGE_PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
+static uint32_t virtio_mem_thp_size(void)
+{
+ gchar *content = NULL;
+ const char *endptr;
+ uint64_t tmp;
+
+ if (thp_size) {
+ return thp_size;
+ }
+
+ /*
+ * Try to probe the actual THP size, falling back to (sane but possibly
+ * incorrect) default sizes.
+ */
+ if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) &&
+ !qemu_strtou64(content, &endptr, 0, &tmp) &&
+ (!endptr || *endptr == '\n')) {
+ /* Sanity-check the value and fall back to something reasonable. */
+ if (!tmp || !is_power_of_2(tmp)) {
+ warn_report("Read unsupported THP size: %" PRIx64, tmp);
+ } else {
+ thp_size = tmp;
+ }
+ }
+
+ if (!thp_size) {
+ thp_size = virtio_mem_default_thp_size();
+ warn_report("Could not detect THP size, falling back to %" PRIx64
+ " MiB.", thp_size / MiB);
+ }
+
+ g_free(content);
+ return thp_size;
+}
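+
+/*
+ * On a typical x86-64 host with 2 MiB THPs, HPAGE_PMD_SIZE_PATH contains
+ * "2097152\n", so qemu_strtou64() yields tmp = 2 MiB and thp_size is set
+ * accordingly; hosts without that sysfs file fall back to the
+ * per-architecture defaults above.
+ */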
+
+static uint64_t virtio_mem_default_block_size(RAMBlock *rb)
+{
+ const uint64_t page_size = qemu_ram_pagesize(rb);
+
+ /* We can have hugetlbfs with a page size smaller than the THP size. */
+ if (page_size == qemu_real_host_page_size()) {
+ return MAX(page_size, virtio_mem_thp_size());
+ }
+ return MAX(page_size, VIRTIO_MEM_MIN_BLOCK_SIZE);
+}
+
+#if defined(VIRTIO_MEM_HAS_LEGACY_GUESTS)
+static bool virtio_mem_has_shared_zeropage(RAMBlock *rb)
+{
+ /*
+ * We only have a guaranteed shared zeropage on ordinary MAP_PRIVATE
+ * anonymous RAM. In any other case, reading unplugged *can* populate a
+ * fresh page, consuming actual memory.
+ */
+ return !qemu_ram_is_shared(rb) && rb->fd < 0 &&
+ qemu_ram_pagesize(rb) == qemu_real_host_page_size();
+}
+#endif /* VIRTIO_MEM_HAS_LEGACY_GUESTS */
+
+/*
+ * Size the usable region bigger than the requested size if possible. Esp.
+ * Linux guests will only add (aligned) memory blocks in case they fully
+ * fit into the usable region, but plug+online only a subset of the pages.
+ * The memory block size corresponds mostly to the section size.
+ *
+ * This allows e.g., to add 20MB with a section size of 128MB on x86_64, and
+ * a section size of 512MB on arm64 (as long as the start address is properly
+ * aligned, similar to ordinary DIMMs).
+ *
+ * We can change this at any time and maybe even make it configurable if
+ * necessary (as the section size can change). But it's more likely that the
+ * section size will rather get smaller and not bigger over time.
+ */
+#if defined(TARGET_X86_64) || defined(TARGET_I386)
+#define VIRTIO_MEM_USABLE_EXTENT (2 * (128 * MiB))
+#elif defined(TARGET_ARM)
+#define VIRTIO_MEM_USABLE_EXTENT (2 * (512 * MiB))
+#else
+#error VIRTIO_MEM_USABLE_EXTENT not defined
+#endif
+
+static bool virtio_mem_is_busy(void)
+{
+ /*
+ * Postcopy cannot handle concurrent discards and we don't want to migrate
+ * pages on-demand with stale content when plugging new blocks.
+ *
+ * For precopy, we don't want unplugged blocks in our migration stream, and
+ * when plugging new blocks, the page content might differ between source
+ * and destination (observable by the guest when not initializing pages
+ * after plugging them) until we're running on the destination (as we didn't
+ * migrate these blocks when they were unplugged).
+ */
+ return migration_in_incoming_postcopy() || !migration_is_idle();
+}
+
+typedef int (*virtio_mem_range_cb)(const VirtIOMEM *vmem, void *arg,
+ uint64_t offset, uint64_t size);
+
+static int virtio_mem_for_each_unplugged_range(const VirtIOMEM *vmem, void *arg,
+ virtio_mem_range_cb cb)
+{
+ unsigned long first_zero_bit, last_zero_bit;
+ uint64_t offset, size;
+ int ret = 0;
+
+ first_zero_bit = find_first_zero_bit(vmem->bitmap, vmem->bitmap_size);
+ while (first_zero_bit < vmem->bitmap_size) {
+ offset = first_zero_bit * vmem->block_size;
+ last_zero_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
+ first_zero_bit + 1) - 1;
+ size = (last_zero_bit - first_zero_bit + 1) * vmem->block_size;
+
+ ret = cb(vmem, arg, offset, size);
+ if (ret) {
+ break;
+ }
+ first_zero_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
+ last_zero_bit + 2);
+ }
+ return ret;
+}
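+
+/*
+ * Example: with block_size = 2 MiB and a 6-block bitmap where bits 0, 1, 4
+ * and 5 are set (plugged), the single zero-run covering bits 2..3 triggers
+ * one callback with offset = 4 MiB and size = 4 MiB.
+ */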
+
+/*
+ * Adjust the memory section to cover the intersection with the given range.
+ *
+ * Returns false if the intersection is empty, otherwise returns true.
+ */
+static bool virito_mem_intersect_memory_section(MemoryRegionSection *s,
+ uint64_t offset, uint64_t size)
+{
+ uint64_t start = MAX(s->offset_within_region, offset);
+ uint64_t end = MIN(s->offset_within_region + int128_get64(s->size),
+ offset + size);
+
+ if (end <= start) {
+ return false;
+ }
+
+ s->offset_within_address_space += start - s->offset_within_region;
+ s->offset_within_region = start;
+ s->size = int128_make64(end - start);
+ return true;
+}
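+
+/*
+ * Example: a listener section covering region offsets [0, 128 MiB)
+ * intersected with offset = 64 MiB, size = 2 MiB is narrowed to
+ * offset_within_region = 64 MiB, size = 2 MiB, with
+ * offset_within_address_space advanced by the same 64 MiB.
+ */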
+
+typedef int (*virtio_mem_section_cb)(MemoryRegionSection *s, void *arg);
+
+static int virtio_mem_for_each_plugged_section(const VirtIOMEM *vmem,
+ MemoryRegionSection *s,
+ void *arg,
+ virtio_mem_section_cb cb)
+{
+ unsigned long first_bit, last_bit;
+ uint64_t offset, size;
+ int ret = 0;
+
+ first_bit = s->offset_within_region / vmem->block_size;
+ first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, first_bit);
+ while (first_bit < vmem->bitmap_size) {
+ MemoryRegionSection tmp = *s;
+
+ offset = first_bit * vmem->block_size;
+ last_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
+ first_bit + 1) - 1;
+ size = (last_bit - first_bit + 1) * vmem->block_size;
+
+ if (!virito_mem_intersect_memory_section(&tmp, offset, size)) {
+ break;
+ }
+ ret = cb(&tmp, arg);
+ if (ret) {
+ break;
+ }
+ first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
+ last_bit + 2);
+ }
+ return ret;
+}
+
+static int virtio_mem_for_each_unplugged_section(const VirtIOMEM *vmem,
+ MemoryRegionSection *s,
+ void *arg,
+ virtio_mem_section_cb cb)
+{
+ unsigned long first_bit, last_bit;
+ uint64_t offset, size;
+ int ret = 0;
+
+ first_bit = s->offset_within_region / vmem->block_size;
+ first_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, first_bit);
+ while (first_bit < vmem->bitmap_size) {
+ MemoryRegionSection tmp = *s;
+
+ offset = first_bit * vmem->block_size;
+ last_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
+ first_bit + 1) - 1;
+ size = (last_bit - first_bit + 1) * vmem->block_size;
+
+ if (!virito_mem_intersect_memory_section(&tmp, offset, size)) {
+ break;
+ }
+ ret = cb(&tmp, arg);
+ if (ret) {
+ break;
+ }
+ first_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
+ last_bit + 2);
+ }
+ return ret;
+}
+
+static int virtio_mem_notify_populate_cb(MemoryRegionSection *s, void *arg)
+{
+ RamDiscardListener *rdl = arg;
+
+ return rdl->notify_populate(rdl, s);
+}
+
+static int virtio_mem_notify_discard_cb(MemoryRegionSection *s, void *arg)
+{
+ RamDiscardListener *rdl = arg;
+
+ rdl->notify_discard(rdl, s);
+ return 0;
+}
+
+static void virtio_mem_notify_unplug(VirtIOMEM *vmem, uint64_t offset,
+ uint64_t size)
+{
+ RamDiscardListener *rdl;
+
+ QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
+ MemoryRegionSection tmp = *rdl->section;
+
+ if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
+ continue;
+ }
+ rdl->notify_discard(rdl, &tmp);
+ }
+}
+
+static int virtio_mem_notify_plug(VirtIOMEM *vmem, uint64_t offset,
+ uint64_t size)
+{
+ RamDiscardListener *rdl, *rdl2;
+ int ret = 0;
+
+ QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
+ MemoryRegionSection tmp = *rdl->section;
+
+ if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
+ continue;
+ }
+ ret = rdl->notify_populate(rdl, &tmp);
+ if (ret) {
+ break;
+ }
+ }
+
+ if (ret) {
+ /* Roll back: notify a discard to all already-notified listeners. */
+ QLIST_FOREACH(rdl2, &vmem->rdl_list, next) {
+ MemoryRegionSection tmp = *rdl2->section;
+
+ if (rdl2 == rdl) {
+ break;
+ }
+ if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
+ continue;
+ }
+ rdl2->notify_discard(rdl2, &tmp);
+ }
+ }
+ return ret;
+}
+
+static void virtio_mem_notify_unplug_all(VirtIOMEM *vmem)
+{
+ RamDiscardListener *rdl;
+
+ if (!vmem->size) {
+ return;
+ }
+
+ QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
+ if (rdl->double_discard_supported) {
+ rdl->notify_discard(rdl, rdl->section);
+ } else {
+ virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
+ virtio_mem_notify_discard_cb);
+ }
+ }
+}
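+
+/*
+ * Listeners that support double discards can simply be told to discard
+ * their whole section; all other listeners are only notified for ranges
+ * that are currently plugged, so no range gets discarded twice.
+ */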
+
+static bool virtio_mem_test_bitmap(const VirtIOMEM *vmem, uint64_t start_gpa,
+ uint64_t size, bool plugged)
+{
+ const unsigned long first_bit = (start_gpa - vmem->addr) / vmem->block_size;
+ const unsigned long last_bit = first_bit + (size / vmem->block_size) - 1;
+ unsigned long found_bit;
+
+ /* We fake a shorter bitmap to avoid searching too far. */
+ if (plugged) {
+ found_bit = find_next_zero_bit(vmem->bitmap, last_bit + 1, first_bit);
+ } else {
+ found_bit = find_next_bit(vmem->bitmap, last_bit + 1, first_bit);
+ }
+ return found_bit > last_bit;
+}
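+
+/*
+ * Example (illustrative): to check whether blocks 2..3 are all plugged,
+ * the search is bounded by passing last_bit + 1 (here 4) as the bitmap
+ * size; find_next_zero_bit() then returns 4 ("not found") iff both bits
+ * are set, which is exactly when the function returns true.
+ */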
+
+static void virtio_mem_set_bitmap(VirtIOMEM *vmem, uint64_t start_gpa,
+ uint64_t size, bool plugged)
+{
+ const unsigned long bit = (start_gpa - vmem->addr) / vmem->block_size;
+ const unsigned long nbits = size / vmem->block_size;
+
+ if (plugged) {
+ bitmap_set(vmem->bitmap, bit, nbits);
+ } else {
+ bitmap_clear(vmem->bitmap, bit, nbits);
+ }
+}
+
+static void virtio_mem_send_response(VirtIOMEM *vmem, VirtQueueElement *elem,
+ struct virtio_mem_resp *resp)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(vmem);
+ VirtQueue *vq = vmem->vq;
+
+ trace_virtio_mem_send_response(le16_to_cpu(resp->type));
+ iov_from_buf(elem->in_sg, elem->in_num, 0, resp, sizeof(*resp));
+
+ virtqueue_push(vq, elem, sizeof(*resp));
+ virtio_notify(vdev, vq);
+}
+
+static void virtio_mem_send_response_simple(VirtIOMEM *vmem,
+ VirtQueueElement *elem,
+ uint16_t type)
+{
+ struct virtio_mem_resp resp = {
+ .type = cpu_to_le16(type),
+ };
+
+ virtio_mem_send_response(vmem, elem, &resp);
+}
+
+static bool virtio_mem_valid_range(const VirtIOMEM *vmem, uint64_t gpa,
+ uint64_t size)
+{
+ if (!QEMU_IS_ALIGNED(gpa, vmem->block_size)) {
+ return false;
+ }
+ if (gpa + size < gpa || !size) {
+ return false;
+ }
+ if (gpa < vmem->addr || gpa >= vmem->addr + vmem->usable_region_size) {
+ return false;
+ }
+ if (gpa + size > vmem->addr + vmem->usable_region_size) {
+ return false;
+ }
+ return true;
+}
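+
+/*
+ * Example (illustrative values): with addr = 0x100000000, block_size =
+ * 0x200000 and usable_region_size = 0x40000000, a request for gpa
+ * 0x100000000 with size 0x400000 is valid; gpa 0x100100000 is rejected
+ * as unaligned, and any range reaching past 0x140000000 is rejected as
+ * out of bounds.
+ */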
+
+static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
+ uint64_t size, bool plug)
+{
+ const uint64_t offset = start_gpa - vmem->addr;
+ RAMBlock *rb = vmem->memdev->mr.ram_block;
+
+ if (virtio_mem_is_busy()) {
+ return -EBUSY;
+ }
+
+ if (!plug) {
+ if (ram_block_discard_range(rb, offset, size)) {
+ return -EBUSY;
+ }
+ virtio_mem_notify_unplug(vmem, offset, size);
+ } else {
+ int ret = 0;
+
+ if (vmem->prealloc) {
+ void *area = memory_region_get_ram_ptr(&vmem->memdev->mr) + offset;
+ int fd = memory_region_get_fd(&vmem->memdev->mr);
+ Error *local_err = NULL;
+
+ qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err);
+ if (local_err) {
+ static bool warned;
+
+ /*
+ * Warn only once; we don't want to fill the log with these
+ * warnings.
+ */
+ if (!warned) {
+ warn_report_err(local_err);
+ warned = true;
+ } else {
+ error_free(local_err);
+ }
+ ret = -EBUSY;
+ }
+ }
+ if (!ret) {
+ ret = virtio_mem_notify_plug(vmem, offset, size);
+ }
+
+ if (ret) {
+ /* Preallocation or a notifier may have populated memory; discard again. */
+ ram_block_discard_range(vmem->memdev->mr.ram_block, offset, size);
+ return -EBUSY;
+ }
+ }
+ virtio_mem_set_bitmap(vmem, start_gpa, size, plug);
+ return 0;
+}
+
+static int virtio_mem_state_change_request(VirtIOMEM *vmem, uint64_t gpa,
+ uint16_t nb_blocks, bool plug)
+{
+ const uint64_t size = nb_blocks * vmem->block_size;
+ int ret;
+
+ if (!virtio_mem_valid_range(vmem, gpa, size)) {
+ return VIRTIO_MEM_RESP_ERROR;
+ }
+
+ if (plug && (vmem->size + size > vmem->requested_size)) {
+ return VIRTIO_MEM_RESP_NACK;
+ }
+
+ /* Test whether all blocks really are in the opposite state. */
+ if (!virtio_mem_test_bitmap(vmem, gpa, size, !plug)) {
+ return VIRTIO_MEM_RESP_ERROR;
+ }
+
+ ret = virtio_mem_set_block_state(vmem, gpa, size, plug);
+ if (ret) {
+ return VIRTIO_MEM_RESP_BUSY;
+ }
+ if (plug) {
+ vmem->size += size;
+ } else {
+ vmem->size -= size;
+ }
+ notifier_list_notify(&vmem->size_change_notifiers, &vmem->size);
+ return VIRTIO_MEM_RESP_ACK;
+}
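+
+/*
+ * Response codes produced above: ERROR for malformed or mixed-state
+ * ranges, NACK when plugging would exceed the requested size, BUSY when
+ * discarding or preallocation is currently not possible, and ACK on
+ * success, after which the new size is broadcast to all size change
+ * notifiers.
+ */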
+
+static void virtio_mem_plug_request(VirtIOMEM *vmem, VirtQueueElement *elem,
+ struct virtio_mem_req *req)
+{
+ const uint64_t gpa = le64_to_cpu(req->u.plug.addr);
+ const uint16_t nb_blocks = le16_to_cpu(req->u.plug.nb_blocks);
+ uint16_t type;
+
+ trace_virtio_mem_plug_request(gpa, nb_blocks);
+ type = virtio_mem_state_change_request(vmem, gpa, nb_blocks, true);
+ virtio_mem_send_response_simple(vmem, elem, type);
+}
+
+static void virtio_mem_unplug_request(VirtIOMEM *vmem, VirtQueueElement *elem,
+ struct virtio_mem_req *req)
+{
+ const uint64_t gpa = le64_to_cpu(req->u.unplug.addr);
+ const uint16_t nb_blocks = le16_to_cpu(req->u.unplug.nb_blocks);
+ uint16_t type;
+
+ trace_virtio_mem_unplug_request(gpa, nb_blocks);
+ type = virtio_mem_state_change_request(vmem, gpa, nb_blocks, false);
+ virtio_mem_send_response_simple(vmem, elem, type);
+}
+
+static void virtio_mem_resize_usable_region(VirtIOMEM *vmem,
+ uint64_t requested_size,
+ bool can_shrink)
+{
+ uint64_t newsize = MIN(memory_region_size(&vmem->memdev->mr),
+ requested_size + VIRTIO_MEM_USABLE_EXTENT);
+
+ /* The usable region size always has to be a multiple of the block size. */
+ newsize = QEMU_ALIGN_UP(newsize, vmem->block_size);
+
+ if (!requested_size) {
+ newsize = 0;
+ }
+
+ if (newsize < vmem->usable_region_size && !can_shrink) {
+ return;
+ }
+
+ trace_virtio_mem_resized_usable_region(vmem->usable_region_size, newsize);
+ vmem->usable_region_size = newsize;
+}
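+
+/*
+ * Example (illustrative values): with a 4 GiB memdev and, say, a 1 GiB
+ * usable extent, requesting 1 GiB grows the usable region to 2 GiB
+ * (aligned up to the block size); requesting 0 shrinks it to 0, but only
+ * if can_shrink allows it.
+ */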
+
+static int virtio_mem_unplug_all(VirtIOMEM *vmem)
+{
+ RAMBlock *rb = vmem->memdev->mr.ram_block;
+
+ if (virtio_mem_is_busy()) {
+ return -EBUSY;
+ }
+
+ if (ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb))) {
+ return -EBUSY;
+ }
+ virtio_mem_notify_unplug_all(vmem);
+
+ bitmap_clear(vmem->bitmap, 0, vmem->bitmap_size);
+ if (vmem->size) {
+ vmem->size = 0;
+ notifier_list_notify(&vmem->size_change_notifiers, &vmem->size);
+ }
+ trace_virtio_mem_unplugged_all();
+ virtio_mem_resize_usable_region(vmem, vmem->requested_size, true);
+ return 0;
+}
+
+static void virtio_mem_unplug_all_request(VirtIOMEM *vmem,
+ VirtQueueElement *elem)
+{
+ trace_virtio_mem_unplug_all_request();
+ if (virtio_mem_unplug_all(vmem)) {
+ virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_BUSY);
+ } else {
+ virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_ACK);
+ }
+}
+
+static void virtio_mem_state_request(VirtIOMEM *vmem, VirtQueueElement *elem,
+ struct virtio_mem_req *req)
+{
+ const uint16_t nb_blocks = le16_to_cpu(req->u.state.nb_blocks);
+ const uint64_t gpa = le64_to_cpu(req->u.state.addr);
+ const uint64_t size = nb_blocks * vmem->block_size;
+ struct virtio_mem_resp resp = {
+ .type = cpu_to_le16(VIRTIO_MEM_RESP_ACK),
+ };
+
+ trace_virtio_mem_state_request(gpa, nb_blocks);
+ if (!virtio_mem_valid_range(vmem, gpa, size)) {
+ virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_ERROR);
+ return;
+ }
+
+ if (virtio_mem_test_bitmap(vmem, gpa, size, true)) {
+ resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_PLUGGED);
+ } else if (virtio_mem_test_bitmap(vmem, gpa, size, false)) {
+ resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_UNPLUGGED);
+ } else {
+ resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_MIXED);
+ }
+ trace_virtio_mem_state_response(le16_to_cpu(resp.u.state.state));
+ virtio_mem_send_response(vmem, elem, &resp);
+}
+
+static void virtio_mem_handle_request(VirtIODevice *vdev, VirtQueue *vq)
+{
+ const int len = sizeof(struct virtio_mem_req);
+ VirtIOMEM *vmem = VIRTIO_MEM(vdev);
+ VirtQueueElement *elem;
+ struct virtio_mem_req req;
+ uint16_t type;
+
+ while (true) {
+ elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
+ if (!elem) {
+ return;
+ }
+
+ if (iov_to_buf(elem->out_sg, elem->out_num, 0, &req, len) < len) {
+ virtio_error(vdev, "virtio-mem protocol violation: invalid request"
+ " size: %d", len);
+ virtqueue_detach_element(vq, elem, 0);
+ g_free(elem);
+ return;
+ }
+
+ if (iov_size(elem->in_sg, elem->in_num) <
+ sizeof(struct virtio_mem_resp)) {
+ virtio_error(vdev, "virtio-mem protocol violation: not enough space"
+ " for response: %zu",
+ iov_size(elem->in_sg, elem->in_num));
+ virtqueue_detach_element(vq, elem, 0);
+ g_free(elem);
+ return;
+ }
+
+ type = le16_to_cpu(req.type);
+ switch (type) {
+ case VIRTIO_MEM_REQ_PLUG:
+ virtio_mem_plug_request(vmem, elem, &req);
+ break;
+ case VIRTIO_MEM_REQ_UNPLUG:
+ virtio_mem_unplug_request(vmem, elem, &req);
+ break;
+ case VIRTIO_MEM_REQ_UNPLUG_ALL:
+ virtio_mem_unplug_all_request(vmem, elem);
+ break;
+ case VIRTIO_MEM_REQ_STATE:
+ virtio_mem_state_request(vmem, elem, &req);
+ break;
+ default:
+ virtio_error(vdev, "virtio-mem protocol violation: unknown request"
+ " type: %d", type);
+ virtqueue_detach_element(vq, elem, 0);
+ g_free(elem);
+ return;
+ }
+
+ g_free(elem);
+ }
+}
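+
+/*
+ * Each request is a fixed-size struct virtio_mem_req read from the
+ * element's out-buffers; the matching struct virtio_mem_resp is written
+ * back through the in-buffers by virtio_mem_send_response(). Only on
+ * protocol violations is an element detached without a response.
+ */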
+
+static void virtio_mem_get_config(VirtIODevice *vdev, uint8_t *config_data)
+{
+ VirtIOMEM *vmem = VIRTIO_MEM(vdev);
+ struct virtio_mem_config *config = (void *) config_data;
+
+ config->block_size = cpu_to_le64(vmem->block_size);
+ config->node_id = cpu_to_le16(vmem->node);
+ config->requested_size = cpu_to_le64(vmem->requested_size);
+ config->plugged_size = cpu_to_le64(vmem->size);
+ config->addr = cpu_to_le64(vmem->addr);
+ config->region_size = cpu_to_le64(memory_region_size(&vmem->memdev->mr));
+ config->usable_region_size = cpu_to_le64(vmem->usable_region_size);
+}
+
+static uint64_t virtio_mem_get_features(VirtIODevice *vdev, uint64_t features,
+ Error **errp)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
+ VirtIOMEM *vmem = VIRTIO_MEM(vdev);
+
+ if (ms->numa_state) {
+#if defined(CONFIG_ACPI)
+ virtio_add_feature(&features, VIRTIO_MEM_F_ACPI_PXM);
+#endif
+ }
+ assert(vmem->unplugged_inaccessible != ON_OFF_AUTO_AUTO);
+ if (vmem->unplugged_inaccessible == ON_OFF_AUTO_ON) {
+ virtio_add_feature(&features, VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE);
+ }
+ return features;
+}
+
+static int virtio_mem_validate_features(VirtIODevice *vdev)
+{
+ if (virtio_host_has_feature(vdev, VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE) &&
+ !virtio_vdev_has_feature(vdev, VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE)) {
+ return -EFAULT;
+ }
+ return 0;
+}
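+
+/*
+ * A driver that does not accept VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE while
+ * the device offers it would be allowed to access unplugged memory, which
+ * the device cannot support in that configuration; negotiation is
+ * therefore failed.
+ */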
+
+static void virtio_mem_system_reset(void *opaque)
+{
+ VirtIOMEM *vmem = VIRTIO_MEM(opaque);
+
+ /*
+ * During usual resets, we will unplug all memory and shrink the usable
+ * region size. This is, however, not possible in all scenarios. Then,
+ * the guest has to deal with this manually (VIRTIO_MEM_REQ_UNPLUG_ALL).
+ */
+ virtio_mem_unplug_all(vmem);
+}
+
+static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
+ int nb_numa_nodes = ms->numa_state ? ms->numa_state->num_nodes : 0;
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VirtIOMEM *vmem = VIRTIO_MEM(dev);
+ uint64_t page_size;
+ RAMBlock *rb;
+ int ret;
+
+ if (!vmem->memdev) {
+ error_setg(errp, "'%s' property is not set", VIRTIO_MEM_MEMDEV_PROP);
+ return;
+ } else if (host_memory_backend_is_mapped(vmem->memdev)) {
+ error_setg(errp, "'%s' property specifies a busy memdev: %s",
+ VIRTIO_MEM_MEMDEV_PROP,
+ object_get_canonical_path_component(OBJECT(vmem->memdev)));
+ return;
+ } else if (!memory_region_is_ram(&vmem->memdev->mr) ||
+ memory_region_is_rom(&vmem->memdev->mr) ||
+ !vmem->memdev->mr.ram_block) {
+ error_setg(errp, "'%s' property specifies an unsupported memdev",
+ VIRTIO_MEM_MEMDEV_PROP);
+ return;
+ }
+
+ if ((nb_numa_nodes && vmem->node >= nb_numa_nodes) ||
+ (!nb_numa_nodes && vmem->node)) {
+ error_setg(errp, "'%s' property has value '%" PRIu32 "', which exceeds"
+ "the number of numa nodes: %d", VIRTIO_MEM_NODE_PROP,
+ vmem->node, nb_numa_nodes ? nb_numa_nodes : 1);
+ return;
+ }
+
+ if (enable_mlock) {
+ error_setg(errp, "Incompatible with mlock");
+ return;
+ }
+
+ rb = vmem->memdev->mr.ram_block;
+ page_size = qemu_ram_pagesize(rb);
+
+#if defined(VIRTIO_MEM_HAS_LEGACY_GUESTS)
+ switch (vmem->unplugged_inaccessible) {
+ case ON_OFF_AUTO_AUTO:
+ if (virtio_mem_has_shared_zeropage(rb)) {
+ vmem->unplugged_inaccessible = ON_OFF_AUTO_OFF;
+ } else {
+ vmem->unplugged_inaccessible = ON_OFF_AUTO_ON;
+ }
+ break;
+ case ON_OFF_AUTO_OFF:
+ if (!virtio_mem_has_shared_zeropage(rb)) {
+ warn_report("'%s' property set to 'off' with a memdev that does"
+ " not support the shared zeropage.",
+ VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP);
+ }
+ break;
+ default:
+ break;
+ }
+#else /* VIRTIO_MEM_HAS_LEGACY_GUESTS */
+ vmem->unplugged_inaccessible = ON_OFF_AUTO_ON;
+#endif /* VIRTIO_MEM_HAS_LEGACY_GUESTS */
+
+ /*
+ * If the block size wasn't configured by the user, use a sane default. This
+ * allows using hugetlbfs backends of any page size without manual
+ * intervention.
+ */
+ if (!vmem->block_size) {
+ vmem->block_size = virtio_mem_default_block_size(rb);
+ }
+
+ if (vmem->block_size < page_size) {
+ error_setg(errp, "'%s' property has to be at least the page size (0x%"
+ PRIx64 ")", VIRTIO_MEM_BLOCK_SIZE_PROP, page_size);
+ return;
+ } else if (vmem->block_size < virtio_mem_default_block_size(rb)) {
+ warn_report("'%s' property is smaller than the default block size (%"
+ PRIx64 " MiB)", VIRTIO_MEM_BLOCK_SIZE_PROP,
+ virtio_mem_default_block_size(rb) / MiB);
+ }
+ if (!QEMU_IS_ALIGNED(vmem->requested_size, vmem->block_size)) {
+ error_setg(errp, "'%s' property has to be multiples of '%s' (0x%" PRIx64
+ ")", VIRTIO_MEM_REQUESTED_SIZE_PROP,
+ VIRTIO_MEM_BLOCK_SIZE_PROP, vmem->block_size);
+ return;
+ } else if (!QEMU_IS_ALIGNED(vmem->addr, vmem->block_size)) {
+ error_setg(errp, "'%s' property has to be multiples of '%s' (0x%" PRIx64
+ ")", VIRTIO_MEM_ADDR_PROP, VIRTIO_MEM_BLOCK_SIZE_PROP,
+ vmem->block_size);
+ return;
+ } else if (!QEMU_IS_ALIGNED(memory_region_size(&vmem->memdev->mr),
+ vmem->block_size)) {
+ error_setg(errp, "'%s' property memdev size has to be multiples of"
+ "'%s' (0x%" PRIx64 ")", VIRTIO_MEM_MEMDEV_PROP,
+ VIRTIO_MEM_BLOCK_SIZE_PROP, vmem->block_size);
+ return;
+ }
+
+ if (ram_block_coordinated_discard_require(true)) {
+ error_setg(errp, "Discarding RAM is disabled");
+ return;
+ }
+
+ ret = ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb));
+ if (ret) {
+ error_setg_errno(errp, -ret, "Unexpected error discarding RAM");
+ ram_block_coordinated_discard_require(false);
+ return;
+ }
+
+ virtio_mem_resize_usable_region(vmem, vmem->requested_size, true);
+
+ vmem->bitmap_size = memory_region_size(&vmem->memdev->mr) /
+ vmem->block_size;
+ vmem->bitmap = bitmap_new(vmem->bitmap_size);
+
+ virtio_init(vdev, VIRTIO_ID_MEM, sizeof(struct virtio_mem_config));
+ vmem->vq = virtio_add_queue(vdev, 128, virtio_mem_handle_request);
+
+ host_memory_backend_set_mapped(vmem->memdev, true);
+ vmstate_register_ram(&vmem->memdev->mr, DEVICE(vmem));
+ qemu_register_reset(virtio_mem_system_reset, vmem);
+
+ /*
+ * Set ourselves as RamDiscardManager before the plug handler maps the
+ * memory region and exposes it via an address space.
+ */
+ memory_region_set_ram_discard_manager(&vmem->memdev->mr,
+ RAM_DISCARD_MANAGER(vmem));
+}
+
+static void virtio_mem_device_unrealize(DeviceState *dev)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VirtIOMEM *vmem = VIRTIO_MEM(dev);
+
+ /*
+ * The unplug handler unmapped the memory region, it cannot be
+ * found via an address space anymore. Unset ourselves.
+ */
+ memory_region_set_ram_discard_manager(&vmem->memdev->mr, NULL);
+ qemu_unregister_reset(virtio_mem_system_reset, vmem);
+ vmstate_unregister_ram(&vmem->memdev->mr, DEVICE(vmem));
+ host_memory_backend_set_mapped(vmem->memdev, false);
+ virtio_del_queue(vdev, 0);
+ virtio_cleanup(vdev);
+ g_free(vmem->bitmap);
+ ram_block_coordinated_discard_require(false);
+}
+
+static int virtio_mem_discard_range_cb(const VirtIOMEM *vmem, void *arg,
+ uint64_t offset, uint64_t size)
+{
+ RAMBlock *rb = vmem->memdev->mr.ram_block;
+
+ return ram_block_discard_range(rb, offset, size) ? -EINVAL : 0;
+}
+
+static int virtio_mem_restore_unplugged(VirtIOMEM *vmem)
+{
+ /* Make sure all memory is really discarded after migration. */
+ return virtio_mem_for_each_unplugged_range(vmem, NULL,
+ virtio_mem_discard_range_cb);
+}
+
+static int virtio_mem_post_load(void *opaque, int version_id)
+{
+ VirtIOMEM *vmem = VIRTIO_MEM(opaque);
+ RamDiscardListener *rdl;
+ int ret;
+
+ /*
+ * We started out with all memory discarded and our memory region is mapped
+ * into an address space. Replay, now that we updated the bitmap.
+ */
+ QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
+ ret = virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
+ virtio_mem_notify_populate_cb);
+ if (ret) {
+ return ret;
+ }
+ }
+
+ if (migration_in_incoming_postcopy()) {
+ return 0;
+ }
+
+ return virtio_mem_restore_unplugged(vmem);
+}
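+
+/*
+ * Example flow (illustrative): after migrating a device with 1 GiB
+ * plugged out of a 4 GiB memdev, post-load replays notify_populate()
+ * for the plugged runs and re-discards the remaining 3 GiB; the
+ * re-discard is skipped while in incoming postcopy.
+ */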
+
+typedef struct VirtIOMEMMigSanityChecks {
+ VirtIOMEM *parent;
+ uint64_t addr;
+ uint64_t region_size;
+ uint64_t block_size;
+ uint32_t node;
+} VirtIOMEMMigSanityChecks;
+
+static int virtio_mem_mig_sanity_checks_pre_save(void *opaque)
+{
+ VirtIOMEMMigSanityChecks *tmp = opaque;
+ VirtIOMEM *vmem = tmp->parent;
+
+ tmp->addr = vmem->addr;
+ tmp->region_size = memory_region_size(&vmem->memdev->mr);
+ tmp->block_size = vmem->block_size;
+ tmp->node = vmem->node;
+ return 0;
+}
+
+static int virtio_mem_mig_sanity_checks_post_load(void *opaque, int version_id)
+{
+ VirtIOMEMMigSanityChecks *tmp = opaque;
+ VirtIOMEM *vmem = tmp->parent;
+ const uint64_t new_region_size = memory_region_size(&vmem->memdev->mr);
+
+ if (tmp->addr != vmem->addr) {
+ error_report("Property '%s' changed from 0x%" PRIx64 " to 0x%" PRIx64,
+ VIRTIO_MEM_ADDR_PROP, tmp->addr, vmem->addr);
+ return -EINVAL;
+ }
+ /*
+ * Note: Preparation for resizeable memory regions. The maximum size
+ * of the memory region must not change during migration.
+ */
+ if (tmp->region_size != new_region_size) {
+ error_report("Property '%s' size changed from 0x%" PRIx64 " to 0x%"
+ PRIx64, VIRTIO_MEM_MEMDEV_PROP, tmp->region_size,
+ new_region_size);
+ return -EINVAL;
+ }
+ if (tmp->block_size != vmem->block_size) {
+ error_report("Property '%s' changed from 0x%" PRIx64 " to 0x%" PRIx64,
+ VIRTIO_MEM_BLOCK_SIZE_PROP, tmp->block_size,
+ vmem->block_size);
+ return -EINVAL;
+ }
+ if (tmp->node != vmem->node) {
+ error_report("Property '%s' changed from %" PRIu32 " to %" PRIu32,
+ VIRTIO_MEM_NODE_PROP, tmp->node, vmem->node);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static const VMStateDescription vmstate_virtio_mem_sanity_checks = {
+ .name = "virtio-mem-device/sanity-checks",
+ .pre_save = virtio_mem_mig_sanity_checks_pre_save,
+ .post_load = virtio_mem_mig_sanity_checks_post_load,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT64(addr, VirtIOMEMMigSanityChecks),
+ VMSTATE_UINT64(region_size, VirtIOMEMMigSanityChecks),
+ VMSTATE_UINT64(block_size, VirtIOMEMMigSanityChecks),
+ VMSTATE_UINT32(node, VirtIOMEMMigSanityChecks),
+ VMSTATE_END_OF_LIST(),
+ },
+};
+
+static const VMStateDescription vmstate_virtio_mem_device = {
+ .name = "virtio-mem-device",
+ .minimum_version_id = 1,
+ .version_id = 1,
+ .priority = MIG_PRI_VIRTIO_MEM,
+ .post_load = virtio_mem_post_load,
+ .fields = (VMStateField[]) {
+ VMSTATE_WITH_TMP(VirtIOMEM, VirtIOMEMMigSanityChecks,
+ vmstate_virtio_mem_sanity_checks),
+ VMSTATE_UINT64(usable_region_size, VirtIOMEM),
+ VMSTATE_UINT64(size, VirtIOMEM),
+ VMSTATE_UINT64(requested_size, VirtIOMEM),
+ VMSTATE_BITMAP(bitmap, VirtIOMEM, 0, bitmap_size),
+ VMSTATE_END_OF_LIST()
+ },
+};
+
+static const VMStateDescription vmstate_virtio_mem = {
+ .name = "virtio-mem",
+ .minimum_version_id = 1,
+ .version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_VIRTIO_DEVICE,
+ VMSTATE_END_OF_LIST()
+ },
+};
+
+static void virtio_mem_fill_device_info(const VirtIOMEM *vmem,
+ VirtioMEMDeviceInfo *vi)
+{
+ vi->memaddr = vmem->addr;
+ vi->node = vmem->node;
+ vi->requested_size = vmem->requested_size;
+ vi->size = vmem->size;
+ vi->max_size = memory_region_size(&vmem->memdev->mr);
+ vi->block_size = vmem->block_size;
+ vi->memdev = object_get_canonical_path(OBJECT(vmem->memdev));
+}
+
+static MemoryRegion *virtio_mem_get_memory_region(VirtIOMEM *vmem, Error **errp)
+{
+ if (!vmem->memdev) {
+ error_setg(errp, "'%s' property must be set", VIRTIO_MEM_MEMDEV_PROP);
+ return NULL;
+ }
+
+ return &vmem->memdev->mr;
+}
+
+static void virtio_mem_add_size_change_notifier(VirtIOMEM *vmem,
+ Notifier *notifier)
+{
+ notifier_list_add(&vmem->size_change_notifiers, notifier);
+}
+
+static void virtio_mem_remove_size_change_notifier(VirtIOMEM *vmem,
+ Notifier *notifier)
+{
+ notifier_remove(notifier);
+}
+
+static void virtio_mem_get_size(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ const VirtIOMEM *vmem = VIRTIO_MEM(obj);
+ uint64_t value = vmem->size;
+
+ visit_type_size(v, name, &value, errp);
+}
+
+static void virtio_mem_get_requested_size(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ const VirtIOMEM *vmem = VIRTIO_MEM(obj);
+ uint64_t value = vmem->requested_size;
+
+ visit_type_size(v, name, &value, errp);
+}
+
+static void virtio_mem_set_requested_size(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ VirtIOMEM *vmem = VIRTIO_MEM(obj);
+ Error *err = NULL;
+ uint64_t value;
+
+ visit_type_size(v, name, &value, &err);
+ if (err) {
+ error_propagate(errp, err);
+ return;
+ }
+
+ /*
+ * The block size and memory backend are not fixed until the device is
+ * realized. realize() will verify these properties then.
+ */
+ if (DEVICE(obj)->realized) {
+ if (!QEMU_IS_ALIGNED(value, vmem->block_size)) {
+ error_setg(errp, "'%s' has to be multiples of '%s' (0x%" PRIx64
+ ")", name, VIRTIO_MEM_BLOCK_SIZE_PROP,
+ vmem->block_size);
+ return;
+ } else if (value > memory_region_size(&vmem->memdev->mr)) {
+ error_setg(errp, "'%s' cannot exceed the memory backend size"
+ "(0x%" PRIx64 ")", name,
+ memory_region_size(&vmem->memdev->mr));
+ return;
+ }
+
+ if (value != vmem->requested_size) {
+ virtio_mem_resize_usable_region(vmem, value, false);
+ vmem->requested_size = value;
+ }
+ /*
+ * Trigger a config update so the guest gets notified. We trigger
+ * even if the size didn't change (especially helpful for debugging).
+ */
+ virtio_notify_config(VIRTIO_DEVICE(vmem));
+ } else {
+ vmem->requested_size = value;
+ }
+}
+
+static void virtio_mem_get_block_size(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ const VirtIOMEM *vmem = VIRTIO_MEM(obj);
+ uint64_t value = vmem->block_size;
+
+ /*
+ * If not configured by the user (and we're not realized yet), use the
+ * default block size we would use with the current memory backend.
+ */
+ if (!value) {
+ if (vmem->memdev && memory_region_is_ram(&vmem->memdev->mr)) {
+ value = virtio_mem_default_block_size(vmem->memdev->mr.ram_block);
+ } else {
+ value = virtio_mem_thp_size();
+ }
+ }
+
+ visit_type_size(v, name, &value, errp);
+}
+
+static void virtio_mem_set_block_size(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ VirtIOMEM *vmem = VIRTIO_MEM(obj);
+ Error *err = NULL;
+ uint64_t value;
+
+ if (DEVICE(obj)->realized) {
+ error_setg(errp, "'%s' cannot be changed", name);
+ return;
+ }
+
+ visit_type_size(v, name, &value, &err);
+ if (err) {
+ error_propagate(errp, err);
+ return;
+ }
+
+ if (value < VIRTIO_MEM_MIN_BLOCK_SIZE) {
+ error_setg(errp, "'%s' property has to be at least 0x%" PRIx32, name,
+ VIRTIO_MEM_MIN_BLOCK_SIZE);
+ return;
+ } else if (!is_power_of_2(value)) {
+ error_setg(errp, "'%s' property has to be a power of two", name);
+ return;
+ }
+ vmem->block_size = value;
+}
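+
+/*
+ * Example (illustrative): a command line like
+ * "-device virtio-mem-pci,id=vmem0,memdev=mem0,block-size=4M" passes the
+ * checks above (at least the minimum block size and a power of two),
+ * whereas block-size=3M would be rejected by the power-of-two check.
+ */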
+
+static void virtio_mem_instance_init(Object *obj)
+{
+ VirtIOMEM *vmem = VIRTIO_MEM(obj);
+
+ notifier_list_init(&vmem->size_change_notifiers);
+ QLIST_INIT(&vmem->rdl_list);
+
+ object_property_add(obj, VIRTIO_MEM_SIZE_PROP, "size", virtio_mem_get_size,
+ NULL, NULL, NULL);
+ object_property_add(obj, VIRTIO_MEM_REQUESTED_SIZE_PROP, "size",
+ virtio_mem_get_requested_size,
+ virtio_mem_set_requested_size, NULL, NULL);
+ object_property_add(obj, VIRTIO_MEM_BLOCK_SIZE_PROP, "size",
+ virtio_mem_get_block_size, virtio_mem_set_block_size,
+ NULL, NULL);
+}
+
+static Property virtio_mem_properties[] = {
+ DEFINE_PROP_UINT64(VIRTIO_MEM_ADDR_PROP, VirtIOMEM, addr, 0),
+ DEFINE_PROP_UINT32(VIRTIO_MEM_NODE_PROP, VirtIOMEM, node, 0),
+ DEFINE_PROP_BOOL(VIRTIO_MEM_PREALLOC_PROP, VirtIOMEM, prealloc, false),
+ DEFINE_PROP_LINK(VIRTIO_MEM_MEMDEV_PROP, VirtIOMEM, memdev,
+ TYPE_MEMORY_BACKEND, HostMemoryBackend *),
+#if defined(VIRTIO_MEM_HAS_LEGACY_GUESTS)
+ DEFINE_PROP_ON_OFF_AUTO(VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP, VirtIOMEM,
+ unplugged_inaccessible, ON_OFF_AUTO_AUTO),
+#endif
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static uint64_t virtio_mem_rdm_get_min_granularity(const RamDiscardManager *rdm,
+ const MemoryRegion *mr)
+{
+ const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
+
+ g_assert(mr == &vmem->memdev->mr);
+ return vmem->block_size;
+}
+
+static bool virtio_mem_rdm_is_populated(const RamDiscardManager *rdm,
+ const MemoryRegionSection *s)
+{
+ const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
+ uint64_t start_gpa = vmem->addr + s->offset_within_region;
+ uint64_t end_gpa = start_gpa + int128_get64(s->size);
+
+ g_assert(s->mr == &vmem->memdev->mr);
+
+ start_gpa = QEMU_ALIGN_DOWN(start_gpa, vmem->block_size);
+ end_gpa = QEMU_ALIGN_UP(end_gpa, vmem->block_size);
+
+ if (!virtio_mem_valid_range(vmem, start_gpa, end_gpa - start_gpa)) {
+ return false;
+ }
+
+ return virtio_mem_test_bitmap(vmem, start_gpa, end_gpa - start_gpa, true);
+}
+
+struct VirtIOMEMReplayData {
+ void *fn;
+ void *opaque;
+};
+
+static int virtio_mem_rdm_replay_populated_cb(MemoryRegionSection *s, void *arg)
+{
+ struct VirtIOMEMReplayData *data = arg;
+
+ return ((ReplayRamPopulate)data->fn)(s, data->opaque);
+}
+
+static int virtio_mem_rdm_replay_populated(const RamDiscardManager *rdm,
+ MemoryRegionSection *s,
+ ReplayRamPopulate replay_fn,
+ void *opaque)
+{
+ const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
+ struct VirtIOMEMReplayData data = {
+ .fn = replay_fn,
+ .opaque = opaque,
+ };
+
+ g_assert(s->mr == &vmem->memdev->mr);
+ return virtio_mem_for_each_plugged_section(vmem, s, &data,
+ virtio_mem_rdm_replay_populated_cb);
+}
+
+static int virtio_mem_rdm_replay_discarded_cb(MemoryRegionSection *s,
+ void *arg)
+{
+ struct VirtIOMEMReplayData *data = arg;
+
+ ((ReplayRamDiscard)data->fn)(s, data->opaque);
+ return 0;
+}
+
+static void virtio_mem_rdm_replay_discarded(const RamDiscardManager *rdm,
+ MemoryRegionSection *s,
+ ReplayRamDiscard replay_fn,
+ void *opaque)
+{
+ const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
+ struct VirtIOMEMReplayData data = {
+ .fn = replay_fn,
+ .opaque = opaque,
+ };
+
+ g_assert(s->mr == &vmem->memdev->mr);
+ virtio_mem_for_each_unplugged_section(vmem, s, &data,
+ virtio_mem_rdm_replay_discarded_cb);
+}
+
+static void virtio_mem_rdm_register_listener(RamDiscardManager *rdm,
+ RamDiscardListener *rdl,
+ MemoryRegionSection *s)
+{
+ VirtIOMEM *vmem = VIRTIO_MEM(rdm);
+ int ret;
+
+ g_assert(s->mr == &vmem->memdev->mr);
+ rdl->section = memory_region_section_new_copy(s);
+
+ QLIST_INSERT_HEAD(&vmem->rdl_list, rdl, next);
+ ret = virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
+ virtio_mem_notify_populate_cb);
+ if (ret) {
+ error_report("%s: Replaying plugged ranges failed: %s", __func__,
+ strerror(-ret));
+ }
+}
+
+static void virtio_mem_rdm_unregister_listener(RamDiscardManager *rdm,
+ RamDiscardListener *rdl)
+{
+ VirtIOMEM *vmem = VIRTIO_MEM(rdm);
+
+ g_assert(rdl->section->mr == &vmem->memdev->mr);
+ if (vmem->size) {
+ if (rdl->double_discard_supported) {
+ rdl->notify_discard(rdl, rdl->section);
+ } else {
+ virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
+ virtio_mem_notify_discard_cb);
+ }
+ }
+
+ memory_region_section_free_copy(rdl->section);
+ rdl->section = NULL;
+ QLIST_REMOVE(rdl, next);
+}
+
+static void virtio_mem_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+ VirtIOMEMClass *vmc = VIRTIO_MEM_CLASS(klass);
+ RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_CLASS(klass);
+
+ device_class_set_props(dc, virtio_mem_properties);
+ dc->vmsd = &vmstate_virtio_mem;
+
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+ vdc->realize = virtio_mem_device_realize;
+ vdc->unrealize = virtio_mem_device_unrealize;
+ vdc->get_config = virtio_mem_get_config;
+ vdc->get_features = virtio_mem_get_features;
+ vdc->validate_features = virtio_mem_validate_features;
+ vdc->vmsd = &vmstate_virtio_mem_device;
+
+ vmc->fill_device_info = virtio_mem_fill_device_info;
+ vmc->get_memory_region = virtio_mem_get_memory_region;
+ vmc->add_size_change_notifier = virtio_mem_add_size_change_notifier;
+ vmc->remove_size_change_notifier = virtio_mem_remove_size_change_notifier;
+
+ rdmc->get_min_granularity = virtio_mem_rdm_get_min_granularity;
+ rdmc->is_populated = virtio_mem_rdm_is_populated;
+ rdmc->replay_populated = virtio_mem_rdm_replay_populated;
+ rdmc->replay_discarded = virtio_mem_rdm_replay_discarded;
+ rdmc->register_listener = virtio_mem_rdm_register_listener;
+ rdmc->unregister_listener = virtio_mem_rdm_unregister_listener;
+}
+
+static const TypeInfo virtio_mem_info = {
+ .name = TYPE_VIRTIO_MEM,
+ .parent = TYPE_VIRTIO_DEVICE,
+ .instance_size = sizeof(VirtIOMEM),
+ .instance_init = virtio_mem_instance_init,
+ .class_init = virtio_mem_class_init,
+ .class_size = sizeof(VirtIOMEMClass),
+ .interfaces = (InterfaceInfo[]) {
+ { TYPE_RAM_DISCARD_MANAGER },
+ { }
+ },
+};
+
+static void virtio_register_types(void)
+{
+ type_register_static(&virtio_mem_info);
+}
+
+type_init(virtio_register_types)
diff --git a/hw/virtio/virtio-mmio.c b/hw/virtio/virtio-mmio.c
new file mode 100644
index 00000000..d240efef
--- /dev/null
+++ b/hw/virtio/virtio-mmio.c
@@ -0,0 +1,862 @@
+/*
+ * Virtio MMIO bindings
+ *
+ * Copyright (c) 2011 Linaro Limited
+ *
+ * Author:
+ * Peter Maydell <peter.maydell@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "standard-headers/linux/virtio_mmio.h"
+#include "hw/irq.h"
+#include "hw/qdev-properties.h"
+#include "hw/sysbus.h"
+#include "hw/virtio/virtio.h"
+#include "migration/qemu-file-types.h"
+#include "qemu/host-utils.h"
+#include "qemu/module.h"
+#include "sysemu/kvm.h"
+#include "sysemu/replay.h"
+#include "hw/virtio/virtio-mmio.h"
+#include "qemu/error-report.h"
+#include "qemu/log.h"
+#include "trace.h"
+
+static bool virtio_mmio_ioeventfd_enabled(DeviceState *d)
+{
+ VirtIOMMIOProxy *proxy = VIRTIO_MMIO(d);
+
+ return (proxy->flags & VIRTIO_IOMMIO_FLAG_USE_IOEVENTFD) != 0;
+}
+
+static int virtio_mmio_ioeventfd_assign(DeviceState *d,
+ EventNotifier *notifier,
+ int n, bool assign)
+{
+ VirtIOMMIOProxy *proxy = VIRTIO_MMIO(d);
+
+ if (assign) {
+ memory_region_add_eventfd(&proxy->iomem, VIRTIO_MMIO_QUEUE_NOTIFY, 4,
+ true, n, notifier);
+ } else {
+ memory_region_del_eventfd(&proxy->iomem, VIRTIO_MMIO_QUEUE_NOTIFY, 4,
+ true, n, notifier);
+ }
+ return 0;
+}
+
+static void virtio_mmio_start_ioeventfd(VirtIOMMIOProxy *proxy)
+{
+ virtio_bus_start_ioeventfd(&proxy->bus);
+}
+
+static void virtio_mmio_stop_ioeventfd(VirtIOMMIOProxy *proxy)
+{
+ virtio_bus_stop_ioeventfd(&proxy->bus);
+}
+
+static void virtio_mmio_soft_reset(VirtIOMMIOProxy *proxy)
+{
+ int i;
+
+ virtio_bus_reset(&proxy->bus);
+
+ if (!proxy->legacy) {
+ for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
+ proxy->vqs[i].enabled = 0;
+ }
+ }
+}
+
+static uint64_t virtio_mmio_read(void *opaque, hwaddr offset, unsigned size)
+{
+ VirtIOMMIOProxy *proxy = (VirtIOMMIOProxy *)opaque;
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+
+ trace_virtio_mmio_read(offset);
+
+ if (!vdev) {
+ /* If no backend is present, we treat most registers as
+ * read-as-zero, except for the magic number, version and
+ * vendor ID. This is not strictly sanctioned by the virtio
+ * spec, but it allows us to provide transports with no backend
+ * plugged in which don't confuse Linux's virtio code: the
+ * probe won't complain about the bad magic number, but the
+ * device ID of zero means no backend will claim it.
+ */
+ switch (offset) {
+ case VIRTIO_MMIO_MAGIC_VALUE:
+ return VIRT_MAGIC;
+ case VIRTIO_MMIO_VERSION:
+ if (proxy->legacy) {
+ return VIRT_VERSION_LEGACY;
+ } else {
+ return VIRT_VERSION;
+ }
+ case VIRTIO_MMIO_VENDOR_ID:
+ return VIRT_VENDOR;
+ default:
+ return 0;
+ }
+ }
+
+ if (offset >= VIRTIO_MMIO_CONFIG) {
+ offset -= VIRTIO_MMIO_CONFIG;
+ if (proxy->legacy) {
+ switch (size) {
+ case 1:
+ return virtio_config_readb(vdev, offset);
+ case 2:
+ return virtio_config_readw(vdev, offset);
+ case 4:
+ return virtio_config_readl(vdev, offset);
+ default:
+ abort();
+ }
+ } else {
+ switch (size) {
+ case 1:
+ return virtio_config_modern_readb(vdev, offset);
+ case 2:
+ return virtio_config_modern_readw(vdev, offset);
+ case 4:
+ return virtio_config_modern_readl(vdev, offset);
+ default:
+ abort();
+ }
+ }
+ }
+ if (size != 4) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: wrong size access to register!\n",
+ __func__);
+ return 0;
+ }
+ switch (offset) {
+ case VIRTIO_MMIO_MAGIC_VALUE:
+ return VIRT_MAGIC;
+ case VIRTIO_MMIO_VERSION:
+ if (proxy->legacy) {
+ return VIRT_VERSION_LEGACY;
+ } else {
+ return VIRT_VERSION;
+ }
+ case VIRTIO_MMIO_DEVICE_ID:
+ return vdev->device_id;
+ case VIRTIO_MMIO_VENDOR_ID:
+ return VIRT_VENDOR;
+ case VIRTIO_MMIO_DEVICE_FEATURES:
+ if (proxy->legacy) {
+ if (proxy->host_features_sel) {
+ return 0;
+ } else {
+ return vdev->host_features;
+ }
+ } else {
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
+ return (vdev->host_features & ~vdc->legacy_features)
+ >> (32 * proxy->host_features_sel);
+ }
+ case VIRTIO_MMIO_QUEUE_NUM_MAX:
+ if (!virtio_queue_get_num(vdev, vdev->queue_sel)) {
+ return 0;
+ }
+ return VIRTQUEUE_MAX_SIZE;
+ case VIRTIO_MMIO_QUEUE_PFN:
+ if (!proxy->legacy) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: read from legacy register (0x%"
+ HWADDR_PRIx ") in non-legacy mode\n",
+ __func__, offset);
+ return 0;
+ }
+ return virtio_queue_get_addr(vdev, vdev->queue_sel)
+ >> proxy->guest_page_shift;
+ case VIRTIO_MMIO_QUEUE_READY:
+ if (proxy->legacy) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: read from non-legacy register (0x%"
+ HWADDR_PRIx ") in legacy mode\n",
+ __func__, offset);
+ return 0;
+ }
+ return proxy->vqs[vdev->queue_sel].enabled;
+ case VIRTIO_MMIO_INTERRUPT_STATUS:
+ return qatomic_read(&vdev->isr);
+ case VIRTIO_MMIO_STATUS:
+ return vdev->status;
+ case VIRTIO_MMIO_CONFIG_GENERATION:
+ if (proxy->legacy) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: read from non-legacy register (0x%"
+ HWADDR_PRIx ") in legacy mode\n",
+ __func__, offset);
+ return 0;
+ }
+ return vdev->generation;
+ case VIRTIO_MMIO_SHM_LEN_LOW:
+ case VIRTIO_MMIO_SHM_LEN_HIGH:
+ /*
+ * VIRTIO_MMIO_SHM_SEL is unimplemented.
+ * According to the Linux driver, a region length of -1
+ * means the shared memory region doesn't exist.
+ */
+ return -1;
+ case VIRTIO_MMIO_DEVICE_FEATURES_SEL:
+ case VIRTIO_MMIO_DRIVER_FEATURES:
+ case VIRTIO_MMIO_DRIVER_FEATURES_SEL:
+ case VIRTIO_MMIO_GUEST_PAGE_SIZE:
+ case VIRTIO_MMIO_QUEUE_SEL:
+ case VIRTIO_MMIO_QUEUE_NUM:
+ case VIRTIO_MMIO_QUEUE_ALIGN:
+ case VIRTIO_MMIO_QUEUE_NOTIFY:
+ case VIRTIO_MMIO_INTERRUPT_ACK:
+ case VIRTIO_MMIO_QUEUE_DESC_LOW:
+ case VIRTIO_MMIO_QUEUE_DESC_HIGH:
+ case VIRTIO_MMIO_QUEUE_AVAIL_LOW:
+ case VIRTIO_MMIO_QUEUE_AVAIL_HIGH:
+ case VIRTIO_MMIO_QUEUE_USED_LOW:
+ case VIRTIO_MMIO_QUEUE_USED_HIGH:
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: read of write-only register (0x%" HWADDR_PRIx ")\n",
+ __func__, offset);
+ return 0;
+ default:
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: bad register offset (0x%" HWADDR_PRIx ")\n",
+ __func__, offset);
+ return 0;
+ }
+ return 0;
+}
+
+static void virtio_mmio_write(void *opaque, hwaddr offset, uint64_t value,
+ unsigned size)
+{
+ VirtIOMMIOProxy *proxy = (VirtIOMMIOProxy *)opaque;
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+
+ trace_virtio_mmio_write_offset(offset, value);
+
+ if (!vdev) {
+ /* If no backend is present, we just make all registers
+ * write-ignored. This allows us to provide transports with
+ * no backend plugged in.
+ */
+ return;
+ }
+
+ if (offset >= VIRTIO_MMIO_CONFIG) {
+ offset -= VIRTIO_MMIO_CONFIG;
+ if (proxy->legacy) {
+ switch (size) {
+ case 1:
+ virtio_config_writeb(vdev, offset, value);
+ break;
+ case 2:
+ virtio_config_writew(vdev, offset, value);
+ break;
+ case 4:
+ virtio_config_writel(vdev, offset, value);
+ break;
+ default:
+ abort();
+ }
+ return;
+ } else {
+ switch (size) {
+ case 1:
+ virtio_config_modern_writeb(vdev, offset, value);
+ break;
+ case 2:
+ virtio_config_modern_writew(vdev, offset, value);
+ break;
+ case 4:
+ virtio_config_modern_writel(vdev, offset, value);
+ break;
+ default:
+ abort();
+ }
+ return;
+ }
+ }
+ if (size != 4) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: wrong size access to register!\n",
+ __func__);
+ return;
+ }
+ switch (offset) {
+ case VIRTIO_MMIO_DEVICE_FEATURES_SEL:
+ if (value) {
+ proxy->host_features_sel = 1;
+ } else {
+ proxy->host_features_sel = 0;
+ }
+ break;
+ case VIRTIO_MMIO_DRIVER_FEATURES:
+ if (proxy->legacy) {
+ if (proxy->guest_features_sel) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: attempt to write guest features with "
+ "guest_features_sel > 0 in legacy mode\n",
+ __func__);
+ } else {
+ virtio_set_features(vdev, value);
+ }
+ } else {
+ proxy->guest_features[proxy->guest_features_sel] = value;
+ }
+ break;
+ case VIRTIO_MMIO_DRIVER_FEATURES_SEL:
+ if (value) {
+ proxy->guest_features_sel = 1;
+ } else {
+ proxy->guest_features_sel = 0;
+ }
+ break;
+ case VIRTIO_MMIO_GUEST_PAGE_SIZE:
+ if (!proxy->legacy) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: write to legacy register (0x%"
+ HWADDR_PRIx ") in non-legacy mode\n",
+ __func__, offset);
+ return;
+ }
+ proxy->guest_page_shift = ctz32(value);
+ if (proxy->guest_page_shift > 31) {
+ proxy->guest_page_shift = 0;
+ }
+ trace_virtio_mmio_guest_page(value, proxy->guest_page_shift);
+ break;
+ case VIRTIO_MMIO_QUEUE_SEL:
+ if (value < VIRTIO_QUEUE_MAX) {
+ vdev->queue_sel = value;
+ }
+ break;
+ case VIRTIO_MMIO_QUEUE_NUM:
+ trace_virtio_mmio_queue_write(value, VIRTQUEUE_MAX_SIZE);
+ virtio_queue_set_num(vdev, vdev->queue_sel, value);
+
+ if (proxy->legacy) {
+ virtio_queue_update_rings(vdev, vdev->queue_sel);
+ } else {
+ proxy->vqs[vdev->queue_sel].num = value;
+ }
+ break;
+ case VIRTIO_MMIO_QUEUE_ALIGN:
+ if (!proxy->legacy) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: write to legacy register (0x%"
+ HWADDR_PRIx ") in non-legacy mode\n",
+ __func__, offset);
+ return;
+ }
+ virtio_queue_set_align(vdev, vdev->queue_sel, value);
+ break;
+ case VIRTIO_MMIO_QUEUE_PFN:
+ if (!proxy->legacy) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: write to legacy register (0x%"
+ HWADDR_PRIx ") in non-legacy mode\n",
+ __func__, offset);
+ return;
+ }
+ if (value == 0) {
+ virtio_mmio_soft_reset(proxy);
+ } else {
+ virtio_queue_set_addr(vdev, vdev->queue_sel,
+ value << proxy->guest_page_shift);
+ }
+ break;
+ case VIRTIO_MMIO_QUEUE_READY:
+ if (proxy->legacy) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: write to non-legacy register (0x%"
+ HWADDR_PRIx ") in legacy mode\n",
+ __func__, offset);
+ return;
+ }
+ if (value) {
+ virtio_queue_set_num(vdev, vdev->queue_sel,
+ proxy->vqs[vdev->queue_sel].num);
+ virtio_queue_set_rings(vdev, vdev->queue_sel,
+ ((uint64_t)proxy->vqs[vdev->queue_sel].desc[1]) << 32 |
+ proxy->vqs[vdev->queue_sel].desc[0],
+ ((uint64_t)proxy->vqs[vdev->queue_sel].avail[1]) << 32 |
+ proxy->vqs[vdev->queue_sel].avail[0],
+ ((uint64_t)proxy->vqs[vdev->queue_sel].used[1]) << 32 |
+ proxy->vqs[vdev->queue_sel].used[0]);
+ proxy->vqs[vdev->queue_sel].enabled = 1;
+ } else {
+ proxy->vqs[vdev->queue_sel].enabled = 0;
+ }
+ break;
+ case VIRTIO_MMIO_QUEUE_NOTIFY:
+ if (value < VIRTIO_QUEUE_MAX) {
+ virtio_queue_notify(vdev, value);
+ }
+ break;
+ case VIRTIO_MMIO_INTERRUPT_ACK:
+ qatomic_and(&vdev->isr, ~value);
+ virtio_update_irq(vdev);
+ break;
+ case VIRTIO_MMIO_STATUS:
+ if (!(value & VIRTIO_CONFIG_S_DRIVER_OK)) {
+ virtio_mmio_stop_ioeventfd(proxy);
+ }
+
+ if (!proxy->legacy && (value & VIRTIO_CONFIG_S_FEATURES_OK)) {
+ virtio_set_features(vdev,
+ ((uint64_t)proxy->guest_features[1]) << 32 |
+ proxy->guest_features[0]);
+ }
+
+ virtio_set_status(vdev, value & 0xff);
+
+ if (value & VIRTIO_CONFIG_S_DRIVER_OK) {
+ virtio_mmio_start_ioeventfd(proxy);
+ }
+
+ if (vdev->status == 0) {
+ virtio_mmio_soft_reset(proxy);
+ }
+ break;
+ case VIRTIO_MMIO_QUEUE_DESC_LOW:
+ if (proxy->legacy) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: write to non-legacy register (0x%"
+ HWADDR_PRIx ") in legacy mode\n",
+ __func__, offset);
+ return;
+ }
+ proxy->vqs[vdev->queue_sel].desc[0] = value;
+ break;
+ case VIRTIO_MMIO_QUEUE_DESC_HIGH:
+ if (proxy->legacy) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: write to non-legacy register (0x%"
+ HWADDR_PRIx ") in legacy mode\n",
+ __func__, offset);
+ return;
+ }
+ proxy->vqs[vdev->queue_sel].desc[1] = value;
+ break;
+ case VIRTIO_MMIO_QUEUE_AVAIL_LOW:
+ if (proxy->legacy) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: write to non-legacy register (0x%"
+ HWADDR_PRIx ") in legacy mode\n",
+ __func__, offset);
+ return;
+ }
+ proxy->vqs[vdev->queue_sel].avail[0] = value;
+ break;
+ case VIRTIO_MMIO_QUEUE_AVAIL_HIGH:
+ if (proxy->legacy) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: write to non-legacy register (0x%"
+ HWADDR_PRIx ") in legacy mode\n",
+ __func__, offset);
+ return;
+ }
+ proxy->vqs[vdev->queue_sel].avail[1] = value;
+ break;
+ case VIRTIO_MMIO_QUEUE_USED_LOW:
+ if (proxy->legacy) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: write to non-legacy register (0x%"
+ HWADDR_PRIx ") in legacy mode\n",
+ __func__, offset);
+ return;
+ }
+ proxy->vqs[vdev->queue_sel].used[0] = value;
+ break;
+ case VIRTIO_MMIO_QUEUE_USED_HIGH:
+ if (proxy->legacy) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: write to non-legacy register (0x%"
+ HWADDR_PRIx ") in legacy mode\n",
+ __func__, offset);
+ return;
+ }
+ proxy->vqs[vdev->queue_sel].used[1] = value;
+ break;
+ case VIRTIO_MMIO_MAGIC_VALUE:
+ case VIRTIO_MMIO_VERSION:
+ case VIRTIO_MMIO_DEVICE_ID:
+ case VIRTIO_MMIO_VENDOR_ID:
+ case VIRTIO_MMIO_DEVICE_FEATURES:
+ case VIRTIO_MMIO_QUEUE_NUM_MAX:
+ case VIRTIO_MMIO_INTERRUPT_STATUS:
+ case VIRTIO_MMIO_CONFIG_GENERATION:
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: write to read-only register (0x%" HWADDR_PRIx ")\n",
+ __func__, offset);
+ break;
+
+ default:
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: bad register offset (0x%" HWADDR_PRIx ")\n",
+ __func__, offset);
+ }
+}
+
+static const MemoryRegionOps virtio_legacy_mem_ops = {
+ .read = virtio_mmio_read,
+ .write = virtio_mmio_write,
+ .endianness = DEVICE_NATIVE_ENDIAN,
+};
+
+static const MemoryRegionOps virtio_mem_ops = {
+ .read = virtio_mmio_read,
+ .write = virtio_mmio_write,
+ .endianness = DEVICE_LITTLE_ENDIAN,
+};
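+
+/*
+ * Legacy (version 1) virtio-mmio uses the guest's native endianness for
+ * its register window, whereas non-legacy (version 2) devices are always
+ * little-endian; hence the two MemoryRegionOps variants above.
+ */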
+
+static void virtio_mmio_update_irq(DeviceState *opaque, uint16_t vector)
+{
+ VirtIOMMIOProxy *proxy = VIRTIO_MMIO(opaque);
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+ int level;
+
+ if (!vdev) {
+ return;
+ }
+ level = (qatomic_read(&vdev->isr) != 0);
+ trace_virtio_mmio_setting_irq(level);
+ qemu_set_irq(proxy->irq, level);
+}
+
+static int virtio_mmio_load_config(DeviceState *opaque, QEMUFile *f)
+{
+ VirtIOMMIOProxy *proxy = VIRTIO_MMIO(opaque);
+
+ proxy->host_features_sel = qemu_get_be32(f);
+ proxy->guest_features_sel = qemu_get_be32(f);
+ proxy->guest_page_shift = qemu_get_be32(f);
+ return 0;
+}
+
+static void virtio_mmio_save_config(DeviceState *opaque, QEMUFile *f)
+{
+ VirtIOMMIOProxy *proxy = VIRTIO_MMIO(opaque);
+
+ qemu_put_be32(f, proxy->host_features_sel);
+ qemu_put_be32(f, proxy->guest_features_sel);
+ qemu_put_be32(f, proxy->guest_page_shift);
+}
+
+static const VMStateDescription vmstate_virtio_mmio_queue_state = {
+ .name = "virtio_mmio/queue_state",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT16(num, VirtIOMMIOQueue),
+ VMSTATE_BOOL(enabled, VirtIOMMIOQueue),
+ VMSTATE_UINT32_ARRAY(desc, VirtIOMMIOQueue, 2),
+ VMSTATE_UINT32_ARRAY(avail, VirtIOMMIOQueue, 2),
+ VMSTATE_UINT32_ARRAY(used, VirtIOMMIOQueue, 2),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription vmstate_virtio_mmio_state_sub = {
+ .name = "virtio_mmio/state",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT32_ARRAY(guest_features, VirtIOMMIOProxy, 2),
+ VMSTATE_STRUCT_ARRAY(vqs, VirtIOMMIOProxy, VIRTIO_QUEUE_MAX, 0,
+ vmstate_virtio_mmio_queue_state,
+ VirtIOMMIOQueue),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription vmstate_virtio_mmio = {
+ .name = "virtio_mmio",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_END_OF_LIST()
+ },
+ .subsections = (const VMStateDescription * []) {
+ &vmstate_virtio_mmio_state_sub,
+ NULL
+ }
+};
+
+static void virtio_mmio_save_extra_state(DeviceState *opaque, QEMUFile *f)
+{
+ VirtIOMMIOProxy *proxy = VIRTIO_MMIO(opaque);
+
+ vmstate_save_state(f, &vmstate_virtio_mmio, proxy, NULL);
+}
+
+static int virtio_mmio_load_extra_state(DeviceState *opaque, QEMUFile *f)
+{
+ VirtIOMMIOProxy *proxy = VIRTIO_MMIO(opaque);
+
+ return vmstate_load_state(f, &vmstate_virtio_mmio, proxy, 1);
+}
+
+static bool virtio_mmio_has_extra_state(DeviceState *opaque)
+{
+ VirtIOMMIOProxy *proxy = VIRTIO_MMIO(opaque);
+
+ return !proxy->legacy;
+}
+
+static void virtio_mmio_reset(DeviceState *d)
+{
+ VirtIOMMIOProxy *proxy = VIRTIO_MMIO(d);
+ int i;
+
+ virtio_mmio_soft_reset(proxy);
+
+ proxy->host_features_sel = 0;
+ proxy->guest_features_sel = 0;
+ proxy->guest_page_shift = 0;
+
+ if (!proxy->legacy) {
+ proxy->guest_features[0] = proxy->guest_features[1] = 0;
+
+ for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
+ proxy->vqs[i].num = 0;
+ proxy->vqs[i].desc[0] = proxy->vqs[i].desc[1] = 0;
+ proxy->vqs[i].avail[0] = proxy->vqs[i].avail[1] = 0;
+ proxy->vqs[i].used[0] = proxy->vqs[i].used[1] = 0;
+ }
+ }
+}
+
+static int virtio_mmio_set_guest_notifier(DeviceState *d, int n, bool assign,
+ bool with_irqfd)
+{
+ VirtIOMMIOProxy *proxy = VIRTIO_MMIO(d);
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
+ VirtQueue *vq = virtio_get_queue(vdev, n);
+ EventNotifier *notifier = virtio_queue_get_guest_notifier(vq);
+
+ if (assign) {
+ int r = event_notifier_init(notifier, 0);
+ if (r < 0) {
+ return r;
+ }
+ virtio_queue_set_guest_notifier_fd_handler(vq, true, with_irqfd);
+ } else {
+ virtio_queue_set_guest_notifier_fd_handler(vq, false, with_irqfd);
+ event_notifier_cleanup(notifier);
+ }
+
+ if (vdc->guest_notifier_mask && vdev->use_guest_notifier_mask) {
+ vdc->guest_notifier_mask(vdev, n, !assign);
+ }
+
+ return 0;
+}
+
+static int virtio_mmio_set_guest_notifiers(DeviceState *d, int nvqs,
+ bool assign)
+{
+ VirtIOMMIOProxy *proxy = VIRTIO_MMIO(d);
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+ /* TODO: need to check if kvm-arm supports irqfd */
+ bool with_irqfd = false;
+ int r, n;
+
+ nvqs = MIN(nvqs, VIRTIO_QUEUE_MAX);
+
+ for (n = 0; n < nvqs; n++) {
+ if (!virtio_queue_get_num(vdev, n)) {
+ break;
+ }
+
+ r = virtio_mmio_set_guest_notifier(d, n, assign, with_irqfd);
+ if (r < 0) {
+ goto assign_error;
+ }
+ }
+
+ return 0;
+
+assign_error:
+ /* We get here on assignment failure. Recover by undoing for VQs 0 .. n. */
+ assert(assign);
+ while (--n >= 0) {
+ virtio_mmio_set_guest_notifier(d, n, !assign, false);
+ }
+ return r;
+}
+
+static void virtio_mmio_pre_plugged(DeviceState *d, Error **errp)
+{
+ VirtIOMMIOProxy *proxy = VIRTIO_MMIO(d);
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+
+ if (!proxy->legacy) {
+ virtio_add_feature(&vdev->host_features, VIRTIO_F_VERSION_1);
+ }
+}
+
+/* virtio-mmio device */
+
+static Property virtio_mmio_properties[] = {
+ DEFINE_PROP_BOOL("format_transport_address", VirtIOMMIOProxy,
+ format_transport_address, true),
+ DEFINE_PROP_BOOL("force-legacy", VirtIOMMIOProxy, legacy, true),
+ DEFINE_PROP_BIT("ioeventfd", VirtIOMMIOProxy, flags,
+ VIRTIO_IOMMIO_FLAG_USE_IOEVENTFD_BIT, true),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void virtio_mmio_realizefn(DeviceState *d, Error **errp)
+{
+ VirtIOMMIOProxy *proxy = VIRTIO_MMIO(d);
+ SysBusDevice *sbd = SYS_BUS_DEVICE(d);
+
+ qbus_init(&proxy->bus, sizeof(proxy->bus), TYPE_VIRTIO_MMIO_BUS, d, NULL);
+ sysbus_init_irq(sbd, &proxy->irq);
+
+ if (!kvm_eventfds_enabled()) {
+ proxy->flags &= ~VIRTIO_IOMMIO_FLAG_USE_IOEVENTFD;
+ }
+
+ /* fd-based ioevents can't be synchronized in record/replay */
+ if (replay_mode != REPLAY_MODE_NONE) {
+ proxy->flags &= ~VIRTIO_IOMMIO_FLAG_USE_IOEVENTFD;
+ }
+
+ if (proxy->legacy) {
+ memory_region_init_io(&proxy->iomem, OBJECT(d),
+ &virtio_legacy_mem_ops, proxy,
+ TYPE_VIRTIO_MMIO, 0x200);
+ } else {
+ memory_region_init_io(&proxy->iomem, OBJECT(d),
+ &virtio_mem_ops, proxy,
+ TYPE_VIRTIO_MMIO, 0x200);
+ }
+ sysbus_init_mmio(sbd, &proxy->iomem);
+}
+
+static void virtio_mmio_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+
+ dc->realize = virtio_mmio_realizefn;
+ dc->reset = virtio_mmio_reset;
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+ device_class_set_props(dc, virtio_mmio_properties);
+}
+
+static const TypeInfo virtio_mmio_info = {
+ .name = TYPE_VIRTIO_MMIO,
+ .parent = TYPE_SYS_BUS_DEVICE,
+ .instance_size = sizeof(VirtIOMMIOProxy),
+ .class_init = virtio_mmio_class_init,
+};
+
+/* virtio-mmio-bus. */
+
+static char *virtio_mmio_bus_get_dev_path(DeviceState *dev)
+{
+ BusState *virtio_mmio_bus;
+ VirtIOMMIOProxy *virtio_mmio_proxy;
+ char *proxy_path;
+ char *path;
+ MemoryRegionSection section;
+
+ virtio_mmio_bus = qdev_get_parent_bus(dev);
+ virtio_mmio_proxy = VIRTIO_MMIO(virtio_mmio_bus->parent);
+ proxy_path = qdev_get_dev_path(DEVICE(virtio_mmio_proxy));
+
+ /*
+ * If @format_transport_address is false, then we just perform the same as
+ * virtio_bus_get_dev_path(): we delegate the address formatting for the
+ * device on the virtio-mmio bus to the bus that the virtio-mmio proxy
+ * (i.e., the device that implements the virtio-mmio bus) resides on. In
+ * this case the base address of the virtio-mmio transport will be
+ * invisible.
+ */
+ if (!virtio_mmio_proxy->format_transport_address) {
+ return proxy_path;
+ }
+
+ /* Otherwise, we append the base address of the transport. */
+ section = memory_region_find(&virtio_mmio_proxy->iomem, 0, 0x200);
+ assert(section.mr);
+
+ if (proxy_path) {
+ path = g_strdup_printf("%s/virtio-mmio@" TARGET_FMT_plx, proxy_path,
+ section.offset_within_address_space);
+ } else {
+ path = g_strdup_printf("virtio-mmio@" TARGET_FMT_plx,
+ section.offset_within_address_space);
+ }
+ memory_region_unref(section.mr);
+
+ g_free(proxy_path);
+ return path;
+}
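+
+/*
+ * Example (illustrative): with format_transport_address enabled and the
+ * transport mapped at 0xa003e00, the resulting path component looks like
+ * "virtio-mmio@000000000a003e00", appended to the proxy's own path.
+ */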
+
+static void virtio_mmio_vmstate_change(DeviceState *d, bool running)
+{
+ VirtIOMMIOProxy *proxy = VIRTIO_MMIO(d);
+
+ if (running) {
+ virtio_mmio_start_ioeventfd(proxy);
+ } else {
+ virtio_mmio_stop_ioeventfd(proxy);
+ }
+}
+
+static void virtio_mmio_bus_class_init(ObjectClass *klass, void *data)
+{
+ BusClass *bus_class = BUS_CLASS(klass);
+ VirtioBusClass *k = VIRTIO_BUS_CLASS(klass);
+
+ k->notify = virtio_mmio_update_irq;
+ k->save_config = virtio_mmio_save_config;
+ k->load_config = virtio_mmio_load_config;
+ k->save_extra_state = virtio_mmio_save_extra_state;
+ k->load_extra_state = virtio_mmio_load_extra_state;
+ k->has_extra_state = virtio_mmio_has_extra_state;
+ k->set_guest_notifiers = virtio_mmio_set_guest_notifiers;
+ k->ioeventfd_enabled = virtio_mmio_ioeventfd_enabled;
+ k->ioeventfd_assign = virtio_mmio_ioeventfd_assign;
+ k->pre_plugged = virtio_mmio_pre_plugged;
+ k->vmstate_change = virtio_mmio_vmstate_change;
+ k->has_variable_vring_alignment = true;
+ bus_class->max_dev = 1;
+ bus_class->get_dev_path = virtio_mmio_bus_get_dev_path;
+}
+
+static const TypeInfo virtio_mmio_bus_info = {
+ .name = TYPE_VIRTIO_MMIO_BUS,
+ .parent = TYPE_VIRTIO_BUS,
+ .instance_size = sizeof(VirtioBusState),
+ .class_init = virtio_mmio_bus_class_init,
+};
+
+static void virtio_mmio_register_types(void)
+{
+ type_register_static(&virtio_mmio_bus_info);
+ type_register_static(&virtio_mmio_info);
+}
+
+type_init(virtio_mmio_register_types)
diff --git a/hw/virtio/virtio-net-pci.c b/hw/virtio/virtio-net-pci.c
new file mode 100644
index 00000000..e03543a7
--- /dev/null
+++ b/hw/virtio/virtio-net-pci.c
@@ -0,0 +1,108 @@
+/*
+ * Virtio net PCI Bindings
+ *
+ * Copyright IBM, Corp. 2007
+ * Copyright (c) 2009 CodeSourcery
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Paul Brook <paul@codesourcery.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/qdev-properties.h"
+#include "hw/virtio/virtio-net.h"
+#include "hw/virtio/virtio-pci.h"
+#include "qapi/error.h"
+#include "qemu/module.h"
+#include "qom/object.h"
+
+typedef struct VirtIONetPCI VirtIONetPCI;
+
+/*
+ * virtio-net-pci: This extends VirtioPCIProxy.
+ */
+#define TYPE_VIRTIO_NET_PCI "virtio-net-pci-base"
+DECLARE_INSTANCE_CHECKER(VirtIONetPCI, VIRTIO_NET_PCI,
+ TYPE_VIRTIO_NET_PCI)
+
+struct VirtIONetPCI {
+ VirtIOPCIProxy parent_obj;
+ VirtIONet vdev;
+};
+
+static Property virtio_net_properties[] = {
+ DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags,
+ VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true),
+ DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors,
+ DEV_NVECTORS_UNSPECIFIED),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void virtio_net_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+ DeviceState *qdev = DEVICE(vpci_dev);
+ VirtIONetPCI *dev = VIRTIO_NET_PCI(vpci_dev);
+ DeviceState *vdev = DEVICE(&dev->vdev);
+ VirtIONet *net = VIRTIO_NET(vdev);
+
+ if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) {
+ vpci_dev->nvectors = 2 * MAX(net->nic_conf.peers.queues, 1)
+ + 1 /* Config interrupt */
+ + 1 /* Control vq */;
+ }
+
+ virtio_net_set_netclient_name(&dev->vdev, qdev->id,
+ object_get_typename(OBJECT(qdev)));
+ qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
+}
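+
+/*
+ * Example (illustrative): a NIC with 4 queue pairs defaults to
+ * 2 * 4 + 1 + 1 = 10 MSI-X vectors: one per RX and one per TX queue,
+ * plus one for configuration changes and one for the control virtqueue.
+ */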
+
+static void virtio_net_pci_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
+ VirtioPCIClass *vpciklass = VIRTIO_PCI_CLASS(klass);
+
+ k->romfile = "efi-virtio.rom";
+ k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+ k->device_id = PCI_DEVICE_ID_VIRTIO_NET;
+ k->revision = VIRTIO_PCI_ABI_VERSION;
+ k->class_id = PCI_CLASS_NETWORK_ETHERNET;
+ set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
+ device_class_set_props(dc, virtio_net_properties);
+ vpciklass->realize = virtio_net_pci_realize;
+}
+
+static void virtio_net_pci_instance_init(Object *obj)
+{
+ VirtIONetPCI *dev = VIRTIO_NET_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VIRTIO_NET);
+ object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev),
+ "bootindex");
+}
+
+static const VirtioPCIDeviceTypeInfo virtio_net_pci_info = {
+ .base_name = TYPE_VIRTIO_NET_PCI,
+ .generic_name = "virtio-net-pci",
+ .transitional_name = "virtio-net-pci-transitional",
+ .non_transitional_name = "virtio-net-pci-non-transitional",
+ .instance_size = sizeof(VirtIONetPCI),
+ .instance_init = virtio_net_pci_instance_init,
+ .class_init = virtio_net_pci_class_init,
+};
+
+static void virtio_net_pci_register(void)
+{
+ virtio_pci_types_register(&virtio_net_pci_info);
+}
+
+type_init(virtio_net_pci_register)
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
new file mode 100644
index 00000000..a1c9dfa7
--- /dev/null
+++ b/hw/virtio/virtio-pci.c
@@ -0,0 +1,2298 @@
+/*
+ * Virtio PCI Bindings
+ *
+ * Copyright IBM, Corp. 2007
+ * Copyright (c) 2009 CodeSourcery
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Paul Brook <paul@codesourcery.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+
+#include "exec/memop.h"
+#include "standard-headers/linux/virtio_pci.h"
+#include "hw/boards.h"
+#include "hw/virtio/virtio.h"
+#include "migration/qemu-file-types.h"
+#include "hw/pci/pci.h"
+#include "hw/pci/pci_bus.h"
+#include "hw/qdev-properties.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/log.h"
+#include "qemu/module.h"
+#include "hw/pci/msi.h"
+#include "hw/pci/msix.h"
+#include "hw/loader.h"
+#include "sysemu/kvm.h"
+#include "hw/virtio/virtio-pci.h"
+#include "qemu/range.h"
+#include "hw/virtio/virtio-bus.h"
+#include "qapi/visitor.h"
+#include "sysemu/replay.h"
+#include "trace.h"
+
+#define VIRTIO_PCI_REGION_SIZE(dev) VIRTIO_PCI_CONFIG_OFF(msix_present(dev))
+
+#undef VIRTIO_PCI_CONFIG
+
+/* The remaining space is defined by each driver as the per-driver
+ * configuration space */
+#define VIRTIO_PCI_CONFIG_SIZE(dev) VIRTIO_PCI_CONFIG_OFF(msix_enabled(dev))
+
+static void virtio_pci_bus_new(VirtioBusState *bus, size_t bus_size,
+ VirtIOPCIProxy *dev);
+static void virtio_pci_reset(DeviceState *qdev);
+
+/* virtio device */
+/* DeviceState to VirtIOPCIProxy. For use off data-path. TODO: use QOM. */
+static inline VirtIOPCIProxy *to_virtio_pci_proxy(DeviceState *d)
+{
+ return container_of(d, VirtIOPCIProxy, pci_dev.qdev);
+}
+
+/*
+ * DeviceState to VirtIOPCIProxy. Note: this is used on the data path;
+ * be careful and test performance if you change it.
+ */
+static inline VirtIOPCIProxy *to_virtio_pci_proxy_fast(DeviceState *d)
+{
+ return container_of(d, VirtIOPCIProxy, pci_dev.qdev);
+}
+
+static void virtio_pci_notify(DeviceState *d, uint16_t vector)
+{
+ VirtIOPCIProxy *proxy = to_virtio_pci_proxy_fast(d);
+
+ if (msix_enabled(&proxy->pci_dev)) {
+ if (vector != VIRTIO_NO_VECTOR) {
+ msix_notify(&proxy->pci_dev, vector);
+ }
+ } else {
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+ pci_set_irq(&proxy->pci_dev, qatomic_read(&vdev->isr) & 1);
+ }
+}
+
+static void virtio_pci_save_config(DeviceState *d, QEMUFile *f)
+{
+ VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+
+ pci_device_save(&proxy->pci_dev, f);
+ msix_save(&proxy->pci_dev, f);
+    if (msix_present(&proxy->pci_dev)) {
+        qemu_put_be16(f, vdev->config_vector);
+    }
+}
+
+static const VMStateDescription vmstate_virtio_pci_modern_queue_state = {
+ .name = "virtio_pci/modern_queue_state",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT16(num, VirtIOPCIQueue),
+ VMSTATE_UNUSED(1), /* enabled was stored as be16 */
+ VMSTATE_BOOL(enabled, VirtIOPCIQueue),
+ VMSTATE_UINT32_ARRAY(desc, VirtIOPCIQueue, 2),
+ VMSTATE_UINT32_ARRAY(avail, VirtIOPCIQueue, 2),
+ VMSTATE_UINT32_ARRAY(used, VirtIOPCIQueue, 2),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static bool virtio_pci_modern_state_needed(void *opaque)
+{
+ VirtIOPCIProxy *proxy = opaque;
+
+ return virtio_pci_modern(proxy);
+}
+
+static const VMStateDescription vmstate_virtio_pci_modern_state_sub = {
+ .name = "virtio_pci/modern_state",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = &virtio_pci_modern_state_needed,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT32(dfselect, VirtIOPCIProxy),
+ VMSTATE_UINT32(gfselect, VirtIOPCIProxy),
+ VMSTATE_UINT32_ARRAY(guest_features, VirtIOPCIProxy, 2),
+ VMSTATE_STRUCT_ARRAY(vqs, VirtIOPCIProxy, VIRTIO_QUEUE_MAX, 0,
+ vmstate_virtio_pci_modern_queue_state,
+ VirtIOPCIQueue),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription vmstate_virtio_pci = {
+ .name = "virtio_pci",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_END_OF_LIST()
+ },
+ .subsections = (const VMStateDescription*[]) {
+ &vmstate_virtio_pci_modern_state_sub,
+ NULL
+ }
+};
+
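+/*
+ * The transport state above (modern queue addresses and feature select
+ * registers) is migrated as an optional "extra state" section, gated by
+ * the migrate-extra property so streams for old machine types keep
+ * their original format.
+ */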
+static bool virtio_pci_has_extra_state(DeviceState *d)
+{
+ VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
+
+ return proxy->flags & VIRTIO_PCI_FLAG_MIGRATE_EXTRA;
+}
+
+static void virtio_pci_save_extra_state(DeviceState *d, QEMUFile *f)
+{
+ VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
+
+ vmstate_save_state(f, &vmstate_virtio_pci, proxy, NULL);
+}
+
+static int virtio_pci_load_extra_state(DeviceState *d, QEMUFile *f)
+{
+ VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
+
+ return vmstate_load_state(f, &vmstate_virtio_pci, proxy, 1);
+}
+
+static void virtio_pci_save_queue(DeviceState *d, int n, QEMUFile *f)
+{
+ VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+
+    if (msix_present(&proxy->pci_dev)) {
+        qemu_put_be16(f, virtio_queue_vector(vdev, n));
+    }
+}
+
+static int virtio_pci_load_config(DeviceState *d, QEMUFile *f)
+{
+ VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+    uint16_t vector;
+    int ret;
+
+    ret = pci_device_load(&proxy->pci_dev, f);
+ if (ret) {
+ return ret;
+ }
+ msix_unuse_all_vectors(&proxy->pci_dev);
+ msix_load(&proxy->pci_dev, f);
+ if (msix_present(&proxy->pci_dev)) {
+ qemu_get_be16s(f, &vector);
+
+ if (vector != VIRTIO_NO_VECTOR && vector >= proxy->nvectors) {
+ return -EINVAL;
+ }
+ } else {
+ vector = VIRTIO_NO_VECTOR;
+ }
+ vdev->config_vector = vector;
+ if (vector != VIRTIO_NO_VECTOR) {
+ msix_vector_use(&proxy->pci_dev, vector);
+ }
+ return 0;
+}
+
+static int virtio_pci_load_queue(DeviceState *d, int n, QEMUFile *f)
+{
+ VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+    uint16_t vector;
+
+    if (msix_present(&proxy->pci_dev)) {
+ qemu_get_be16s(f, &vector);
+ if (vector != VIRTIO_NO_VECTOR && vector >= proxy->nvectors) {
+ return -EINVAL;
+ }
+ } else {
+ vector = VIRTIO_NO_VECTOR;
+ }
+ virtio_queue_set_vector(vdev, n, vector);
+ if (vector != VIRTIO_NO_VECTOR) {
+ msix_vector_use(&proxy->pci_dev, vector);
+ }
+
+ return 0;
+}
+
+static bool virtio_pci_ioeventfd_enabled(DeviceState *d)
+{
+ VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
+
+ return (proxy->flags & VIRTIO_PCI_FLAG_USE_IOEVENTFD) != 0;
+}
+
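+/*
+ * Spacing of per-queue notify addresses inside the modern notify
+ * region: with page-per-vq each queue gets its own 4K page (so pages
+ * can be mapped or protected individually), otherwise queues are
+ * packed four bytes apart.
+ */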
+#define QEMU_VIRTIO_PCI_QUEUE_MEM_MULT 0x1000
+
+static inline int virtio_pci_queue_mem_mult(struct VirtIOPCIProxy *proxy)
+{
+ return (proxy->flags & VIRTIO_PCI_FLAG_PAGE_PER_VQ) ?
+ QEMU_VIRTIO_PCI_QUEUE_MEM_MULT : 4;
+}
+
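+/*
+ * Attach or detach a queue's host notifier at every location the guest
+ * may use to kick it: the per-queue modern MMIO notify address (with
+ * any access size when the host supports any-length ioeventfds), the
+ * optional modern PIO notify register, and the legacy
+ * VIRTIO_PCI_QUEUE_NOTIFY port (the latter two matching on the queue
+ * index written).
+ */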
+static int virtio_pci_ioeventfd_assign(DeviceState *d, EventNotifier *notifier,
+ int n, bool assign)
+{
+ VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+ VirtQueue *vq = virtio_get_queue(vdev, n);
+ bool legacy = virtio_pci_legacy(proxy);
+ bool modern = virtio_pci_modern(proxy);
+ bool fast_mmio = kvm_ioeventfd_any_length_enabled();
+ bool modern_pio = proxy->flags & VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY;
+ MemoryRegion *modern_mr = &proxy->notify.mr;
+ MemoryRegion *modern_notify_mr = &proxy->notify_pio.mr;
+ MemoryRegion *legacy_mr = &proxy->bar;
+ hwaddr modern_addr = virtio_pci_queue_mem_mult(proxy) *
+ virtio_get_queue_index(vq);
+ hwaddr legacy_addr = VIRTIO_PCI_QUEUE_NOTIFY;
+
+ if (assign) {
+ if (modern) {
+ if (fast_mmio) {
+ memory_region_add_eventfd(modern_mr, modern_addr, 0,
+ false, n, notifier);
+ } else {
+ memory_region_add_eventfd(modern_mr, modern_addr, 2,
+ false, n, notifier);
+ }
+ if (modern_pio) {
+ memory_region_add_eventfd(modern_notify_mr, 0, 2,
+ true, n, notifier);
+ }
+ }
+ if (legacy) {
+ memory_region_add_eventfd(legacy_mr, legacy_addr, 2,
+ true, n, notifier);
+ }
+ } else {
+ if (modern) {
+ if (fast_mmio) {
+ memory_region_del_eventfd(modern_mr, modern_addr, 0,
+ false, n, notifier);
+ } else {
+ memory_region_del_eventfd(modern_mr, modern_addr, 2,
+ false, n, notifier);
+ }
+ if (modern_pio) {
+ memory_region_del_eventfd(modern_notify_mr, 0, 2,
+ true, n, notifier);
+ }
+ }
+ if (legacy) {
+ memory_region_del_eventfd(legacy_mr, legacy_addr, 2,
+ true, n, notifier);
+ }
+ }
+ return 0;
+}
+
+static void virtio_pci_start_ioeventfd(VirtIOPCIProxy *proxy)
+{
+ virtio_bus_start_ioeventfd(&proxy->bus);
+}
+
+static void virtio_pci_stop_ioeventfd(VirtIOPCIProxy *proxy)
+{
+ virtio_bus_stop_ioeventfd(&proxy->bus);
+}
+
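+/*
+ * Legacy (virtio 0.9.x) I/O port register block. These registers are
+ * little-endian; the device-specific config that follows them is
+ * target-endian (see virtio_pci_config_read/write below).
+ */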
+static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
+{
+ VirtIOPCIProxy *proxy = opaque;
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+ uint16_t vector;
+ hwaddr pa;
+
+ switch (addr) {
+ case VIRTIO_PCI_GUEST_FEATURES:
+ /* Guest does not negotiate properly? We have to assume nothing. */
+ if (val & (1 << VIRTIO_F_BAD_FEATURE)) {
+ val = virtio_bus_get_vdev_bad_features(&proxy->bus);
+ }
+ virtio_set_features(vdev, val);
+ break;
+ case VIRTIO_PCI_QUEUE_PFN:
+ pa = (hwaddr)val << VIRTIO_PCI_QUEUE_ADDR_SHIFT;
+        if (pa == 0) {
+            virtio_pci_reset(DEVICE(proxy));
+        } else {
+            virtio_queue_set_addr(vdev, vdev->queue_sel, pa);
+        }
+ break;
+ case VIRTIO_PCI_QUEUE_SEL:
+        if (val < VIRTIO_QUEUE_MAX) {
+            vdev->queue_sel = val;
+        }
+ break;
+ case VIRTIO_PCI_QUEUE_NOTIFY:
+ if (val < VIRTIO_QUEUE_MAX) {
+ virtio_queue_notify(vdev, val);
+ }
+ break;
+ case VIRTIO_PCI_STATUS:
+ if (!(val & VIRTIO_CONFIG_S_DRIVER_OK)) {
+ virtio_pci_stop_ioeventfd(proxy);
+ }
+
+ virtio_set_status(vdev, val & 0xFF);
+
+ if (val & VIRTIO_CONFIG_S_DRIVER_OK) {
+ virtio_pci_start_ioeventfd(proxy);
+ }
+
+ if (vdev->status == 0) {
+ virtio_pci_reset(DEVICE(proxy));
+ }
+
+        /*
+         * Linux before 2.6.34 drives the device without enabling the
+         * PCI bus master bit. Enable it automatically for the guest.
+         * This is a PCI spec violation, but so is initiating DMA with
+         * the bus master bit clear.
+         */
+ if (val == (VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER)) {
+ pci_default_write_config(&proxy->pci_dev, PCI_COMMAND,
+ proxy->pci_dev.config[PCI_COMMAND] |
+ PCI_COMMAND_MASTER, 1);
+ }
+ break;
+ case VIRTIO_MSI_CONFIG_VECTOR:
+ if (vdev->config_vector != VIRTIO_NO_VECTOR) {
+ msix_vector_unuse(&proxy->pci_dev, vdev->config_vector);
+ }
+        /* Let the guest discover that an error took place. */
+ if (val < proxy->nvectors) {
+ msix_vector_use(&proxy->pci_dev, val);
+ } else {
+ val = VIRTIO_NO_VECTOR;
+ }
+ vdev->config_vector = val;
+ break;
+ case VIRTIO_MSI_QUEUE_VECTOR:
+ vector = virtio_queue_vector(vdev, vdev->queue_sel);
+ if (vector != VIRTIO_NO_VECTOR) {
+ msix_vector_unuse(&proxy->pci_dev, vector);
+ }
+        /* Let the guest discover that an error took place. */
+ if (val < proxy->nvectors) {
+ msix_vector_use(&proxy->pci_dev, val);
+ } else {
+ val = VIRTIO_NO_VECTOR;
+ }
+ virtio_queue_set_vector(vdev, vdev->queue_sel, val);
+ break;
+ default:
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: unexpected address 0x%x value 0x%x\n",
+ __func__, addr, val);
+ break;
+ }
+}
+
+static uint32_t virtio_ioport_read(VirtIOPCIProxy *proxy, uint32_t addr)
+{
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+ uint32_t ret = 0xFFFFFFFF;
+
+ switch (addr) {
+ case VIRTIO_PCI_HOST_FEATURES:
+ ret = vdev->host_features;
+ break;
+ case VIRTIO_PCI_GUEST_FEATURES:
+ ret = vdev->guest_features;
+ break;
+ case VIRTIO_PCI_QUEUE_PFN:
+ ret = virtio_queue_get_addr(vdev, vdev->queue_sel)
+ >> VIRTIO_PCI_QUEUE_ADDR_SHIFT;
+ break;
+ case VIRTIO_PCI_QUEUE_NUM:
+ ret = virtio_queue_get_num(vdev, vdev->queue_sel);
+ break;
+ case VIRTIO_PCI_QUEUE_SEL:
+ ret = vdev->queue_sel;
+ break;
+ case VIRTIO_PCI_STATUS:
+ ret = vdev->status;
+ break;
+ case VIRTIO_PCI_ISR:
+ /* reading from the ISR also clears it. */
+ ret = qatomic_xchg(&vdev->isr, 0);
+ pci_irq_deassert(&proxy->pci_dev);
+ break;
+ case VIRTIO_MSI_CONFIG_VECTOR:
+ ret = vdev->config_vector;
+ break;
+ case VIRTIO_MSI_QUEUE_VECTOR:
+ ret = virtio_queue_vector(vdev, vdev->queue_sel);
+ break;
+ default:
+ break;
+ }
+
+ return ret;
+}
+
+static uint64_t virtio_pci_config_read(void *opaque, hwaddr addr,
+ unsigned size)
+{
+ VirtIOPCIProxy *proxy = opaque;
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+ uint32_t config = VIRTIO_PCI_CONFIG_SIZE(&proxy->pci_dev);
+ uint64_t val = 0;
+
+ if (vdev == NULL) {
+ return UINT64_MAX;
+ }
+
+ if (addr < config) {
+ return virtio_ioport_read(proxy, addr);
+ }
+ addr -= config;
+
+ switch (size) {
+ case 1:
+ val = virtio_config_readb(vdev, addr);
+ break;
+ case 2:
+ val = virtio_config_readw(vdev, addr);
+ if (virtio_is_big_endian(vdev)) {
+ val = bswap16(val);
+ }
+ break;
+ case 4:
+ val = virtio_config_readl(vdev, addr);
+ if (virtio_is_big_endian(vdev)) {
+ val = bswap32(val);
+ }
+ break;
+ }
+ return val;
+}
+
+static void virtio_pci_config_write(void *opaque, hwaddr addr,
+ uint64_t val, unsigned size)
+{
+ VirtIOPCIProxy *proxy = opaque;
+ uint32_t config = VIRTIO_PCI_CONFIG_SIZE(&proxy->pci_dev);
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+
+ if (vdev == NULL) {
+ return;
+ }
+
+ if (addr < config) {
+ virtio_ioport_write(proxy, addr, val);
+ return;
+ }
+ addr -= config;
+ /*
+ * Virtio-PCI is odd. Ioports are LE but config space is target native
+ * endian.
+ */
+ switch (size) {
+ case 1:
+ virtio_config_writeb(vdev, addr, val);
+ break;
+ case 2:
+ if (virtio_is_big_endian(vdev)) {
+ val = bswap16(val);
+ }
+ virtio_config_writew(vdev, addr, val);
+ break;
+ case 4:
+ if (virtio_is_big_endian(vdev)) {
+ val = bswap32(val);
+ }
+ virtio_config_writel(vdev, addr, val);
+ break;
+ }
+}
+
+static const MemoryRegionOps virtio_pci_config_ops = {
+ .read = virtio_pci_config_read,
+ .write = virtio_pci_config_write,
+ .impl = {
+ .min_access_size = 1,
+ .max_access_size = 4,
+ },
+ .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static MemoryRegion *virtio_address_space_lookup(VirtIOPCIProxy *proxy,
+ hwaddr *off, int len)
+{
+ int i;
+ VirtIOPCIRegion *reg;
+
+ for (i = 0; i < ARRAY_SIZE(proxy->regs); ++i) {
+ reg = &proxy->regs[i];
+ if (*off >= reg->offset &&
+ *off + len <= reg->offset + reg->size) {
+ *off -= reg->offset;
+ return &reg->mr;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Below are generic functions to do memcpy from/to an address space,
+ * without byteswaps, with input validation.
+ *
+ * As the regular address_space_* APIs all do some kind of byteswap at
+ * least for some host/target combinations, we are forced to explicitly
+ * convert to a known-endianness integer value. It doesn't really matter
+ * which endianness we go through, so the code below selects the one
+ * that causes the least amount of work on the given host.
+ *
+ * Note: the host pointer must be aligned.
+ */
+static void virtio_address_space_write(VirtIOPCIProxy *proxy, hwaddr addr,
+                                       const uint8_t *buf, int len)
+{
+ uint64_t val;
+ MemoryRegion *mr;
+
+ /* address_space_* APIs assume an aligned address.
+ * As address is under guest control, handle illegal values.
+ */
+ addr &= ~(len - 1);
+
+ mr = virtio_address_space_lookup(proxy, &addr, len);
+ if (!mr) {
+ return;
+ }
+
+ /* Make sure caller aligned buf properly */
+ assert(!(((uintptr_t)buf) & (len - 1)));
+
+ switch (len) {
+ case 1:
+ val = pci_get_byte(buf);
+ break;
+ case 2:
+ val = pci_get_word(buf);
+ break;
+ case 4:
+ val = pci_get_long(buf);
+ break;
+ default:
+ /* As length is under guest control, handle illegal values. */
+ return;
+ }
+ memory_region_dispatch_write(mr, addr, val, size_memop(len) | MO_LE,
+ MEMTXATTRS_UNSPECIFIED);
+}
+
+static void virtio_address_space_read(VirtIOPCIProxy *proxy, hwaddr addr,
+                                      uint8_t *buf, int len)
+{
+ uint64_t val;
+ MemoryRegion *mr;
+
+ /* address_space_* APIs assume an aligned address.
+ * As address is under guest control, handle illegal values.
+ */
+ addr &= ~(len - 1);
+
+ mr = virtio_address_space_lookup(proxy, &addr, len);
+ if (!mr) {
+ return;
+ }
+
+ /* Make sure caller aligned buf properly */
+ assert(!(((uintptr_t)buf) & (len - 1)));
+
+ memory_region_dispatch_read(mr, addr, &val, size_memop(len) | MO_LE,
+ MEMTXATTRS_UNSPECIFIED);
+ switch (len) {
+ case 1:
+ pci_set_byte(buf, val);
+ break;
+ case 2:
+ pci_set_word(buf, val);
+ break;
+ case 4:
+ pci_set_long(buf, val);
+ break;
+ default:
+ /* As length is under guest control, handle illegal values. */
+ break;
+ }
+}
+
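+/*
+ * Config space write handler. Besides ordinary config cycles this
+ * implements the VIRTIO_PCI_CAP_PCI_CFG window: the guest programs
+ * cap.offset/cap.length, and accesses to pci_cfg_data are forwarded to
+ * the modern region containing that offset. Illustrative guest-side
+ * sequence:
+ *
+ *   cap.offset = <region offset>; cap.length = 4;
+ *   config write of pci_cfg_data  ->  32-bit write into that region
+ *
+ * It also tracks PCI_COMMAND bus-master changes so DMA is stopped when
+ * the guest clears the bit.
+ */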
+static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
+ uint32_t val, int len)
+{
+ VirtIOPCIProxy *proxy = VIRTIO_PCI(pci_dev);
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+ struct virtio_pci_cfg_cap *cfg;
+
+ pci_default_write_config(pci_dev, address, val, len);
+
+ if (proxy->flags & VIRTIO_PCI_FLAG_INIT_FLR) {
+ pcie_cap_flr_write_config(pci_dev, address, val, len);
+ }
+
+ if (range_covers_byte(address, len, PCI_COMMAND)) {
+ if (!(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
+ virtio_set_disabled(vdev, true);
+ virtio_pci_stop_ioeventfd(proxy);
+ virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
+ } else {
+ virtio_set_disabled(vdev, false);
+ }
+ }
+
+ if (proxy->config_cap &&
+ ranges_overlap(address, len, proxy->config_cap + offsetof(struct virtio_pci_cfg_cap,
+ pci_cfg_data),
+ sizeof cfg->pci_cfg_data)) {
+        uint32_t off;
+        uint32_t cfg_len;
+
+        cfg = (void *)(proxy->pci_dev.config + proxy->config_cap);
+        off = le32_to_cpu(cfg->cap.offset);
+        cfg_len = le32_to_cpu(cfg->cap.length);
+
+        if (cfg_len == 1 || cfg_len == 2 || cfg_len == 4) {
+            assert(cfg_len <= sizeof cfg->pci_cfg_data);
+            virtio_address_space_write(proxy, off, cfg->pci_cfg_data, cfg_len);
+        }
+ }
+}
+
+static uint32_t virtio_read_config(PCIDevice *pci_dev,
+ uint32_t address, int len)
+{
+ VirtIOPCIProxy *proxy = VIRTIO_PCI(pci_dev);
+ struct virtio_pci_cfg_cap *cfg;
+
+ if (proxy->config_cap &&
+ ranges_overlap(address, len, proxy->config_cap + offsetof(struct virtio_pci_cfg_cap,
+ pci_cfg_data),
+ sizeof cfg->pci_cfg_data)) {
+        uint32_t off;
+        uint32_t cfg_len;
+
+        cfg = (void *)(proxy->pci_dev.config + proxy->config_cap);
+        off = le32_to_cpu(cfg->cap.offset);
+        cfg_len = le32_to_cpu(cfg->cap.length);
+
+        if (cfg_len == 1 || cfg_len == 2 || cfg_len == 4) {
+            assert(cfg_len <= sizeof cfg->pci_cfg_data);
+            virtio_address_space_read(proxy, off, cfg->pci_cfg_data, cfg_len);
+        }
+ }
+
+ return pci_default_read_config(pci_dev, address, len);
+}
+
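+/*
+ * KVM MSI routing helpers. A single MSI route is shared by all queues
+ * using the same vector: 'users' counts the queues, so the route is
+ * added to the irqchip on first use and released when the last queue
+ * drops it.
+ */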
+static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
+ unsigned int queue_no,
+ unsigned int vector)
+{
+ VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector];
+ int ret;
+
+ if (irqfd->users == 0) {
+ KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state);
+ ret = kvm_irqchip_add_msi_route(&c, vector, &proxy->pci_dev);
+ if (ret < 0) {
+ return ret;
+ }
+ kvm_irqchip_commit_route_changes(&c);
+ irqfd->virq = ret;
+ }
+ irqfd->users++;
+ return 0;
+}
+
+static void kvm_virtio_pci_vq_vector_release(VirtIOPCIProxy *proxy,
+ unsigned int vector)
+{
+ VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector];
+ if (--irqfd->users == 0) {
+ kvm_irqchip_release_virq(kvm_state, irqfd->virq);
+ }
+}
+
+static int kvm_virtio_pci_irqfd_use(VirtIOPCIProxy *proxy,
+ unsigned int queue_no,
+ unsigned int vector)
+{
+ VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector];
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+ VirtQueue *vq = virtio_get_queue(vdev, queue_no);
+ EventNotifier *n = virtio_queue_get_guest_notifier(vq);
+ return kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL, irqfd->virq);
+}
+
+static void kvm_virtio_pci_irqfd_release(VirtIOPCIProxy *proxy,
+ unsigned int queue_no,
+ unsigned int vector)
+{
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+ VirtQueue *vq = virtio_get_queue(vdev, queue_no);
+ EventNotifier *n = virtio_queue_get_guest_notifier(vq);
+ VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector];
+ int ret;
+
+ ret = kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, n, irqfd->virq);
+ assert(ret == 0);
+}
+
+static int kvm_virtio_pci_vector_use(VirtIOPCIProxy *proxy, int nvqs)
+{
+ PCIDevice *dev = &proxy->pci_dev;
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+ VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+ unsigned int vector;
+ int ret, queue_no;
+
+ for (queue_no = 0; queue_no < nvqs; queue_no++) {
+ if (!virtio_queue_get_num(vdev, queue_no)) {
+ break;
+ }
+ vector = virtio_queue_vector(vdev, queue_no);
+ if (vector >= msix_nr_vectors_allocated(dev)) {
+ continue;
+ }
+ ret = kvm_virtio_pci_vq_vector_use(proxy, queue_no, vector);
+ if (ret < 0) {
+ goto undo;
+ }
+ /* If guest supports masking, set up irqfd now.
+ * Otherwise, delay until unmasked in the frontend.
+ */
+ if (vdev->use_guest_notifier_mask && k->guest_notifier_mask) {
+ ret = kvm_virtio_pci_irqfd_use(proxy, queue_no, vector);
+ if (ret < 0) {
+ kvm_virtio_pci_vq_vector_release(proxy, vector);
+ goto undo;
+ }
+ }
+ }
+ return 0;
+
+undo:
+ while (--queue_no >= 0) {
+ vector = virtio_queue_vector(vdev, queue_no);
+ if (vector >= msix_nr_vectors_allocated(dev)) {
+ continue;
+ }
+ if (vdev->use_guest_notifier_mask && k->guest_notifier_mask) {
+ kvm_virtio_pci_irqfd_release(proxy, queue_no, vector);
+ }
+ kvm_virtio_pci_vq_vector_release(proxy, vector);
+ }
+ return ret;
+}
+
+static void kvm_virtio_pci_vector_release(VirtIOPCIProxy *proxy, int nvqs)
+{
+ PCIDevice *dev = &proxy->pci_dev;
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+ unsigned int vector;
+ int queue_no;
+ VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+
+ for (queue_no = 0; queue_no < nvqs; queue_no++) {
+ if (!virtio_queue_get_num(vdev, queue_no)) {
+ break;
+ }
+ vector = virtio_queue_vector(vdev, queue_no);
+ if (vector >= msix_nr_vectors_allocated(dev)) {
+ continue;
+ }
+ /* If guest supports masking, clean up irqfd now.
+ * Otherwise, it was cleaned when masked in the frontend.
+ */
+ if (vdev->use_guest_notifier_mask && k->guest_notifier_mask) {
+ kvm_virtio_pci_irqfd_release(proxy, queue_no, vector);
+ }
+ kvm_virtio_pci_vq_vector_release(proxy, vector);
+ }
+}
+
+static int virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy,
+ unsigned int queue_no,
+ unsigned int vector,
+ MSIMessage msg)
+{
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+ VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+ VirtQueue *vq = virtio_get_queue(vdev, queue_no);
+ EventNotifier *n = virtio_queue_get_guest_notifier(vq);
+ VirtIOIRQFD *irqfd;
+ int ret = 0;
+
+ if (proxy->vector_irqfd) {
+ irqfd = &proxy->vector_irqfd[vector];
+ if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) {
+ ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg,
+ &proxy->pci_dev);
+ if (ret < 0) {
+ return ret;
+ }
+ kvm_irqchip_commit_routes(kvm_state);
+ }
+ }
+
+ /* If guest supports masking, irqfd is already setup, unmask it.
+ * Otherwise, set it up now.
+ */
+ if (vdev->use_guest_notifier_mask && k->guest_notifier_mask) {
+ k->guest_notifier_mask(vdev, queue_no, false);
+ /* Test after unmasking to avoid losing events. */
+ if (k->guest_notifier_pending &&
+ k->guest_notifier_pending(vdev, queue_no)) {
+ event_notifier_set(n);
+ }
+ } else {
+ ret = kvm_virtio_pci_irqfd_use(proxy, queue_no, vector);
+ }
+ return ret;
+}
+
+static void virtio_pci_vq_vector_mask(VirtIOPCIProxy *proxy,
+ unsigned int queue_no,
+ unsigned int vector)
+{
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+ VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+
+ /* If guest supports masking, keep irqfd but mask it.
+ * Otherwise, clean it up now.
+ */
+ if (vdev->use_guest_notifier_mask && k->guest_notifier_mask) {
+ k->guest_notifier_mask(vdev, queue_no, true);
+ } else {
+ kvm_virtio_pci_irqfd_release(proxy, queue_no, vector);
+ }
+}
+
+static int virtio_pci_vector_unmask(PCIDevice *dev, unsigned vector,
+ MSIMessage msg)
+{
+ VirtIOPCIProxy *proxy = container_of(dev, VirtIOPCIProxy, pci_dev);
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+ VirtQueue *vq = virtio_vector_first_queue(vdev, vector);
+ int ret, index, unmasked = 0;
+
+ while (vq) {
+ index = virtio_get_queue_index(vq);
+ if (!virtio_queue_get_num(vdev, index)) {
+ break;
+ }
+ if (index < proxy->nvqs_with_notifiers) {
+ ret = virtio_pci_vq_vector_unmask(proxy, index, vector, msg);
+ if (ret < 0) {
+ goto undo;
+ }
+ ++unmasked;
+ }
+ vq = virtio_vector_next_queue(vq);
+ }
+
+ return 0;
+
+undo:
+ vq = virtio_vector_first_queue(vdev, vector);
+ while (vq && unmasked >= 0) {
+ index = virtio_get_queue_index(vq);
+ if (index < proxy->nvqs_with_notifiers) {
+ virtio_pci_vq_vector_mask(proxy, index, vector);
+ --unmasked;
+ }
+ vq = virtio_vector_next_queue(vq);
+ }
+ return ret;
+}
+
+static void virtio_pci_vector_mask(PCIDevice *dev, unsigned vector)
+{
+ VirtIOPCIProxy *proxy = container_of(dev, VirtIOPCIProxy, pci_dev);
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+ VirtQueue *vq = virtio_vector_first_queue(vdev, vector);
+ int index;
+
+ while (vq) {
+ index = virtio_get_queue_index(vq);
+ if (!virtio_queue_get_num(vdev, index)) {
+ break;
+ }
+ if (index < proxy->nvqs_with_notifiers) {
+ virtio_pci_vq_vector_mask(proxy, index, vector);
+ }
+ vq = virtio_vector_next_queue(vq);
+ }
+}
+
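+/*
+ * Re-sync MSI-X pending bits after mask/unmask or table writes: for
+ * every queue whose (masked) vector lies in the polled range, set the
+ * vector pending if the device reports an interrupt as pending, or if
+ * the guest notifier fired while the vector was masked.
+ */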
+static void virtio_pci_vector_poll(PCIDevice *dev,
+ unsigned int vector_start,
+ unsigned int vector_end)
+{
+ VirtIOPCIProxy *proxy = container_of(dev, VirtIOPCIProxy, pci_dev);
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+ VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+ int queue_no;
+ unsigned int vector;
+ EventNotifier *notifier;
+ VirtQueue *vq;
+
+ for (queue_no = 0; queue_no < proxy->nvqs_with_notifiers; queue_no++) {
+ if (!virtio_queue_get_num(vdev, queue_no)) {
+ break;
+ }
+ vector = virtio_queue_vector(vdev, queue_no);
+ if (vector < vector_start || vector >= vector_end ||
+ !msix_is_masked(dev, vector)) {
+ continue;
+ }
+ vq = virtio_get_queue(vdev, queue_no);
+ notifier = virtio_queue_get_guest_notifier(vq);
+ if (k->guest_notifier_pending) {
+ if (k->guest_notifier_pending(vdev, queue_no)) {
+ msix_set_pending(dev, vector);
+ }
+ } else if (event_notifier_test_and_clear(notifier)) {
+ msix_set_pending(dev, vector);
+ }
+ }
+}
+
+static int virtio_pci_set_guest_notifier(DeviceState *d, int n, bool assign,
+ bool with_irqfd)
+{
+ VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
+ VirtQueue *vq = virtio_get_queue(vdev, n);
+ EventNotifier *notifier = virtio_queue_get_guest_notifier(vq);
+
+ if (assign) {
+ int r = event_notifier_init(notifier, 0);
+ if (r < 0) {
+ return r;
+ }
+ virtio_queue_set_guest_notifier_fd_handler(vq, true, with_irqfd);
+ } else {
+ virtio_queue_set_guest_notifier_fd_handler(vq, false, with_irqfd);
+ event_notifier_cleanup(notifier);
+ }
+
+ if (!msix_enabled(&proxy->pci_dev) &&
+ vdev->use_guest_notifier_mask &&
+ vdc->guest_notifier_mask) {
+ vdc->guest_notifier_mask(vdev, n, !assign);
+ }
+
+ return 0;
+}
+
+static bool virtio_pci_query_guest_notifiers(DeviceState *d)
+{
+ VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
+ return msix_enabled(&proxy->pci_dev);
+}
+
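+/*
+ * Assign or deassign guest notifiers for the first nvqs queues. When
+ * MSI-X is enabled and KVM can route MSIs via irqfd, the notifiers are
+ * wired straight into the in-kernel irqchip; otherwise interrupts are
+ * delivered from user space through the MSI-X mask/unmask/poll
+ * notifiers registered here.
+ */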
+static int virtio_pci_set_guest_notifiers(DeviceState *d, int nvqs, bool assign)
+{
+ VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+ VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+ int r, n;
+ bool with_irqfd = msix_enabled(&proxy->pci_dev) &&
+ kvm_msi_via_irqfd_enabled();
+
+ nvqs = MIN(nvqs, VIRTIO_QUEUE_MAX);
+
+ /*
+ * When deassigning, pass a consistent nvqs value to avoid leaking
+ * notifiers. But first check we've actually been configured, exit
+ * early if we haven't.
+ */
+ if (!assign && !proxy->nvqs_with_notifiers) {
+ return 0;
+ }
+ assert(assign || nvqs == proxy->nvqs_with_notifiers);
+
+ proxy->nvqs_with_notifiers = nvqs;
+
+ /* Must unset vector notifier while guest notifier is still assigned */
+ if ((proxy->vector_irqfd || k->guest_notifier_mask) && !assign) {
+ msix_unset_vector_notifiers(&proxy->pci_dev);
+ if (proxy->vector_irqfd) {
+ kvm_virtio_pci_vector_release(proxy, nvqs);
+ g_free(proxy->vector_irqfd);
+ proxy->vector_irqfd = NULL;
+ }
+ }
+
+ for (n = 0; n < nvqs; n++) {
+ if (!virtio_queue_get_num(vdev, n)) {
+ break;
+ }
+
+ r = virtio_pci_set_guest_notifier(d, n, assign, with_irqfd);
+ if (r < 0) {
+ goto assign_error;
+ }
+ }
+
+ /* Must set vector notifier after guest notifier has been assigned */
+ if ((with_irqfd || k->guest_notifier_mask) && assign) {
+ if (with_irqfd) {
+ proxy->vector_irqfd =
+ g_malloc0(sizeof(*proxy->vector_irqfd) *
+ msix_nr_vectors_allocated(&proxy->pci_dev));
+ r = kvm_virtio_pci_vector_use(proxy, nvqs);
+ if (r < 0) {
+ goto assign_error;
+ }
+ }
+ r = msix_set_vector_notifiers(&proxy->pci_dev,
+ virtio_pci_vector_unmask,
+ virtio_pci_vector_mask,
+ virtio_pci_vector_poll);
+ if (r < 0) {
+ goto notifiers_error;
+ }
+ }
+
+ return 0;
+
+notifiers_error:
+ if (with_irqfd) {
+ assert(assign);
+ kvm_virtio_pci_vector_release(proxy, nvqs);
+ }
+
+assign_error:
+ /* We get here on assignment failure. Recover by undoing for VQs 0 .. n. */
+ assert(assign);
+ while (--n >= 0) {
+ virtio_pci_set_guest_notifier(d, n, !assign, with_irqfd);
+ }
+ return r;
+}
+
+static int virtio_pci_set_host_notifier_mr(DeviceState *d, int n,
+ MemoryRegion *mr, bool assign)
+{
+ VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
+ int offset;
+
+ if (n >= VIRTIO_QUEUE_MAX || !virtio_pci_modern(proxy) ||
+ virtio_pci_queue_mem_mult(proxy) != memory_region_size(mr)) {
+ return -1;
+ }
+
+ if (assign) {
+ offset = virtio_pci_queue_mem_mult(proxy) * n;
+ memory_region_add_subregion_overlap(&proxy->notify.mr, offset, mr, 1);
+ } else {
+ memory_region_del_subregion(&proxy->notify.mr, mr);
+ }
+
+ return 0;
+}
+
+static void virtio_pci_vmstate_change(DeviceState *d, bool running)
+{
+ VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+
+ if (running) {
+ /* Old QEMU versions did not set bus master enable on status write.
+ * Detect DRIVER set and enable it.
+ */
+ if ((proxy->flags & VIRTIO_PCI_FLAG_BUS_MASTER_BUG_MIGRATION) &&
+ (vdev->status & VIRTIO_CONFIG_S_DRIVER) &&
+ !(proxy->pci_dev.config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
+ pci_default_write_config(&proxy->pci_dev, PCI_COMMAND,
+ proxy->pci_dev.config[PCI_COMMAND] |
+ PCI_COMMAND_MASTER, 1);
+ }
+ virtio_pci_start_ioeventfd(proxy);
+ } else {
+ virtio_pci_stop_ioeventfd(proxy);
+ }
+}
+
+/*
+ * virtio-pci: This is the PCIDevice which has a virtio-pci-bus.
+ */
+
+static int virtio_pci_query_nvectors(DeviceState *d)
+{
+ VirtIOPCIProxy *proxy = VIRTIO_PCI(d);
+
+ return proxy->nvectors;
+}
+
+static AddressSpace *virtio_pci_get_dma_as(DeviceState *d)
+{
+ VirtIOPCIProxy *proxy = VIRTIO_PCI(d);
+ PCIDevice *dev = &proxy->pci_dev;
+
+ return pci_get_address_space(dev);
+}
+
+static bool virtio_pci_iommu_enabled(DeviceState *d)
+{
+ VirtIOPCIProxy *proxy = VIRTIO_PCI(d);
+ PCIDevice *dev = &proxy->pci_dev;
+ AddressSpace *dma_as = pci_device_iommu_address_space(dev);
+
+ if (dma_as == &address_space_memory) {
+ return false;
+ }
+
+ return true;
+}
+
+static bool virtio_pci_queue_enabled(DeviceState *d, int n)
+{
+ VirtIOPCIProxy *proxy = VIRTIO_PCI(d);
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+ return proxy->vqs[n].enabled;
+ }
+
+ return virtio_queue_enabled_legacy(vdev, n);
+}
+
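+/*
+ * Append a vendor-specific (PCI_CAP_ID_VNDR) capability describing one
+ * modern virtio region. Everything following the generic two-byte
+ * capability header is copied verbatim from the template the caller
+ * filled in.
+ */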
+static int virtio_pci_add_mem_cap(VirtIOPCIProxy *proxy,
+ struct virtio_pci_cap *cap)
+{
+ PCIDevice *dev = &proxy->pci_dev;
+ int offset;
+
+ offset = pci_add_capability(dev, PCI_CAP_ID_VNDR, 0,
+ cap->cap_len, &error_abort);
+
+ assert(cap->cap_len >= sizeof *cap);
+ memcpy(dev->config + offset + PCI_CAP_FLAGS, &cap->cap_len,
+ cap->cap_len - PCI_CAP_FLAGS);
+
+ return offset;
+}
+
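+/*
+ * Accessors for the modern common configuration structure
+ * (VIRTIO_PCI_CAP_COMMON_CFG). The 64-bit feature words are exposed 32
+ * bits at a time: the driver writes DFSELECT/GFSELECT (0 or 1) and then
+ * reads or writes the selected half through DF/GF.
+ */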
+static uint64_t virtio_pci_common_read(void *opaque, hwaddr addr,
+ unsigned size)
+{
+ VirtIOPCIProxy *proxy = opaque;
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+ uint32_t val = 0;
+ int i;
+
+ if (vdev == NULL) {
+ return UINT64_MAX;
+ }
+
+ switch (addr) {
+ case VIRTIO_PCI_COMMON_DFSELECT:
+ val = proxy->dfselect;
+ break;
+ case VIRTIO_PCI_COMMON_DF:
+ if (proxy->dfselect <= 1) {
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
+
+ val = (vdev->host_features & ~vdc->legacy_features) >>
+ (32 * proxy->dfselect);
+ }
+ break;
+ case VIRTIO_PCI_COMMON_GFSELECT:
+ val = proxy->gfselect;
+ break;
+ case VIRTIO_PCI_COMMON_GF:
+ if (proxy->gfselect < ARRAY_SIZE(proxy->guest_features)) {
+ val = proxy->guest_features[proxy->gfselect];
+ }
+ break;
+ case VIRTIO_PCI_COMMON_MSIX:
+ val = vdev->config_vector;
+ break;
+ case VIRTIO_PCI_COMMON_NUMQ:
+ for (i = 0; i < VIRTIO_QUEUE_MAX; ++i) {
+ if (virtio_queue_get_num(vdev, i)) {
+ val = i + 1;
+ }
+ }
+ break;
+ case VIRTIO_PCI_COMMON_STATUS:
+ val = vdev->status;
+ break;
+ case VIRTIO_PCI_COMMON_CFGGENERATION:
+ val = vdev->generation;
+ break;
+ case VIRTIO_PCI_COMMON_Q_SELECT:
+ val = vdev->queue_sel;
+ break;
+ case VIRTIO_PCI_COMMON_Q_SIZE:
+ val = virtio_queue_get_num(vdev, vdev->queue_sel);
+ break;
+ case VIRTIO_PCI_COMMON_Q_MSIX:
+ val = virtio_queue_vector(vdev, vdev->queue_sel);
+ break;
+ case VIRTIO_PCI_COMMON_Q_ENABLE:
+ val = proxy->vqs[vdev->queue_sel].enabled;
+ break;
+ case VIRTIO_PCI_COMMON_Q_NOFF:
+ /* Simply map queues in order */
+ val = vdev->queue_sel;
+ break;
+ case VIRTIO_PCI_COMMON_Q_DESCLO:
+ val = proxy->vqs[vdev->queue_sel].desc[0];
+ break;
+ case VIRTIO_PCI_COMMON_Q_DESCHI:
+ val = proxy->vqs[vdev->queue_sel].desc[1];
+ break;
+ case VIRTIO_PCI_COMMON_Q_AVAILLO:
+ val = proxy->vqs[vdev->queue_sel].avail[0];
+ break;
+ case VIRTIO_PCI_COMMON_Q_AVAILHI:
+ val = proxy->vqs[vdev->queue_sel].avail[1];
+ break;
+ case VIRTIO_PCI_COMMON_Q_USEDLO:
+ val = proxy->vqs[vdev->queue_sel].used[0];
+ break;
+ case VIRTIO_PCI_COMMON_Q_USEDHI:
+ val = proxy->vqs[vdev->queue_sel].used[1];
+ break;
+ case VIRTIO_PCI_COMMON_Q_RESET:
+ val = proxy->vqs[vdev->queue_sel].reset;
+ break;
+ default:
+ val = 0;
+ }
+
+ return val;
+}
+
+static void virtio_pci_common_write(void *opaque, hwaddr addr,
+ uint64_t val, unsigned size)
+{
+ VirtIOPCIProxy *proxy = opaque;
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+ uint16_t vector;
+
+ if (vdev == NULL) {
+ return;
+ }
+
+ switch (addr) {
+ case VIRTIO_PCI_COMMON_DFSELECT:
+ proxy->dfselect = val;
+ break;
+ case VIRTIO_PCI_COMMON_GFSELECT:
+ proxy->gfselect = val;
+ break;
+ case VIRTIO_PCI_COMMON_GF:
+ if (proxy->gfselect < ARRAY_SIZE(proxy->guest_features)) {
+ proxy->guest_features[proxy->gfselect] = val;
+ virtio_set_features(vdev,
+ (((uint64_t)proxy->guest_features[1]) << 32) |
+ proxy->guest_features[0]);
+ }
+ break;
+ case VIRTIO_PCI_COMMON_MSIX:
+ if (vdev->config_vector != VIRTIO_NO_VECTOR) {
+ msix_vector_unuse(&proxy->pci_dev, vdev->config_vector);
+ }
+        /* Let the guest discover that an error took place. */
+ if (val < proxy->nvectors) {
+ msix_vector_use(&proxy->pci_dev, val);
+ } else {
+ val = VIRTIO_NO_VECTOR;
+ }
+ vdev->config_vector = val;
+ break;
+ case VIRTIO_PCI_COMMON_STATUS:
+ if (!(val & VIRTIO_CONFIG_S_DRIVER_OK)) {
+ virtio_pci_stop_ioeventfd(proxy);
+ }
+
+ virtio_set_status(vdev, val & 0xFF);
+
+ if (val & VIRTIO_CONFIG_S_DRIVER_OK) {
+ virtio_pci_start_ioeventfd(proxy);
+ }
+
+ if (vdev->status == 0) {
+ virtio_pci_reset(DEVICE(proxy));
+ }
+
+ break;
+ case VIRTIO_PCI_COMMON_Q_SELECT:
+ if (val < VIRTIO_QUEUE_MAX) {
+ vdev->queue_sel = val;
+ }
+ break;
+ case VIRTIO_PCI_COMMON_Q_SIZE:
+ proxy->vqs[vdev->queue_sel].num = val;
+ virtio_queue_set_num(vdev, vdev->queue_sel,
+ proxy->vqs[vdev->queue_sel].num);
+ break;
+ case VIRTIO_PCI_COMMON_Q_MSIX:
+ vector = virtio_queue_vector(vdev, vdev->queue_sel);
+ if (vector != VIRTIO_NO_VECTOR) {
+ msix_vector_unuse(&proxy->pci_dev, vector);
+ }
+        /* Let the guest discover that an error took place. */
+ if (val < proxy->nvectors) {
+ msix_vector_use(&proxy->pci_dev, val);
+ } else {
+ val = VIRTIO_NO_VECTOR;
+ }
+ virtio_queue_set_vector(vdev, vdev->queue_sel, val);
+ break;
+ case VIRTIO_PCI_COMMON_Q_ENABLE:
+ if (val == 1) {
+ virtio_queue_set_num(vdev, vdev->queue_sel,
+ proxy->vqs[vdev->queue_sel].num);
+ virtio_queue_set_rings(vdev, vdev->queue_sel,
+ ((uint64_t)proxy->vqs[vdev->queue_sel].desc[1]) << 32 |
+ proxy->vqs[vdev->queue_sel].desc[0],
+ ((uint64_t)proxy->vqs[vdev->queue_sel].avail[1]) << 32 |
+ proxy->vqs[vdev->queue_sel].avail[0],
+ ((uint64_t)proxy->vqs[vdev->queue_sel].used[1]) << 32 |
+ proxy->vqs[vdev->queue_sel].used[0]);
+ proxy->vqs[vdev->queue_sel].enabled = 1;
+ proxy->vqs[vdev->queue_sel].reset = 0;
+ virtio_queue_enable(vdev, vdev->queue_sel);
+ } else {
+ virtio_error(vdev, "wrong value for queue_enable %"PRIx64, val);
+ }
+ break;
+ case VIRTIO_PCI_COMMON_Q_DESCLO:
+ proxy->vqs[vdev->queue_sel].desc[0] = val;
+ break;
+ case VIRTIO_PCI_COMMON_Q_DESCHI:
+ proxy->vqs[vdev->queue_sel].desc[1] = val;
+ break;
+ case VIRTIO_PCI_COMMON_Q_AVAILLO:
+ proxy->vqs[vdev->queue_sel].avail[0] = val;
+ break;
+ case VIRTIO_PCI_COMMON_Q_AVAILHI:
+ proxy->vqs[vdev->queue_sel].avail[1] = val;
+ break;
+ case VIRTIO_PCI_COMMON_Q_USEDLO:
+ proxy->vqs[vdev->queue_sel].used[0] = val;
+ break;
+ case VIRTIO_PCI_COMMON_Q_USEDHI:
+ proxy->vqs[vdev->queue_sel].used[1] = val;
+ break;
+ case VIRTIO_PCI_COMMON_Q_RESET:
+ if (val == 1) {
+ proxy->vqs[vdev->queue_sel].reset = 1;
+
+ virtio_queue_reset(vdev, vdev->queue_sel);
+
+ proxy->vqs[vdev->queue_sel].reset = 0;
+ proxy->vqs[vdev->queue_sel].enabled = 0;
+ }
+ break;
+ default:
+ break;
+ }
+}
+
+static uint64_t virtio_pci_notify_read(void *opaque, hwaddr addr,
+ unsigned size)
+{
+ VirtIOPCIProxy *proxy = opaque;
+ if (virtio_bus_get_device(&proxy->bus) == NULL) {
+ return UINT64_MAX;
+ }
+
+ return 0;
+}
+
+static void virtio_pci_notify_write(void *opaque, hwaddr addr,
+ uint64_t val, unsigned size)
+{
+ VirtIOPCIProxy *proxy = opaque;
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+    unsigned queue = addr / virtio_pci_queue_mem_mult(proxy);
+
+ if (vdev != NULL && queue < VIRTIO_QUEUE_MAX) {
+ trace_virtio_pci_notify_write(addr, val, size);
+ virtio_queue_notify(vdev, queue);
+ }
+}
+
+static void virtio_pci_notify_write_pio(void *opaque, hwaddr addr,
+ uint64_t val, unsigned size)
+{
+ VirtIOPCIProxy *proxy = opaque;
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+    unsigned queue = val;
+
+ if (vdev != NULL && queue < VIRTIO_QUEUE_MAX) {
+ trace_virtio_pci_notify_write_pio(addr, val, size);
+ virtio_queue_notify(vdev, queue);
+ }
+}
+
+static uint64_t virtio_pci_isr_read(void *opaque, hwaddr addr,
+ unsigned size)
+{
+ VirtIOPCIProxy *proxy = opaque;
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+ uint64_t val;
+
+ if (vdev == NULL) {
+ return UINT64_MAX;
+ }
+
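+    /* Reading the ISR clears it and deasserts the INTx interrupt line. */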
+ val = qatomic_xchg(&vdev->isr, 0);
+ pci_irq_deassert(&proxy->pci_dev);
+ return val;
+}
+
+static void virtio_pci_isr_write(void *opaque, hwaddr addr,
+ uint64_t val, unsigned size)
+{
+}
+
+static uint64_t virtio_pci_device_read(void *opaque, hwaddr addr,
+ unsigned size)
+{
+ VirtIOPCIProxy *proxy = opaque;
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+ uint64_t val;
+
+ if (vdev == NULL) {
+ return UINT64_MAX;
+ }
+
+ switch (size) {
+ case 1:
+ val = virtio_config_modern_readb(vdev, addr);
+ break;
+ case 2:
+ val = virtio_config_modern_readw(vdev, addr);
+ break;
+ case 4:
+ val = virtio_config_modern_readl(vdev, addr);
+ break;
+ default:
+ val = 0;
+ break;
+ }
+ return val;
+}
+
+static void virtio_pci_device_write(void *opaque, hwaddr addr,
+ uint64_t val, unsigned size)
+{
+ VirtIOPCIProxy *proxy = opaque;
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+
+ if (vdev == NULL) {
+ return;
+ }
+
+ switch (size) {
+ case 1:
+ virtio_config_modern_writeb(vdev, addr, val);
+ break;
+ case 2:
+ virtio_config_modern_writew(vdev, addr, val);
+ break;
+ case 4:
+ virtio_config_modern_writel(vdev, addr, val);
+ break;
+ }
+}
+
+static void virtio_pci_modern_regions_init(VirtIOPCIProxy *proxy,
+ const char *vdev_name)
+{
+ static const MemoryRegionOps common_ops = {
+ .read = virtio_pci_common_read,
+ .write = virtio_pci_common_write,
+ .impl = {
+ .min_access_size = 1,
+ .max_access_size = 4,
+ },
+ .endianness = DEVICE_LITTLE_ENDIAN,
+ };
+ static const MemoryRegionOps isr_ops = {
+ .read = virtio_pci_isr_read,
+ .write = virtio_pci_isr_write,
+ .impl = {
+ .min_access_size = 1,
+ .max_access_size = 4,
+ },
+ .endianness = DEVICE_LITTLE_ENDIAN,
+ };
+ static const MemoryRegionOps device_ops = {
+ .read = virtio_pci_device_read,
+ .write = virtio_pci_device_write,
+ .impl = {
+ .min_access_size = 1,
+ .max_access_size = 4,
+ },
+ .endianness = DEVICE_LITTLE_ENDIAN,
+ };
+ static const MemoryRegionOps notify_ops = {
+ .read = virtio_pci_notify_read,
+ .write = virtio_pci_notify_write,
+ .impl = {
+ .min_access_size = 1,
+ .max_access_size = 4,
+ },
+ .endianness = DEVICE_LITTLE_ENDIAN,
+ };
+ static const MemoryRegionOps notify_pio_ops = {
+ .read = virtio_pci_notify_read,
+ .write = virtio_pci_notify_write_pio,
+ .impl = {
+ .min_access_size = 1,
+ .max_access_size = 4,
+ },
+ .endianness = DEVICE_LITTLE_ENDIAN,
+ };
+ g_autoptr(GString) name = g_string_new(NULL);
+
+ g_string_printf(name, "virtio-pci-common-%s", vdev_name);
+ memory_region_init_io(&proxy->common.mr, OBJECT(proxy),
+ &common_ops,
+ proxy,
+ name->str,
+ proxy->common.size);
+
+ g_string_printf(name, "virtio-pci-isr-%s", vdev_name);
+ memory_region_init_io(&proxy->isr.mr, OBJECT(proxy),
+ &isr_ops,
+ proxy,
+ name->str,
+ proxy->isr.size);
+
+ g_string_printf(name, "virtio-pci-device-%s", vdev_name);
+ memory_region_init_io(&proxy->device.mr, OBJECT(proxy),
+ &device_ops,
+ proxy,
+ name->str,
+ proxy->device.size);
+
+ g_string_printf(name, "virtio-pci-notify-%s", vdev_name);
+ memory_region_init_io(&proxy->notify.mr, OBJECT(proxy),
+ &notify_ops,
+ proxy,
+ name->str,
+ proxy->notify.size);
+
+ g_string_printf(name, "virtio-pci-notify-pio-%s", vdev_name);
+ memory_region_init_io(&proxy->notify_pio.mr, OBJECT(proxy),
+ &notify_pio_ops,
+ proxy,
+ name->str,
+ proxy->notify_pio.size);
+}
+
+static void virtio_pci_modern_region_map(VirtIOPCIProxy *proxy,
+ VirtIOPCIRegion *region,
+ struct virtio_pci_cap *cap,
+ MemoryRegion *mr,
+ uint8_t bar)
+{
+ memory_region_add_subregion(mr, region->offset, &region->mr);
+
+ cap->cfg_type = region->type;
+ cap->bar = bar;
+ cap->offset = cpu_to_le32(region->offset);
+ cap->length = cpu_to_le32(region->size);
+    virtio_pci_add_mem_cap(proxy, cap);
+}
+
+static void virtio_pci_modern_mem_region_map(VirtIOPCIProxy *proxy,
+ VirtIOPCIRegion *region,
+ struct virtio_pci_cap *cap)
+{
+ virtio_pci_modern_region_map(proxy, region, cap,
+ &proxy->modern_bar, proxy->modern_mem_bar_idx);
+}
+
+static void virtio_pci_modern_io_region_map(VirtIOPCIProxy *proxy,
+ VirtIOPCIRegion *region,
+ struct virtio_pci_cap *cap)
+{
+ virtio_pci_modern_region_map(proxy, region, cap,
+ &proxy->io_bar, proxy->modern_io_bar_idx);
+}
+
+static void virtio_pci_modern_mem_region_unmap(VirtIOPCIProxy *proxy,
+ VirtIOPCIRegion *region)
+{
+ memory_region_del_subregion(&proxy->modern_bar,
+ &region->mr);
+}
+
+static void virtio_pci_modern_io_region_unmap(VirtIOPCIProxy *proxy,
+ VirtIOPCIRegion *region)
+{
+ memory_region_del_subregion(&proxy->io_bar,
+ &region->mr);
+}
+
+static void virtio_pci_pre_plugged(DeviceState *d, Error **errp)
+{
+ VirtIOPCIProxy *proxy = VIRTIO_PCI(d);
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+
+ if (virtio_pci_modern(proxy)) {
+ virtio_add_feature(&vdev->host_features, VIRTIO_F_VERSION_1);
+ }
+
+ virtio_add_feature(&vdev->host_features, VIRTIO_F_BAD_FEATURE);
+}
+
+/* This is called by virtio-bus just after the device is plugged. */
+static void virtio_pci_device_plugged(DeviceState *d, Error **errp)
+{
+ VirtIOPCIProxy *proxy = VIRTIO_PCI(d);
+ VirtioBusState *bus = &proxy->bus;
+ bool legacy = virtio_pci_legacy(proxy);
+ bool modern;
+ bool modern_pio = proxy->flags & VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY;
+ uint8_t *config;
+ uint32_t size;
+ VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
+
+    /*
+     * Virtio capabilities present without VIRTIO_F_VERSION_1 confuse
+     * guests.
+     */
+ if (!proxy->ignore_backend_features &&
+ !virtio_has_feature(vdev->host_features, VIRTIO_F_VERSION_1)) {
+ virtio_pci_disable_modern(proxy);
+
+ if (!legacy) {
+ error_setg(errp, "Device doesn't support modern mode, and legacy"
+ " mode is disabled");
+ error_append_hint(errp, "Set disable-legacy to off\n");
+
+ return;
+ }
+ }
+
+ modern = virtio_pci_modern(proxy);
+
+ config = proxy->pci_dev.config;
+ if (proxy->class_code) {
+ pci_config_set_class(config, proxy->class_code);
+ }
+
+ if (legacy) {
+ if (!virtio_legacy_allowed(vdev)) {
+ /*
+ * To avoid migration issues, we allow legacy mode when legacy
+ * check is disabled in the old machine types (< 5.1).
+ */
+ if (virtio_legacy_check_disabled(vdev)) {
+ warn_report("device is modern-only, but for backward "
+ "compatibility legacy is allowed");
+ } else {
+ error_setg(errp,
+ "device is modern-only, use disable-legacy=on");
+ return;
+ }
+ }
+ if (virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM)) {
+ error_setg(errp, "VIRTIO_F_IOMMU_PLATFORM was supported by"
+ " neither legacy nor transitional device");
+ return;
+ }
+ /*
+ * Legacy and transitional devices use specific subsystem IDs.
+ * Note that the subsystem vendor ID (config + PCI_SUBSYSTEM_VENDOR_ID)
+ * is set to PCI_SUBVENDOR_ID_REDHAT_QUMRANET by default.
+ */
+ pci_set_word(config + PCI_SUBSYSTEM_ID, virtio_bus_get_vdev_id(bus));
+ } else {
+ /* pure virtio-1.0 */
+ pci_set_word(config + PCI_VENDOR_ID,
+ PCI_VENDOR_ID_REDHAT_QUMRANET);
+ pci_set_word(config + PCI_DEVICE_ID,
+ PCI_DEVICE_ID_VIRTIO_10_BASE + virtio_bus_get_vdev_id(bus));
+ pci_config_set_revision(config, 1);
+ }
+ config[PCI_INTERRUPT_PIN] = 1;
+
+ if (modern) {
+ struct virtio_pci_cap cap = {
+ .cap_len = sizeof cap,
+ };
+ struct virtio_pci_notify_cap notify = {
+ .cap.cap_len = sizeof notify,
+ .notify_off_multiplier =
+ cpu_to_le32(virtio_pci_queue_mem_mult(proxy)),
+ };
+ struct virtio_pci_cfg_cap cfg = {
+ .cap.cap_len = sizeof cfg,
+ .cap.cfg_type = VIRTIO_PCI_CAP_PCI_CFG,
+ };
+ struct virtio_pci_notify_cap notify_pio = {
+ .cap.cap_len = sizeof notify,
+ .notify_off_multiplier = cpu_to_le32(0x0),
+ };
+
+ struct virtio_pci_cfg_cap *cfg_mask;
+
+ virtio_pci_modern_regions_init(proxy, vdev->name);
+
+ virtio_pci_modern_mem_region_map(proxy, &proxy->common, &cap);
+ virtio_pci_modern_mem_region_map(proxy, &proxy->isr, &cap);
+ virtio_pci_modern_mem_region_map(proxy, &proxy->device, &cap);
+ virtio_pci_modern_mem_region_map(proxy, &proxy->notify, &notify.cap);
+
+ if (modern_pio) {
+ memory_region_init(&proxy->io_bar, OBJECT(proxy),
+ "virtio-pci-io", 0x4);
+
+ pci_register_bar(&proxy->pci_dev, proxy->modern_io_bar_idx,
+ PCI_BASE_ADDRESS_SPACE_IO, &proxy->io_bar);
+
+ virtio_pci_modern_io_region_map(proxy, &proxy->notify_pio,
+ &notify_pio.cap);
+ }
+
+ pci_register_bar(&proxy->pci_dev, proxy->modern_mem_bar_idx,
+ PCI_BASE_ADDRESS_SPACE_MEMORY |
+ PCI_BASE_ADDRESS_MEM_PREFETCH |
+ PCI_BASE_ADDRESS_MEM_TYPE_64,
+ &proxy->modern_bar);
+
+ proxy->config_cap = virtio_pci_add_mem_cap(proxy, &cfg.cap);
+ cfg_mask = (void *)(proxy->pci_dev.wmask + proxy->config_cap);
+ pci_set_byte(&cfg_mask->cap.bar, ~0x0);
+ pci_set_long((uint8_t *)&cfg_mask->cap.offset, ~0x0);
+ pci_set_long((uint8_t *)&cfg_mask->cap.length, ~0x0);
+ pci_set_long(cfg_mask->pci_cfg_data, ~0x0);
+ }
+
+ if (proxy->nvectors) {
+ int err = msix_init_exclusive_bar(&proxy->pci_dev, proxy->nvectors,
+ proxy->msix_bar_idx, NULL);
+ if (err) {
+            /* Notice when a system that supports MSI-X can't initialize it */
+ if (err != -ENOTSUP) {
+ warn_report("unable to init msix vectors to %" PRIu32,
+ proxy->nvectors);
+ }
+ proxy->nvectors = 0;
+ }
+ }
+
+ proxy->pci_dev.config_write = virtio_write_config;
+ proxy->pci_dev.config_read = virtio_read_config;
+
+ if (legacy) {
+ size = VIRTIO_PCI_REGION_SIZE(&proxy->pci_dev)
+ + virtio_bus_get_vdev_config_len(bus);
+ size = pow2ceil(size);
+
+ memory_region_init_io(&proxy->bar, OBJECT(proxy),
+ &virtio_pci_config_ops,
+ proxy, "virtio-pci", size);
+
+ pci_register_bar(&proxy->pci_dev, proxy->legacy_io_bar_idx,
+ PCI_BASE_ADDRESS_SPACE_IO, &proxy->bar);
+ }
+}
+
+static void virtio_pci_device_unplugged(DeviceState *d)
+{
+ VirtIOPCIProxy *proxy = VIRTIO_PCI(d);
+ bool modern = virtio_pci_modern(proxy);
+ bool modern_pio = proxy->flags & VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY;
+
+ virtio_pci_stop_ioeventfd(proxy);
+
+ if (modern) {
+ virtio_pci_modern_mem_region_unmap(proxy, &proxy->common);
+ virtio_pci_modern_mem_region_unmap(proxy, &proxy->isr);
+ virtio_pci_modern_mem_region_unmap(proxy, &proxy->device);
+ virtio_pci_modern_mem_region_unmap(proxy, &proxy->notify);
+ if (modern_pio) {
+ virtio_pci_modern_io_region_unmap(proxy, &proxy->notify_pio);
+ }
+ }
+}
+
+static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp)
+{
+ VirtIOPCIProxy *proxy = VIRTIO_PCI(pci_dev);
+ VirtioPCIClass *k = VIRTIO_PCI_GET_CLASS(pci_dev);
+ bool pcie_port = pci_bus_is_express(pci_get_bus(pci_dev)) &&
+ !pci_bus_is_root(pci_get_bus(pci_dev));
+
+ if (kvm_enabled() && !kvm_has_many_ioeventfds()) {
+ proxy->flags &= ~VIRTIO_PCI_FLAG_USE_IOEVENTFD;
+ }
+
+ /* fd-based ioevents can't be synchronized in record/replay */
+ if (replay_mode != REPLAY_MODE_NONE) {
+ proxy->flags &= ~VIRTIO_PCI_FLAG_USE_IOEVENTFD;
+ }
+
+ /*
+ * virtio pci bar layout used by default.
+ * subclasses can re-arrange things if needed.
+ *
+ * region 0 -- virtio legacy io bar
+ * region 1 -- msi-x bar
+ * region 2 -- virtio modern io bar (off by default)
+ * region 4+5 -- virtio modern memory (64bit) bar
+ *
+ */
+ proxy->legacy_io_bar_idx = 0;
+ proxy->msix_bar_idx = 1;
+ proxy->modern_io_bar_idx = 2;
+ proxy->modern_mem_bar_idx = 4;
+
+ proxy->common.offset = 0x0;
+ proxy->common.size = 0x1000;
+ proxy->common.type = VIRTIO_PCI_CAP_COMMON_CFG;
+
+ proxy->isr.offset = 0x1000;
+ proxy->isr.size = 0x1000;
+ proxy->isr.type = VIRTIO_PCI_CAP_ISR_CFG;
+
+ proxy->device.offset = 0x2000;
+ proxy->device.size = 0x1000;
+ proxy->device.type = VIRTIO_PCI_CAP_DEVICE_CFG;
+
+ proxy->notify.offset = 0x3000;
+ proxy->notify.size = virtio_pci_queue_mem_mult(proxy) * VIRTIO_QUEUE_MAX;
+ proxy->notify.type = VIRTIO_PCI_CAP_NOTIFY_CFG;
+
+ proxy->notify_pio.offset = 0x0;
+ proxy->notify_pio.size = 0x4;
+ proxy->notify_pio.type = VIRTIO_PCI_CAP_NOTIFY_CFG;
+
+ /* subclasses can enforce modern, so do this unconditionally */
+ memory_region_init(&proxy->modern_bar, OBJECT(proxy), "virtio-pci",
+ /* PCI BAR regions must be powers of 2 */
+ pow2ceil(proxy->notify.offset + proxy->notify.size));
+
+ if (proxy->disable_legacy == ON_OFF_AUTO_AUTO) {
+ proxy->disable_legacy = pcie_port ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
+ }
+
+ if (!virtio_pci_modern(proxy) && !virtio_pci_legacy(proxy)) {
+ error_setg(errp, "device cannot work as neither modern nor legacy mode"
+ " is enabled");
+ error_append_hint(errp, "Set either disable-modern or disable-legacy"
+ " to off\n");
+ return;
+ }
+
+ if (pcie_port && pci_is_express(pci_dev)) {
+ int pos;
+ uint16_t last_pcie_cap_offset = PCI_CONFIG_SPACE_SIZE;
+
+ pos = pcie_endpoint_cap_init(pci_dev, 0);
+ assert(pos > 0);
+
+ pos = pci_add_capability(pci_dev, PCI_CAP_ID_PM, 0,
+ PCI_PM_SIZEOF, errp);
+ if (pos < 0) {
+ return;
+ }
+
+ pci_dev->exp.pm_cap = pos;
+
+ /*
+ * Indicates that this function complies with revision 1.2 of the
+ * PCI Power Management Interface Specification.
+ */
+ pci_set_word(pci_dev->config + pos + PCI_PM_PMC, 0x3);
+
+ if (proxy->flags & VIRTIO_PCI_FLAG_AER) {
+ pcie_aer_init(pci_dev, PCI_ERR_VER, last_pcie_cap_offset,
+ PCI_ERR_SIZEOF, NULL);
+ last_pcie_cap_offset += PCI_ERR_SIZEOF;
+ }
+
+ if (proxy->flags & VIRTIO_PCI_FLAG_INIT_DEVERR) {
+ /* Init error enabling flags */
+ pcie_cap_deverr_init(pci_dev);
+ }
+
+ if (proxy->flags & VIRTIO_PCI_FLAG_INIT_LNKCTL) {
+ /* Init Link Control Register */
+ pcie_cap_lnkctl_init(pci_dev);
+ }
+
+ if (proxy->flags & VIRTIO_PCI_FLAG_INIT_PM) {
+ /* Init Power Management Control Register */
+ pci_set_word(pci_dev->wmask + pos + PCI_PM_CTRL,
+ PCI_PM_CTRL_STATE_MASK);
+ }
+
+ if (proxy->flags & VIRTIO_PCI_FLAG_ATS) {
+ pcie_ats_init(pci_dev, last_pcie_cap_offset,
+ proxy->flags & VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED);
+ last_pcie_cap_offset += PCI_EXT_CAP_ATS_SIZEOF;
+ }
+
+ if (proxy->flags & VIRTIO_PCI_FLAG_INIT_FLR) {
+ /* Set Function Level Reset capability bit */
+ pcie_cap_flr_init(pci_dev);
+ }
+ } else {
+ /*
+ * make future invocations of pci_is_express() return false
+ * and pci_config_size() return PCI_CONFIG_SPACE_SIZE.
+ */
+ pci_dev->cap_present &= ~QEMU_PCI_CAP_EXPRESS;
+ }
+
+ virtio_pci_bus_new(&proxy->bus, sizeof(proxy->bus), proxy);
+ if (k->realize) {
+ k->realize(proxy, errp);
+ }
+}
+
+static void virtio_pci_exit(PCIDevice *pci_dev)
+{
+ VirtIOPCIProxy *proxy = VIRTIO_PCI(pci_dev);
+ bool pcie_port = pci_bus_is_express(pci_get_bus(pci_dev)) &&
+ !pci_bus_is_root(pci_get_bus(pci_dev));
+
+ msix_uninit_exclusive_bar(pci_dev);
+ if (proxy->flags & VIRTIO_PCI_FLAG_AER && pcie_port &&
+ pci_is_express(pci_dev)) {
+ pcie_aer_exit(pci_dev);
+ }
+}
+
+static void virtio_pci_reset(DeviceState *qdev)
+{
+ VirtIOPCIProxy *proxy = VIRTIO_PCI(qdev);
+ VirtioBusState *bus = VIRTIO_BUS(&proxy->bus);
+ int i;
+
+ virtio_bus_reset(bus);
+ msix_unuse_all_vectors(&proxy->pci_dev);
+
+ for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
+ proxy->vqs[i].enabled = 0;
+ proxy->vqs[i].reset = 0;
+ proxy->vqs[i].num = 0;
+ proxy->vqs[i].desc[0] = proxy->vqs[i].desc[1] = 0;
+ proxy->vqs[i].avail[0] = proxy->vqs[i].avail[1] = 0;
+ proxy->vqs[i].used[0] = proxy->vqs[i].used[1] = 0;
+ }
+}
+
+static void virtio_pci_bus_reset(DeviceState *qdev)
+{
+ PCIDevice *dev = PCI_DEVICE(qdev);
+
+ virtio_pci_reset(qdev);
+
+ if (pci_is_express(dev)) {
+ pcie_cap_deverr_reset(dev);
+ pcie_cap_lnkctl_reset(dev);
+
+ pci_set_word(dev->config + dev->exp.pm_cap + PCI_PM_CTRL, 0);
+ }
+}
+
+static Property virtio_pci_properties[] = {
+ DEFINE_PROP_BIT("virtio-pci-bus-master-bug-migration", VirtIOPCIProxy, flags,
+ VIRTIO_PCI_FLAG_BUS_MASTER_BUG_MIGRATION_BIT, false),
+ DEFINE_PROP_BIT("migrate-extra", VirtIOPCIProxy, flags,
+ VIRTIO_PCI_FLAG_MIGRATE_EXTRA_BIT, true),
+ DEFINE_PROP_BIT("modern-pio-notify", VirtIOPCIProxy, flags,
+ VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY_BIT, false),
+ DEFINE_PROP_BIT("x-disable-pcie", VirtIOPCIProxy, flags,
+ VIRTIO_PCI_FLAG_DISABLE_PCIE_BIT, false),
+ DEFINE_PROP_BIT("page-per-vq", VirtIOPCIProxy, flags,
+ VIRTIO_PCI_FLAG_PAGE_PER_VQ_BIT, false),
+ DEFINE_PROP_BOOL("x-ignore-backend-features", VirtIOPCIProxy,
+ ignore_backend_features, false),
+ DEFINE_PROP_BIT("ats", VirtIOPCIProxy, flags,
+ VIRTIO_PCI_FLAG_ATS_BIT, false),
+ DEFINE_PROP_BIT("x-ats-page-aligned", VirtIOPCIProxy, flags,
+ VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED_BIT, true),
+ DEFINE_PROP_BIT("x-pcie-deverr-init", VirtIOPCIProxy, flags,
+ VIRTIO_PCI_FLAG_INIT_DEVERR_BIT, true),
+ DEFINE_PROP_BIT("x-pcie-lnkctl-init", VirtIOPCIProxy, flags,
+ VIRTIO_PCI_FLAG_INIT_LNKCTL_BIT, true),
+ DEFINE_PROP_BIT("x-pcie-pm-init", VirtIOPCIProxy, flags,
+ VIRTIO_PCI_FLAG_INIT_PM_BIT, true),
+ DEFINE_PROP_BIT("x-pcie-flr-init", VirtIOPCIProxy, flags,
+ VIRTIO_PCI_FLAG_INIT_FLR_BIT, true),
+ DEFINE_PROP_BIT("aer", VirtIOPCIProxy, flags,
+ VIRTIO_PCI_FLAG_AER_BIT, false),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void virtio_pci_dc_realize(DeviceState *qdev, Error **errp)
+{
+ VirtioPCIClass *vpciklass = VIRTIO_PCI_GET_CLASS(qdev);
+ VirtIOPCIProxy *proxy = VIRTIO_PCI(qdev);
+ PCIDevice *pci_dev = &proxy->pci_dev;
+
+ if (!(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_PCIE) &&
+ virtio_pci_modern(proxy)) {
+ pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS;
+ }
+
+ vpciklass->parent_dc_realize(qdev, errp);
+}
+
+static void virtio_pci_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
+ VirtioPCIClass *vpciklass = VIRTIO_PCI_CLASS(klass);
+
+ device_class_set_props(dc, virtio_pci_properties);
+ k->realize = virtio_pci_realize;
+ k->exit = virtio_pci_exit;
+ k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+ k->revision = VIRTIO_PCI_ABI_VERSION;
+ k->class_id = PCI_CLASS_OTHERS;
+ device_class_set_parent_realize(dc, virtio_pci_dc_realize,
+ &vpciklass->parent_dc_realize);
+ dc->reset = virtio_pci_bus_reset;
+}
+
+static const TypeInfo virtio_pci_info = {
+ .name = TYPE_VIRTIO_PCI,
+ .parent = TYPE_PCI_DEVICE,
+ .instance_size = sizeof(VirtIOPCIProxy),
+ .class_init = virtio_pci_class_init,
+ .class_size = sizeof(VirtioPCIClass),
+ .abstract = true,
+};
+
+static Property virtio_pci_generic_properties[] = {
+ DEFINE_PROP_ON_OFF_AUTO("disable-legacy", VirtIOPCIProxy, disable_legacy,
+ ON_OFF_AUTO_AUTO),
+ DEFINE_PROP_BOOL("disable-modern", VirtIOPCIProxy, disable_modern, false),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void virtio_pci_base_class_init(ObjectClass *klass, void *data)
+{
+ const VirtioPCIDeviceTypeInfo *t = data;
+ if (t->class_init) {
+ t->class_init(klass, NULL);
+ }
+}
+
+static void virtio_pci_generic_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+
+ device_class_set_props(dc, virtio_pci_generic_properties);
+}
+
+static void virtio_pci_transitional_instance_init(Object *obj)
+{
+ VirtIOPCIProxy *proxy = VIRTIO_PCI(obj);
+
+ proxy->disable_legacy = ON_OFF_AUTO_OFF;
+ proxy->disable_modern = false;
+}
+
+static void virtio_pci_non_transitional_instance_init(Object *obj)
+{
+ VirtIOPCIProxy *proxy = VIRTIO_PCI(obj);
+
+ proxy->disable_legacy = ON_OFF_AUTO_ON;
+ proxy->disable_modern = false;
+}
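+
+/*
+ * Note (editorial, not in the original patch): per the virtio spec, a
+ * "transitional" device exposes both the legacy and the modern interface
+ * (disable_legacy=off, disable_modern=off above), while a
+ * "non-transitional" device is modern-only (disable_legacy=on).
+ */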
+
+void virtio_pci_types_register(const VirtioPCIDeviceTypeInfo *t)
+{
+ char *base_name = NULL;
+ TypeInfo base_type_info = {
+ .name = t->base_name,
+ .parent = t->parent ? t->parent : TYPE_VIRTIO_PCI,
+ .instance_size = t->instance_size,
+ .instance_init = t->instance_init,
+ .class_size = t->class_size,
+ .abstract = true,
+ .interfaces = t->interfaces,
+ };
+ TypeInfo generic_type_info = {
+ .name = t->generic_name,
+ .parent = base_type_info.name,
+ .class_init = virtio_pci_generic_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { INTERFACE_PCIE_DEVICE },
+ { INTERFACE_CONVENTIONAL_PCI_DEVICE },
+ { }
+ },
+ };
+
+ if (!base_type_info.name) {
+        /*
+         * No base type -> register a single generic device type.
+         * Use an intermediate "%s-base-type" to add generic device props.
+         */
+ base_name = g_strdup_printf("%s-base-type", t->generic_name);
+ base_type_info.name = base_name;
+ base_type_info.class_init = virtio_pci_generic_class_init;
+
+ generic_type_info.parent = base_name;
+ generic_type_info.class_init = virtio_pci_base_class_init;
+ generic_type_info.class_data = (void *)t;
+
+ assert(!t->non_transitional_name);
+ assert(!t->transitional_name);
+ } else {
+ base_type_info.class_init = virtio_pci_base_class_init;
+ base_type_info.class_data = (void *)t;
+ }
+
+ type_register(&base_type_info);
+ if (generic_type_info.name) {
+ type_register(&generic_type_info);
+ }
+
+ if (t->non_transitional_name) {
+ const TypeInfo non_transitional_type_info = {
+ .name = t->non_transitional_name,
+ .parent = base_type_info.name,
+ .instance_init = virtio_pci_non_transitional_instance_init,
+ .interfaces = (InterfaceInfo[]) {
+ { INTERFACE_PCIE_DEVICE },
+ { INTERFACE_CONVENTIONAL_PCI_DEVICE },
+ { }
+ },
+ };
+ type_register(&non_transitional_type_info);
+ }
+
+ if (t->transitional_name) {
+ const TypeInfo transitional_type_info = {
+ .name = t->transitional_name,
+ .parent = base_type_info.name,
+ .instance_init = virtio_pci_transitional_instance_init,
+ .interfaces = (InterfaceInfo[]) {
+ /*
+ * Transitional virtio devices work only as Conventional PCI
+ * devices because they require PIO ports.
+ */
+ { INTERFACE_CONVENTIONAL_PCI_DEVICE },
+ { }
+ },
+ };
+ type_register(&transitional_type_info);
+ }
+ g_free(base_name);
+}
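+
+/*
+ * Illustrative sketch (hypothetical "virtio-foo" device, not part of this
+ * patch): a device registers itself by filling in a VirtioPCIDeviceTypeInfo
+ * and passing it to virtio_pci_types_register(), which creates the abstract
+ * base type plus the generic/transitional/non-transitional variants.
+ */
+#if 0 /* example only */
+static const VirtioPCIDeviceTypeInfo virtio_foo_pci_info = {
+    .base_name               = "virtio-foo-pci-base",
+    .generic_name            = "virtio-foo-pci",
+    .transitional_name       = "virtio-foo-pci-transitional",
+    .non_transitional_name   = "virtio-foo-pci-non-transitional",
+    .instance_size           = sizeof(VirtIOFooPCI),
+    .instance_init           = virtio_foo_pci_instance_init,
+    .class_init              = virtio_foo_pci_class_init,
+};
+
+static void virtio_foo_pci_register(void)
+{
+    virtio_pci_types_register(&virtio_foo_pci_info);
+}
+type_init(virtio_foo_pci_register)
+#endif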
+
+unsigned virtio_pci_optimal_num_queues(unsigned fixed_queues)
+{
+ /*
+ * 1:1 vq to vCPU mapping is ideal because the same vCPU that submitted
+ * virtqueue buffers can handle their completion. When a different vCPU
+ * handles completion it may need to IPI the vCPU that submitted the
+ * request and this adds overhead.
+ *
+ * Virtqueues consume guest RAM and MSI-X vectors. This is wasteful in
+ * guests with very many vCPUs and a device that is only used by a few
+ * vCPUs. Unfortunately optimizing that case requires manual pinning inside
+ * the guest, so those users might as well manually set the number of
+ * queues. There is no upper limit that can be applied automatically and
+ * doing so arbitrarily would result in a sudden performance drop once the
+ * threshold number of vCPUs is exceeded.
+ */
+ unsigned num_queues = current_machine->smp.cpus;
+
+ /*
+ * The maximum number of MSI-X vectors is PCI_MSIX_FLAGS_QSIZE + 1, but the
+ * config change interrupt and the fixed virtqueues must be taken into
+ * account too.
+ */
+ num_queues = MIN(num_queues, PCI_MSIX_FLAGS_QSIZE - fixed_queues);
+
+ /*
+ * There is a limit to how many virtqueues a device can have.
+ */
+ return MIN(num_queues, VIRTIO_QUEUE_MAX - fixed_queues);
+}
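+
+/*
+ * Worked example (editorial): with PCI_MSIX_FLAGS_QSIZE = 0x7ff and
+ * VIRTIO_QUEUE_MAX = 1024, a guest with 8 vCPUs and fixed_queues = 2
+ * (e.g. virtio-scsi's control and event queues) gets
+ * MIN(8, 0x7ff - 2, 1024 - 2) = 8 request queues; the vCPU count is the
+ * binding limit until it exceeds the MSI-X and virtqueue ceilings.
+ */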
+
+/* virtio-pci-bus */
+
+static void virtio_pci_bus_new(VirtioBusState *bus, size_t bus_size,
+ VirtIOPCIProxy *dev)
+{
+ DeviceState *qdev = DEVICE(dev);
+ char virtio_bus_name[] = "virtio-bus";
+
+ qbus_init(bus, bus_size, TYPE_VIRTIO_PCI_BUS, qdev, virtio_bus_name);
+}
+
+static void virtio_pci_bus_class_init(ObjectClass *klass, void *data)
+{
+ BusClass *bus_class = BUS_CLASS(klass);
+ VirtioBusClass *k = VIRTIO_BUS_CLASS(klass);
+ bus_class->max_dev = 1;
+ k->notify = virtio_pci_notify;
+ k->save_config = virtio_pci_save_config;
+ k->load_config = virtio_pci_load_config;
+ k->save_queue = virtio_pci_save_queue;
+ k->load_queue = virtio_pci_load_queue;
+ k->save_extra_state = virtio_pci_save_extra_state;
+ k->load_extra_state = virtio_pci_load_extra_state;
+ k->has_extra_state = virtio_pci_has_extra_state;
+ k->query_guest_notifiers = virtio_pci_query_guest_notifiers;
+ k->set_guest_notifiers = virtio_pci_set_guest_notifiers;
+ k->set_host_notifier_mr = virtio_pci_set_host_notifier_mr;
+ k->vmstate_change = virtio_pci_vmstate_change;
+ k->pre_plugged = virtio_pci_pre_plugged;
+ k->device_plugged = virtio_pci_device_plugged;
+ k->device_unplugged = virtio_pci_device_unplugged;
+ k->query_nvectors = virtio_pci_query_nvectors;
+ k->ioeventfd_enabled = virtio_pci_ioeventfd_enabled;
+ k->ioeventfd_assign = virtio_pci_ioeventfd_assign;
+ k->get_dma_as = virtio_pci_get_dma_as;
+ k->iommu_enabled = virtio_pci_iommu_enabled;
+ k->queue_enabled = virtio_pci_queue_enabled;
+}
+
+static const TypeInfo virtio_pci_bus_info = {
+ .name = TYPE_VIRTIO_PCI_BUS,
+ .parent = TYPE_VIRTIO_BUS,
+ .instance_size = sizeof(VirtioPCIBusState),
+ .class_size = sizeof(VirtioPCIBusClass),
+ .class_init = virtio_pci_bus_class_init,
+};
+
+static void virtio_pci_register_types(void)
+{
+ /* Base types: */
+ type_register_static(&virtio_pci_bus_info);
+ type_register_static(&virtio_pci_info);
+}
+
+type_init(virtio_pci_register_types)
+
diff --git a/hw/virtio/virtio-pmem-pci.c b/hw/virtio/virtio-pmem-pci.c
new file mode 100644
index 00000000..7d9f4ec1
--- /dev/null
+++ b/hw/virtio/virtio-pmem-pci.c
@@ -0,0 +1,127 @@
+/*
+ * Virtio PMEM PCI device
+ *
+ * Copyright (C) 2018-2019 Red Hat, Inc.
+ *
+ * Authors:
+ * Pankaj Gupta <pagupta@redhat.com>
+ * David Hildenbrand <david@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+#include "virtio-pmem-pci.h"
+#include "hw/mem/memory-device.h"
+#include "qapi/error.h"
+
+static void virtio_pmem_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+ VirtIOPMEMPCI *pmem_pci = VIRTIO_PMEM_PCI(vpci_dev);
+ DeviceState *vdev = DEVICE(&pmem_pci->vdev);
+
+ virtio_pci_force_virtio_1(vpci_dev);
+ qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
+}
+
+static void virtio_pmem_pci_set_addr(MemoryDeviceState *md, uint64_t addr,
+ Error **errp)
+{
+ object_property_set_uint(OBJECT(md), VIRTIO_PMEM_ADDR_PROP, addr, errp);
+}
+
+static uint64_t virtio_pmem_pci_get_addr(const MemoryDeviceState *md)
+{
+ return object_property_get_uint(OBJECT(md), VIRTIO_PMEM_ADDR_PROP,
+ &error_abort);
+}
+
+static MemoryRegion *virtio_pmem_pci_get_memory_region(MemoryDeviceState *md,
+ Error **errp)
+{
+ VirtIOPMEMPCI *pci_pmem = VIRTIO_PMEM_PCI(md);
+ VirtIOPMEM *pmem = VIRTIO_PMEM(&pci_pmem->vdev);
+ VirtIOPMEMClass *vpc = VIRTIO_PMEM_GET_CLASS(pmem);
+
+ return vpc->get_memory_region(pmem, errp);
+}
+
+static uint64_t virtio_pmem_pci_get_plugged_size(const MemoryDeviceState *md,
+ Error **errp)
+{
+ VirtIOPMEMPCI *pci_pmem = VIRTIO_PMEM_PCI(md);
+ VirtIOPMEM *pmem = VIRTIO_PMEM(&pci_pmem->vdev);
+ VirtIOPMEMClass *vpc = VIRTIO_PMEM_GET_CLASS(pmem);
+ MemoryRegion *mr = vpc->get_memory_region(pmem, errp);
+
+ /* the plugged size corresponds to the region size */
+ return mr ? memory_region_size(mr) : 0;
+}
+
+static void virtio_pmem_pci_fill_device_info(const MemoryDeviceState *md,
+ MemoryDeviceInfo *info)
+{
+ VirtioPMEMDeviceInfo *vi = g_new0(VirtioPMEMDeviceInfo, 1);
+ VirtIOPMEMPCI *pci_pmem = VIRTIO_PMEM_PCI(md);
+ VirtIOPMEM *pmem = VIRTIO_PMEM(&pci_pmem->vdev);
+ VirtIOPMEMClass *vpc = VIRTIO_PMEM_GET_CLASS(pmem);
+ DeviceState *dev = DEVICE(md);
+
+ if (dev->id) {
+ vi->has_id = true;
+ vi->id = g_strdup(dev->id);
+ }
+
+ /* let the real device handle everything else */
+ vpc->fill_device_info(pmem, vi);
+
+ info->u.virtio_pmem.data = vi;
+ info->type = MEMORY_DEVICE_INFO_KIND_VIRTIO_PMEM;
+}
+
+static void virtio_pmem_pci_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+ PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+ MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(klass);
+
+ k->realize = virtio_pmem_pci_realize;
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+ pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
+ pcidev_k->class_id = PCI_CLASS_OTHERS;
+
+ mdc->get_addr = virtio_pmem_pci_get_addr;
+ mdc->set_addr = virtio_pmem_pci_set_addr;
+ mdc->get_plugged_size = virtio_pmem_pci_get_plugged_size;
+ mdc->get_memory_region = virtio_pmem_pci_get_memory_region;
+ mdc->fill_device_info = virtio_pmem_pci_fill_device_info;
+}
+
+static void virtio_pmem_pci_instance_init(Object *obj)
+{
+ VirtIOPMEMPCI *dev = VIRTIO_PMEM_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VIRTIO_PMEM);
+}
+
+static const VirtioPCIDeviceTypeInfo virtio_pmem_pci_info = {
+ .base_name = TYPE_VIRTIO_PMEM_PCI,
+ .generic_name = "virtio-pmem-pci",
+ .instance_size = sizeof(VirtIOPMEMPCI),
+ .instance_init = virtio_pmem_pci_instance_init,
+ .class_init = virtio_pmem_pci_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { TYPE_MEMORY_DEVICE },
+ { }
+ },
+};
+
+static void virtio_pmem_pci_register_types(void)
+{
+ virtio_pci_types_register(&virtio_pmem_pci_info);
+}
+type_init(virtio_pmem_pci_register_types)
diff --git a/hw/virtio/virtio-pmem-pci.h b/hw/virtio/virtio-pmem-pci.h
new file mode 100644
index 00000000..63cfe727
--- /dev/null
+++ b/hw/virtio/virtio-pmem-pci.h
@@ -0,0 +1,35 @@
+/*
+ * Virtio PMEM PCI device
+ *
+ * Copyright (C) 2018-2019 Red Hat, Inc.
+ *
+ * Authors:
+ * Pankaj Gupta <pagupta@redhat.com>
+ * David Hildenbrand <david@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_VIRTIO_PMEM_PCI_H
+#define QEMU_VIRTIO_PMEM_PCI_H
+
+#include "hw/virtio/virtio-pci.h"
+#include "hw/virtio/virtio-pmem.h"
+#include "qom/object.h"
+
+typedef struct VirtIOPMEMPCI VirtIOPMEMPCI;
+
+/*
+ * virtio-pmem-pci: This extends VirtioPCIProxy.
+ */
+#define TYPE_VIRTIO_PMEM_PCI "virtio-pmem-pci-base"
+DECLARE_INSTANCE_CHECKER(VirtIOPMEMPCI, VIRTIO_PMEM_PCI,
+ TYPE_VIRTIO_PMEM_PCI)
+
+struct VirtIOPMEMPCI {
+ VirtIOPCIProxy parent_obj;
+ VirtIOPMEM vdev;
+};
+
+#endif /* QEMU_VIRTIO_PMEM_PCI_H */
diff --git a/hw/virtio/virtio-pmem.c b/hw/virtio/virtio-pmem.c
new file mode 100644
index 00000000..a1abfe0e
--- /dev/null
+++ b/hw/virtio/virtio-pmem.c
@@ -0,0 +1,196 @@
+/*
+ * Virtio PMEM device
+ *
+ * Copyright (C) 2018-2019 Red Hat, Inc.
+ *
+ * Authors:
+ * Pankaj Gupta <pagupta@redhat.com>
+ * David Hildenbrand <david@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/main-loop.h"
+#include "hw/virtio/virtio-pmem.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/virtio-access.h"
+#include "standard-headers/linux/virtio_ids.h"
+#include "standard-headers/linux/virtio_pmem.h"
+#include "sysemu/hostmem.h"
+#include "block/aio.h"
+#include "block/thread-pool.h"
+#include "trace.h"
+
+typedef struct VirtIODeviceRequest {
+ VirtQueueElement elem;
+ int fd;
+ VirtIOPMEM *pmem;
+ VirtIODevice *vdev;
+ struct virtio_pmem_req req;
+ struct virtio_pmem_resp resp;
+} VirtIODeviceRequest;
+
+static int worker_cb(void *opaque)
+{
+ VirtIODeviceRequest *req_data = opaque;
+ int err = 0;
+
+ /* flush raw backing image */
+ err = fsync(req_data->fd);
+ trace_virtio_pmem_flush_done(err);
+ if (err != 0) {
+ err = 1;
+ }
+
+ virtio_stl_p(req_data->vdev, &req_data->resp.ret, err);
+
+ return 0;
+}
+
+static void done_cb(void *opaque, int ret)
+{
+ VirtIODeviceRequest *req_data = opaque;
+ int len = iov_from_buf(req_data->elem.in_sg, req_data->elem.in_num, 0,
+ &req_data->resp, sizeof(struct virtio_pmem_resp));
+
+ /* Callbacks are serialized, so no need to use atomic ops. */
+ virtqueue_push(req_data->pmem->rq_vq, &req_data->elem, len);
+ virtio_notify((VirtIODevice *)req_data->pmem, req_data->pmem->rq_vq);
+ trace_virtio_pmem_response();
+ g_free(req_data);
+}
+
+static void virtio_pmem_flush(VirtIODevice *vdev, VirtQueue *vq)
+{
+ VirtIODeviceRequest *req_data;
+ VirtIOPMEM *pmem = VIRTIO_PMEM(vdev);
+ HostMemoryBackend *backend = MEMORY_BACKEND(pmem->memdev);
+ ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
+
+ trace_virtio_pmem_flush_request();
+ req_data = virtqueue_pop(vq, sizeof(VirtIODeviceRequest));
+ if (!req_data) {
+ virtio_error(vdev, "virtio-pmem missing request data");
+ return;
+ }
+
+ if (req_data->elem.out_num < 1 || req_data->elem.in_num < 1) {
+        virtio_error(vdev, "virtio-pmem request is malformed");
+ virtqueue_detach_element(vq, (VirtQueueElement *)req_data, 0);
+ g_free(req_data);
+ return;
+ }
+ req_data->fd = memory_region_get_fd(&backend->mr);
+ req_data->pmem = pmem;
+ req_data->vdev = vdev;
+ thread_pool_submit_aio(pool, worker_cb, req_data, done_cb, req_data);
+}
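+
+/*
+ * Illustrative request flow (editorial sketch, assuming the layouts in
+ * standard-headers/linux/virtio_pmem.h): the guest driver places a
+ * struct virtio_pmem_req in the out sg-list and a struct virtio_pmem_resp
+ * buffer in the in sg-list, then kicks the queue. worker_cb() fsync()s the
+ * backing file on a thread-pool thread and done_cb() copies the response
+ * back and notifies the guest, so the flush never blocks the main loop.
+ */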
+
+static void virtio_pmem_get_config(VirtIODevice *vdev, uint8_t *config)
+{
+ VirtIOPMEM *pmem = VIRTIO_PMEM(vdev);
+ struct virtio_pmem_config *pmemcfg = (struct virtio_pmem_config *) config;
+
+ virtio_stq_p(vdev, &pmemcfg->start, pmem->start);
+ virtio_stq_p(vdev, &pmemcfg->size, memory_region_size(&pmem->memdev->mr));
+}
+
+static uint64_t virtio_pmem_get_features(VirtIODevice *vdev, uint64_t features,
+ Error **errp)
+{
+ return features;
+}
+
+static void virtio_pmem_realize(DeviceState *dev, Error **errp)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VirtIOPMEM *pmem = VIRTIO_PMEM(dev);
+
+ if (!pmem->memdev) {
+ error_setg(errp, "virtio-pmem memdev not set");
+ return;
+ }
+
+ if (host_memory_backend_is_mapped(pmem->memdev)) {
+ error_setg(errp, "can't use already busy memdev: %s",
+ object_get_canonical_path_component(OBJECT(pmem->memdev)));
+ return;
+ }
+
+ host_memory_backend_set_mapped(pmem->memdev, true);
+ virtio_init(vdev, VIRTIO_ID_PMEM, sizeof(struct virtio_pmem_config));
+ pmem->rq_vq = virtio_add_queue(vdev, 128, virtio_pmem_flush);
+}
+
+static void virtio_pmem_unrealize(DeviceState *dev)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VirtIOPMEM *pmem = VIRTIO_PMEM(dev);
+
+ host_memory_backend_set_mapped(pmem->memdev, false);
+ virtio_delete_queue(pmem->rq_vq);
+ virtio_cleanup(vdev);
+}
+
+static void virtio_pmem_fill_device_info(const VirtIOPMEM *pmem,
+ VirtioPMEMDeviceInfo *vi)
+{
+ vi->memaddr = pmem->start;
+ vi->size = memory_region_size(&pmem->memdev->mr);
+ vi->memdev = object_get_canonical_path(OBJECT(pmem->memdev));
+}
+
+static MemoryRegion *virtio_pmem_get_memory_region(VirtIOPMEM *pmem,
+ Error **errp)
+{
+ if (!pmem->memdev) {
+ error_setg(errp, "'%s' property must be set", VIRTIO_PMEM_MEMDEV_PROP);
+ return NULL;
+ }
+
+ return &pmem->memdev->mr;
+}
+
+static Property virtio_pmem_properties[] = {
+ DEFINE_PROP_UINT64(VIRTIO_PMEM_ADDR_PROP, VirtIOPMEM, start, 0),
+ DEFINE_PROP_LINK(VIRTIO_PMEM_MEMDEV_PROP, VirtIOPMEM, memdev,
+ TYPE_MEMORY_BACKEND, HostMemoryBackend *),
+ DEFINE_PROP_END_OF_LIST(),
+};
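+
+/*
+ * Illustrative command line (editorial; paths and sizes are examples):
+ *
+ *   -object memory-backend-file,id=mem1,share=on,mem-path=/tmp/pmem,size=4G
+ *   -device virtio-pmem-pci,memdev=mem1
+ */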
+
+static void virtio_pmem_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+ VirtIOPMEMClass *vpc = VIRTIO_PMEM_CLASS(klass);
+
+ device_class_set_props(dc, virtio_pmem_properties);
+
+ vdc->realize = virtio_pmem_realize;
+ vdc->unrealize = virtio_pmem_unrealize;
+ vdc->get_config = virtio_pmem_get_config;
+ vdc->get_features = virtio_pmem_get_features;
+
+ vpc->fill_device_info = virtio_pmem_fill_device_info;
+ vpc->get_memory_region = virtio_pmem_get_memory_region;
+ set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+}
+
+static const TypeInfo virtio_pmem_info = {
+ .name = TYPE_VIRTIO_PMEM,
+ .parent = TYPE_VIRTIO_DEVICE,
+ .class_size = sizeof(VirtIOPMEMClass),
+ .class_init = virtio_pmem_class_init,
+ .instance_size = sizeof(VirtIOPMEM),
+};
+
+static void virtio_register_types(void)
+{
+ type_register_static(&virtio_pmem_info);
+}
+
+type_init(virtio_register_types)
diff --git a/hw/virtio/virtio-rng-pci.c b/hw/virtio/virtio-rng-pci.c
new file mode 100644
index 00000000..6e76f8b5
--- /dev/null
+++ b/hw/virtio/virtio-rng-pci.c
@@ -0,0 +1,96 @@
+/*
+ * Virtio rng PCI Bindings
+ *
+ * Copyright 2012 Red Hat, Inc.
+ * Copyright 2012 Amit Shah <amit.shah@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version. See the COPYING file in the
+ * top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/virtio/virtio-pci.h"
+#include "hw/virtio/virtio-rng.h"
+#include "hw/qdev-properties.h"
+#include "qapi/error.h"
+#include "qemu/module.h"
+#include "qom/object.h"
+
+typedef struct VirtIORngPCI VirtIORngPCI;
+
+/*
+ * virtio-rng-pci: This extends VirtioPCIProxy.
+ */
+#define TYPE_VIRTIO_RNG_PCI "virtio-rng-pci-base"
+DECLARE_INSTANCE_CHECKER(VirtIORngPCI, VIRTIO_RNG_PCI,
+ TYPE_VIRTIO_RNG_PCI)
+
+struct VirtIORngPCI {
+ VirtIOPCIProxy parent_obj;
+ VirtIORNG vdev;
+};
+
+static Property virtio_rng_properties[] = {
+ DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags,
+ VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true),
+ DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors,
+ DEV_NVECTORS_UNSPECIFIED),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void virtio_rng_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+ VirtIORngPCI *vrng = VIRTIO_RNG_PCI(vpci_dev);
+ DeviceState *vdev = DEVICE(&vrng->vdev);
+
+ if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) {
+ vpci_dev->nvectors = 2;
+ }
+
+ if (!qdev_realize(vdev, BUS(&vpci_dev->bus), errp)) {
+ return;
+ }
+}
+
+static void virtio_rng_pci_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+ PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+
+ k->realize = virtio_rng_pci_realize;
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+
+ pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+ pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_RNG;
+ pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
+ pcidev_k->class_id = PCI_CLASS_OTHERS;
+ device_class_set_props(dc, virtio_rng_properties);
+}
+
+static void virtio_rng_initfn(Object *obj)
+{
+ VirtIORngPCI *dev = VIRTIO_RNG_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VIRTIO_RNG);
+}
+
+static const VirtioPCIDeviceTypeInfo virtio_rng_pci_info = {
+ .base_name = TYPE_VIRTIO_RNG_PCI,
+ .generic_name = "virtio-rng-pci",
+ .transitional_name = "virtio-rng-pci-transitional",
+ .non_transitional_name = "virtio-rng-pci-non-transitional",
+ .instance_size = sizeof(VirtIORngPCI),
+ .instance_init = virtio_rng_initfn,
+ .class_init = virtio_rng_pci_class_init,
+};
+
+static void virtio_rng_pci_register(void)
+{
+ virtio_pci_types_register(&virtio_rng_pci_info);
+}
+
+type_init(virtio_rng_pci_register)
diff --git a/hw/virtio/virtio-rng.c b/hw/virtio/virtio-rng.c
new file mode 100644
index 00000000..7e12fc03
--- /dev/null
+++ b/hw/virtio/virtio-rng.c
@@ -0,0 +1,289 @@
+/*
+ * A virtio device implementing a hardware random number generator.
+ *
+ * Copyright 2012 Red Hat, Inc.
+ * Copyright 2012 Amit Shah <amit.shah@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version. See the COPYING file in the
+ * top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/iov.h"
+#include "qemu/module.h"
+#include "qemu/timer.h"
+#include "hw/virtio/virtio.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/virtio-rng.h"
+#include "sysemu/rng.h"
+#include "sysemu/runstate.h"
+#include "qom/object_interfaces.h"
+#include "trace.h"
+
+static bool is_guest_ready(VirtIORNG *vrng)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(vrng);
+ if (virtio_queue_ready(vrng->vq)
+ && (vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
+ return true;
+ }
+ trace_virtio_rng_guest_not_ready(vrng);
+ return false;
+}
+
+static size_t get_request_size(VirtQueue *vq, unsigned quota)
+{
+ unsigned int in, out;
+
+ virtqueue_get_avail_bytes(vq, &in, &out, quota, 0);
+ return in;
+}
+
+static void virtio_rng_process(VirtIORNG *vrng);
+
+/* Send data from a char device over to the guest */
+static void chr_read(void *opaque, const void *buf, size_t size)
+{
+ VirtIORNG *vrng = opaque;
+ VirtIODevice *vdev = VIRTIO_DEVICE(vrng);
+ VirtQueueElement *elem;
+ size_t len;
+ int offset;
+
+ if (!is_guest_ready(vrng)) {
+ return;
+ }
+
+    /*
+     * We can't modify the virtqueue until our state is fully synced.
+     */
+
+ if (!runstate_check(RUN_STATE_RUNNING)) {
+ trace_virtio_rng_cpu_is_stopped(vrng, size);
+ return;
+ }
+
+ vrng->quota_remaining -= size;
+
+ offset = 0;
+ while (offset < size) {
+ elem = virtqueue_pop(vrng->vq, sizeof(VirtQueueElement));
+ if (!elem) {
+ break;
+ }
+ trace_virtio_rng_popped(vrng);
+ len = iov_from_buf(elem->in_sg, elem->in_num,
+ 0, buf + offset, size - offset);
+ offset += len;
+
+ virtqueue_push(vrng->vq, elem, len);
+ trace_virtio_rng_pushed(vrng, len);
+ g_free(elem);
+ }
+ virtio_notify(vdev, vrng->vq);
+
+ if (!virtio_queue_empty(vrng->vq)) {
+ /* If we didn't drain the queue, call virtio_rng_process
+ * to take care of asking for more data as appropriate.
+ */
+ virtio_rng_process(vrng);
+ }
+}
+
+static void virtio_rng_process(VirtIORNG *vrng)
+{
+ size_t size;
+ unsigned quota;
+
+ if (!is_guest_ready(vrng)) {
+ return;
+ }
+
+ if (vrng->activate_timer) {
+ timer_mod(vrng->rate_limit_timer,
+ qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vrng->conf.period_ms);
+ vrng->activate_timer = false;
+ }
+
+ if (vrng->quota_remaining < 0) {
+ quota = 0;
+ } else {
+ quota = MIN((uint64_t)vrng->quota_remaining, (uint64_t)UINT32_MAX);
+ }
+ size = get_request_size(vrng->vq, quota);
+
+ trace_virtio_rng_request(vrng, size, quota);
+
+ size = MIN(vrng->quota_remaining, size);
+ if (size) {
+ rng_backend_request_entropy(vrng->rng, size, chr_read, vrng);
+ }
+}
+
+static void handle_input(VirtIODevice *vdev, VirtQueue *vq)
+{
+ VirtIORNG *vrng = VIRTIO_RNG(vdev);
+ virtio_rng_process(vrng);
+}
+
+static uint64_t get_features(VirtIODevice *vdev, uint64_t f, Error **errp)
+{
+ return f;
+}
+
+static void virtio_rng_vm_state_change(void *opaque, bool running,
+ RunState state)
+{
+ VirtIORNG *vrng = opaque;
+
+ trace_virtio_rng_vm_state_change(vrng, running, state);
+
+    /*
+     * We may have an element ready but couldn't process it due to a quota
+     * limit or because the CPU was stopped. Make sure to try again when the
+     * CPU restarts.
+     */
+
+ if (running && is_guest_ready(vrng)) {
+ virtio_rng_process(vrng);
+ }
+}
+
+static void check_rate_limit(void *opaque)
+{
+ VirtIORNG *vrng = opaque;
+
+ vrng->quota_remaining = vrng->conf.max_bytes;
+ virtio_rng_process(vrng);
+ vrng->activate_timer = true;
+}
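+
+/*
+ * Worked example (editorial): with max-bytes=1024 and period=1000,
+ * chr_read() debits quota_remaining for every byte delivered; once it is
+ * exhausted, virtio_rng_process() requests zero bytes until this timer
+ * fires after 1000 ms, restores the 1024-byte quota and retries, capping
+ * the guest at roughly 1 KB/s of entropy.
+ */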
+
+static void virtio_rng_set_status(VirtIODevice *vdev, uint8_t status)
+{
+ VirtIORNG *vrng = VIRTIO_RNG(vdev);
+
+ if (!vdev->vm_running) {
+ return;
+ }
+ vdev->status = status;
+
+ /* Something changed, try to process buffers */
+ virtio_rng_process(vrng);
+}
+
+static void virtio_rng_device_realize(DeviceState *dev, Error **errp)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VirtIORNG *vrng = VIRTIO_RNG(dev);
+
+ if (vrng->conf.period_ms <= 0) {
+ error_setg(errp, "'period' parameter expects a positive integer");
+ return;
+ }
+
+    /*
+     * Workaround: property parsing does not enforce unsigned integers,
+     * so this is a hack to reject such numbers.
+     */
+ if (vrng->conf.max_bytes > INT64_MAX) {
+ error_setg(errp, "'max-bytes' parameter must be non-negative, "
+ "and less than 2^63");
+ return;
+ }
+
+ if (vrng->conf.rng == NULL) {
+ Object *default_backend = object_new(TYPE_RNG_BUILTIN);
+
+ if (!user_creatable_complete(USER_CREATABLE(default_backend),
+ errp)) {
+ object_unref(default_backend);
+ return;
+ }
+
+ object_property_add_child(OBJECT(dev), "default-backend",
+ default_backend);
+
+ /* The child property took a reference, we can safely drop ours now */
+ object_unref(default_backend);
+
+ object_property_set_link(OBJECT(dev), "rng", default_backend,
+ &error_abort);
+ }
+
+ vrng->rng = vrng->conf.rng;
+ if (vrng->rng == NULL) {
+ error_setg(errp, "'rng' parameter expects a valid object");
+ return;
+ }
+
+ virtio_init(vdev, VIRTIO_ID_RNG, 0);
+
+ vrng->vq = virtio_add_queue(vdev, 8, handle_input);
+ vrng->quota_remaining = vrng->conf.max_bytes;
+ vrng->rate_limit_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
+ check_rate_limit, vrng);
+ vrng->activate_timer = true;
+
+ vrng->vmstate = qemu_add_vm_change_state_handler(virtio_rng_vm_state_change,
+ vrng);
+}
+
+static void virtio_rng_device_unrealize(DeviceState *dev)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VirtIORNG *vrng = VIRTIO_RNG(dev);
+
+ qemu_del_vm_change_state_handler(vrng->vmstate);
+ timer_free(vrng->rate_limit_timer);
+ virtio_del_queue(vdev, 0);
+ virtio_cleanup(vdev);
+}
+
+static const VMStateDescription vmstate_virtio_rng = {
+ .name = "virtio-rng",
+ .minimum_version_id = 1,
+ .version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_VIRTIO_DEVICE,
+ VMSTATE_END_OF_LIST()
+ },
+};
+
+static Property virtio_rng_properties[] = {
+ /* Set a default rate limit of 2^47 bytes per minute or roughly 2TB/s. If
+ * you have an entropy source capable of generating more entropy than this
+ * and you can pass it through via virtio-rng, then hats off to you. Until
+ * then, this is unlimited for all practical purposes.
+ */
+ DEFINE_PROP_UINT64("max-bytes", VirtIORNG, conf.max_bytes, INT64_MAX),
+ DEFINE_PROP_UINT32("period", VirtIORNG, conf.period_ms, 1 << 16),
+ DEFINE_PROP_LINK("rng", VirtIORNG, conf.rng, TYPE_RNG_BACKEND, RngBackend *),
+ DEFINE_PROP_END_OF_LIST(),
+};
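+
+/*
+ * Illustrative command line (editorial; the backend object is an example):
+ *
+ *   -object rng-random,id=rng0,filename=/dev/urandom
+ *   -device virtio-rng-pci,rng=rng0,max-bytes=1024,period=1000
+ */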
+
+static void virtio_rng_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+
+ device_class_set_props(dc, virtio_rng_properties);
+ dc->vmsd = &vmstate_virtio_rng;
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+ vdc->realize = virtio_rng_device_realize;
+ vdc->unrealize = virtio_rng_device_unrealize;
+ vdc->get_features = get_features;
+ vdc->set_status = virtio_rng_set_status;
+}
+
+static const TypeInfo virtio_rng_info = {
+ .name = TYPE_VIRTIO_RNG,
+ .parent = TYPE_VIRTIO_DEVICE,
+ .instance_size = sizeof(VirtIORNG),
+ .class_init = virtio_rng_class_init,
+};
+
+static void virtio_register_types(void)
+{
+ type_register_static(&virtio_rng_info);
+}
+
+type_init(virtio_register_types)
diff --git a/hw/virtio/virtio-scsi-pci.c b/hw/virtio/virtio-scsi-pci.c
new file mode 100644
index 00000000..e8e3442f
--- /dev/null
+++ b/hw/virtio/virtio-scsi-pci.c
@@ -0,0 +1,114 @@
+/*
+ * Virtio scsi PCI Bindings
+ *
+ * Copyright IBM, Corp. 2007
+ * Copyright (c) 2009 CodeSourcery
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Paul Brook <paul@codesourcery.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version. See the COPYING file in the
+ * top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/qdev-properties.h"
+#include "hw/virtio/virtio-scsi.h"
+#include "qemu/module.h"
+#include "hw/virtio/virtio-pci.h"
+#include "qom/object.h"
+
+typedef struct VirtIOSCSIPCI VirtIOSCSIPCI;
+
+/*
+ * virtio-scsi-pci: This extends VirtioPCIProxy.
+ */
+#define TYPE_VIRTIO_SCSI_PCI "virtio-scsi-pci-base"
+DECLARE_INSTANCE_CHECKER(VirtIOSCSIPCI, VIRTIO_SCSI_PCI,
+ TYPE_VIRTIO_SCSI_PCI)
+
+struct VirtIOSCSIPCI {
+ VirtIOPCIProxy parent_obj;
+ VirtIOSCSI vdev;
+};
+
+static Property virtio_scsi_pci_properties[] = {
+ DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags,
+ VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true),
+ DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors,
+ DEV_NVECTORS_UNSPECIFIED),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void virtio_scsi_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+ VirtIOSCSIPCI *dev = VIRTIO_SCSI_PCI(vpci_dev);
+ DeviceState *vdev = DEVICE(&dev->vdev);
+ DeviceState *proxy = DEVICE(vpci_dev);
+ VirtIOSCSIConf *conf = &dev->vdev.parent_obj.conf;
+ char *bus_name;
+
+ if (conf->num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) {
+ conf->num_queues =
+ virtio_pci_optimal_num_queues(VIRTIO_SCSI_VQ_NUM_FIXED);
+ }
+
+ if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) {
+ vpci_dev->nvectors = conf->num_queues + VIRTIO_SCSI_VQ_NUM_FIXED + 1;
+ }
+
+ /*
+ * For command line compatibility, this sets the virtio-scsi-device bus
+ * name as before.
+ */
+ if (proxy->id) {
+ bus_name = g_strdup_printf("%s.0", proxy->id);
+ virtio_device_set_child_bus_name(VIRTIO_DEVICE(vdev), bus_name);
+ g_free(bus_name);
+ }
+
+ qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
+}
+
+static void virtio_scsi_pci_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+ PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+
+ k->realize = virtio_scsi_pci_realize;
+ set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+ device_class_set_props(dc, virtio_scsi_pci_properties);
+ pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+ pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_SCSI;
+ pcidev_k->revision = 0x00;
+ pcidev_k->class_id = PCI_CLASS_STORAGE_SCSI;
+}
+
+static void virtio_scsi_pci_instance_init(Object *obj)
+{
+ VirtIOSCSIPCI *dev = VIRTIO_SCSI_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VIRTIO_SCSI);
+}
+
+static const VirtioPCIDeviceTypeInfo virtio_scsi_pci_info = {
+ .base_name = TYPE_VIRTIO_SCSI_PCI,
+ .generic_name = "virtio-scsi-pci",
+ .transitional_name = "virtio-scsi-pci-transitional",
+ .non_transitional_name = "virtio-scsi-pci-non-transitional",
+ .instance_size = sizeof(VirtIOSCSIPCI),
+ .instance_init = virtio_scsi_pci_instance_init,
+ .class_init = virtio_scsi_pci_class_init,
+};
+
+static void virtio_scsi_pci_register(void)
+{
+ virtio_pci_types_register(&virtio_scsi_pci_info);
+}
+
+type_init(virtio_scsi_pci_register)
diff --git a/hw/virtio/virtio-serial-pci.c b/hw/virtio/virtio-serial-pci.c
new file mode 100644
index 00000000..cea31adc
--- /dev/null
+++ b/hw/virtio/virtio-serial-pci.c
@@ -0,0 +1,117 @@
+/*
+ * Virtio serial PCI Bindings
+ *
+ * Copyright IBM, Corp. 2007
+ * Copyright (c) 2009 CodeSourcery
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Paul Brook <paul@codesourcery.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/qdev-properties.h"
+#include "hw/virtio/virtio-serial.h"
+#include "qemu/module.h"
+#include "hw/virtio/virtio-pci.h"
+#include "qom/object.h"
+
+typedef struct VirtIOSerialPCI VirtIOSerialPCI;
+
+/*
+ * virtio-serial-pci: This extends VirtioPCIProxy.
+ */
+#define TYPE_VIRTIO_SERIAL_PCI "virtio-serial-pci-base"
+DECLARE_INSTANCE_CHECKER(VirtIOSerialPCI, VIRTIO_SERIAL_PCI,
+ TYPE_VIRTIO_SERIAL_PCI)
+
+struct VirtIOSerialPCI {
+ VirtIOPCIProxy parent_obj;
+ VirtIOSerial vdev;
+};
+
+static void virtio_serial_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+ VirtIOSerialPCI *dev = VIRTIO_SERIAL_PCI(vpci_dev);
+ DeviceState *vdev = DEVICE(&dev->vdev);
+ DeviceState *proxy = DEVICE(vpci_dev);
+ char *bus_name;
+
+ if (vpci_dev->class_code != PCI_CLASS_COMMUNICATION_OTHER &&
+ vpci_dev->class_code != PCI_CLASS_DISPLAY_OTHER && /* qemu 0.10 */
+ vpci_dev->class_code != PCI_CLASS_OTHERS) { /* qemu-kvm */
+ vpci_dev->class_code = PCI_CLASS_COMMUNICATION_OTHER;
+ }
+
+    /*
+     * Backwards-compatibility with machines that were created with
+     * DEV_NVECTORS_UNSPECIFIED.
+     */
+ if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) {
+ vpci_dev->nvectors = dev->vdev.serial.max_virtserial_ports + 1;
+ }
+
+ /*
+ * For command line compatibility, this sets the virtio-serial-device bus
+ * name as before.
+ */
+ if (proxy->id) {
+ bus_name = g_strdup_printf("%s.0", proxy->id);
+ virtio_device_set_child_bus_name(VIRTIO_DEVICE(vdev), bus_name);
+ g_free(bus_name);
+ }
+
+ qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
+}
+
+static Property virtio_serial_pci_properties[] = {
+ DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags,
+ VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true),
+ DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 2),
+ DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static void virtio_serial_pci_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+ PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+ k->realize = virtio_serial_pci_realize;
+ set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
+ device_class_set_props(dc, virtio_serial_pci_properties);
+ pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+ pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_CONSOLE;
+ pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
+ pcidev_k->class_id = PCI_CLASS_COMMUNICATION_OTHER;
+}
+
+static void virtio_serial_pci_instance_init(Object *obj)
+{
+ VirtIOSerialPCI *dev = VIRTIO_SERIAL_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VIRTIO_SERIAL);
+}
+
+static const VirtioPCIDeviceTypeInfo virtio_serial_pci_info = {
+ .base_name = TYPE_VIRTIO_SERIAL_PCI,
+ .generic_name = "virtio-serial-pci",
+ .transitional_name = "virtio-serial-pci-transitional",
+ .non_transitional_name = "virtio-serial-pci-non-transitional",
+ .instance_size = sizeof(VirtIOSerialPCI),
+ .instance_init = virtio_serial_pci_instance_init,
+ .class_init = virtio_serial_pci_class_init,
+};
+
+static void virtio_serial_pci_register(void)
+{
+ virtio_pci_types_register(&virtio_serial_pci_info);
+}
+
+type_init(virtio_serial_pci_register)
diff --git a/hw/virtio/virtio-stub.c b/hw/virtio/virtio-stub.c
new file mode 100644
index 00000000..7ddb22cc
--- /dev/null
+++ b/hw/virtio/virtio-stub.c
@@ -0,0 +1,42 @@
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qapi/qapi-commands-virtio.h"
+
+static void *qmp_virtio_unsupported(Error **errp)
+{
+ error_setg(errp, "Virtio is disabled");
+ return NULL;
+}
+
+VirtioInfoList *qmp_x_query_virtio(Error **errp)
+{
+ return qmp_virtio_unsupported(errp);
+}
+
+VirtioStatus *qmp_x_query_virtio_status(const char *path, Error **errp)
+{
+ return qmp_virtio_unsupported(errp);
+}
+
+VirtVhostQueueStatus *qmp_x_query_virtio_vhost_queue_status(const char *path,
+ uint16_t queue,
+ Error **errp)
+{
+ return qmp_virtio_unsupported(errp);
+}
+
+VirtQueueStatus *qmp_x_query_virtio_queue_status(const char *path,
+ uint16_t queue,
+ Error **errp)
+{
+ return qmp_virtio_unsupported(errp);
+}
+
+VirtioQueueElement *qmp_x_query_virtio_queue_element(const char *path,
+ uint16_t queue,
+ bool has_index,
+ uint16_t index,
+ Error **errp)
+{
+ return qmp_virtio_unsupported(errp);
+}
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
new file mode 100644
index 00000000..eb6347ab
--- /dev/null
+++ b/hw/virtio/virtio.c
@@ -0,0 +1,4991 @@
+/*
+ * Virtio Support
+ *
+ * Copyright IBM, Corp. 2007
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qapi-commands-virtio.h"
+#include "qapi/qapi-commands-qom.h"
+#include "qapi/qapi-visit-virtio.h"
+#include "qapi/qmp/qjson.h"
+#include "cpu.h"
+#include "trace.h"
+#include "qemu/error-report.h"
+#include "qemu/log.h"
+#include "qemu/main-loop.h"
+#include "qemu/module.h"
+#include "qom/object_interfaces.h"
+#include "hw/virtio/virtio.h"
+#include "migration/qemu-file-types.h"
+#include "qemu/atomic.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/virtio-access.h"
+#include "sysemu/dma.h"
+#include "sysemu/runstate.h"
+#include "standard-headers/linux/virtio_ids.h"
+#include "standard-headers/linux/vhost_types.h"
+#include "standard-headers/linux/virtio_blk.h"
+#include "standard-headers/linux/virtio_console.h"
+#include "standard-headers/linux/virtio_gpu.h"
+#include "standard-headers/linux/virtio_net.h"
+#include "standard-headers/linux/virtio_scsi.h"
+#include "standard-headers/linux/virtio_i2c.h"
+#include "standard-headers/linux/virtio_balloon.h"
+#include "standard-headers/linux/virtio_iommu.h"
+#include "standard-headers/linux/virtio_mem.h"
+#include "standard-headers/linux/virtio_vsock.h"
+#include CONFIG_DEVICES
+
+/* QAPI list of realized VirtIODevices */
+static QTAILQ_HEAD(, VirtIODevice) virtio_list;
+
+/*
+ * Maximum size of virtio device config space
+ */
+#define VHOST_USER_MAX_CONFIG_SIZE 256
+
+#define FEATURE_ENTRY(name, desc) (qmp_virtio_feature_map_t) \
+ { .virtio_bit = name, .feature_desc = desc }
+
+enum VhostUserProtocolFeature {
+ VHOST_USER_PROTOCOL_F_MQ = 0,
+ VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1,
+ VHOST_USER_PROTOCOL_F_RARP = 2,
+ VHOST_USER_PROTOCOL_F_REPLY_ACK = 3,
+ VHOST_USER_PROTOCOL_F_NET_MTU = 4,
+ VHOST_USER_PROTOCOL_F_SLAVE_REQ = 5,
+ VHOST_USER_PROTOCOL_F_CROSS_ENDIAN = 6,
+ VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7,
+ VHOST_USER_PROTOCOL_F_PAGEFAULT = 8,
+ VHOST_USER_PROTOCOL_F_CONFIG = 9,
+ VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
+ VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
+ VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12,
+ VHOST_USER_PROTOCOL_F_RESET_DEVICE = 13,
+ VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS = 14,
+ VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15,
+ VHOST_USER_PROTOCOL_F_MAX
+};
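+
+/*
+ * Editorial sketch (hypothetical helper, not part of this patch): protocol
+ * features are communicated as a bitmask, so a map entry below applies when
+ * the corresponding bit is set:
+ */
+#if 0 /* example only */
+static bool vhost_user_has_protocol_feature(uint64_t features, unsigned bit)
+{
+    return features & (1ULL << bit);
+}
+/* e.g. vhost_user_has_protocol_feature(f, VHOST_USER_PROTOCOL_F_MQ) */
+#endif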
+
+/* Virtio transport features mapping */
+static qmp_virtio_feature_map_t virtio_transport_map[] = {
+ /* Virtio device transport features */
+#ifndef VIRTIO_CONFIG_NO_LEGACY
+ FEATURE_ENTRY(VIRTIO_F_NOTIFY_ON_EMPTY, \
+ "VIRTIO_F_NOTIFY_ON_EMPTY: Notify when device runs out of avail. "
+ "descs. on VQ"),
+ FEATURE_ENTRY(VIRTIO_F_ANY_LAYOUT, \
+ "VIRTIO_F_ANY_LAYOUT: Device accepts arbitrary desc. layouts"),
+#endif /* !VIRTIO_CONFIG_NO_LEGACY */
+ FEATURE_ENTRY(VIRTIO_F_VERSION_1, \
+        "VIRTIO_F_VERSION_1: Device compliant with v1 (non-legacy) spec"),
+ FEATURE_ENTRY(VIRTIO_F_IOMMU_PLATFORM, \
+ "VIRTIO_F_IOMMU_PLATFORM: Device can be used on IOMMU platform"),
+ FEATURE_ENTRY(VIRTIO_F_RING_PACKED, \
+ "VIRTIO_F_RING_PACKED: Device supports packed VQ layout"),
+ FEATURE_ENTRY(VIRTIO_F_IN_ORDER, \
+ "VIRTIO_F_IN_ORDER: Device uses buffers in same order as made "
+ "available by driver"),
+ FEATURE_ENTRY(VIRTIO_F_ORDER_PLATFORM, \
+ "VIRTIO_F_ORDER_PLATFORM: Memory accesses ordered by platform"),
+ FEATURE_ENTRY(VIRTIO_F_SR_IOV, \
+ "VIRTIO_F_SR_IOV: Device supports single root I/O virtualization"),
+ /* Virtio ring transport features */
+ FEATURE_ENTRY(VIRTIO_RING_F_INDIRECT_DESC, \
+ "VIRTIO_RING_F_INDIRECT_DESC: Indirect descriptors supported"),
+ FEATURE_ENTRY(VIRTIO_RING_F_EVENT_IDX, \
+ "VIRTIO_RING_F_EVENT_IDX: Used & avail. event fields enabled"),
+ { -1, "" }
+};
+
+/* Vhost-user protocol features mapping */
+static qmp_virtio_feature_map_t vhost_user_protocol_map[] = {
+ FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_MQ, \
+ "VHOST_USER_PROTOCOL_F_MQ: Multiqueue protocol supported"),
+ FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_LOG_SHMFD, \
+ "VHOST_USER_PROTOCOL_F_LOG_SHMFD: Shared log memory fd supported"),
+ FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_RARP, \
+ "VHOST_USER_PROTOCOL_F_RARP: Vhost-user back-end RARP broadcasting "
+ "supported"),
+ FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_REPLY_ACK, \
+ "VHOST_USER_PROTOCOL_F_REPLY_ACK: Requested operation status ack. "
+ "supported"),
+ FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_NET_MTU, \
+ "VHOST_USER_PROTOCOL_F_NET_MTU: Expose host MTU to guest supported"),
+ FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_SLAVE_REQ, \
+ "VHOST_USER_PROTOCOL_F_SLAVE_REQ: Socket fd for back-end initiated "
+ "requests supported"),
+ FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_CROSS_ENDIAN, \
+ "VHOST_USER_PROTOCOL_F_CROSS_ENDIAN: Endianness of VQs for legacy "
+ "devices supported"),
+ FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_CRYPTO_SESSION, \
+ "VHOST_USER_PROTOCOL_F_CRYPTO_SESSION: Session creation for crypto "
+ "operations supported"),
+ FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_PAGEFAULT, \
+ "VHOST_USER_PROTOCOL_F_PAGEFAULT: Request servicing on userfaultfd "
+ "for accessed pages supported"),
+ FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_CONFIG, \
+ "VHOST_USER_PROTOCOL_F_CONFIG: Vhost-user messaging for virtio "
+ "device configuration space supported"),
+ FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD, \
+ "VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD: Slave fd communication "
+ "channel supported"),
+ FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_HOST_NOTIFIER, \
+ "VHOST_USER_PROTOCOL_F_HOST_NOTIFIER: Host notifiers for specified "
+ "VQs supported"),
+ FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD, \
+ "VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD: Shared inflight I/O buffers "
+ "supported"),
+ FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_RESET_DEVICE, \
+ "VHOST_USER_PROTOCOL_F_RESET_DEVICE: Disabling all rings and "
+ "resetting internal device state supported"),
+ FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS, \
+ "VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS: In-band messaging "
+ "supported"),
+ FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS, \
+ "VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS: Configuration for "
+ "memory slots supported"),
+ { -1, "" }
+};
+
+/* virtio device configuration statuses */
+static qmp_virtio_feature_map_t virtio_config_status_map[] = {
+ FEATURE_ENTRY(VIRTIO_CONFIG_S_DRIVER_OK, \
+ "VIRTIO_CONFIG_S_DRIVER_OK: Driver setup and ready"),
+ FEATURE_ENTRY(VIRTIO_CONFIG_S_FEATURES_OK, \
+ "VIRTIO_CONFIG_S_FEATURES_OK: Feature negotiation complete"),
+ FEATURE_ENTRY(VIRTIO_CONFIG_S_DRIVER, \
+ "VIRTIO_CONFIG_S_DRIVER: Guest OS compatible with device"),
+ FEATURE_ENTRY(VIRTIO_CONFIG_S_NEEDS_RESET, \
+ "VIRTIO_CONFIG_S_NEEDS_RESET: Irrecoverable error, device needs "
+ "reset"),
+ FEATURE_ENTRY(VIRTIO_CONFIG_S_FAILED, \
+ "VIRTIO_CONFIG_S_FAILED: Error in guest, device failed"),
+ FEATURE_ENTRY(VIRTIO_CONFIG_S_ACKNOWLEDGE, \
+ "VIRTIO_CONFIG_S_ACKNOWLEDGE: Valid virtio device found"),
+ { -1, "" }
+};
+
+/* virtio-blk features mapping */
+qmp_virtio_feature_map_t virtio_blk_feature_map[] = {
+ FEATURE_ENTRY(VIRTIO_BLK_F_SIZE_MAX, \
+ "VIRTIO_BLK_F_SIZE_MAX: Max segment size is size_max"),
+ FEATURE_ENTRY(VIRTIO_BLK_F_SEG_MAX, \
+ "VIRTIO_BLK_F_SEG_MAX: Max segments in a request is seg_max"),
+ FEATURE_ENTRY(VIRTIO_BLK_F_GEOMETRY, \
+ "VIRTIO_BLK_F_GEOMETRY: Legacy geometry available"),
+ FEATURE_ENTRY(VIRTIO_BLK_F_RO, \
+ "VIRTIO_BLK_F_RO: Device is read-only"),
+ FEATURE_ENTRY(VIRTIO_BLK_F_BLK_SIZE, \
+ "VIRTIO_BLK_F_BLK_SIZE: Block size of disk available"),
+ FEATURE_ENTRY(VIRTIO_BLK_F_TOPOLOGY, \
+ "VIRTIO_BLK_F_TOPOLOGY: Topology information available"),
+ FEATURE_ENTRY(VIRTIO_BLK_F_MQ, \
+ "VIRTIO_BLK_F_MQ: Multiqueue supported"),
+ FEATURE_ENTRY(VIRTIO_BLK_F_DISCARD, \
+ "VIRTIO_BLK_F_DISCARD: Discard command supported"),
+ FEATURE_ENTRY(VIRTIO_BLK_F_WRITE_ZEROES, \
+ "VIRTIO_BLK_F_WRITE_ZEROES: Write zeroes command supported"),
+#ifndef VIRTIO_BLK_NO_LEGACY
+ FEATURE_ENTRY(VIRTIO_BLK_F_BARRIER, \
+ "VIRTIO_BLK_F_BARRIER: Request barriers supported"),
+ FEATURE_ENTRY(VIRTIO_BLK_F_SCSI, \
+ "VIRTIO_BLK_F_SCSI: SCSI packet commands supported"),
+ FEATURE_ENTRY(VIRTIO_BLK_F_FLUSH, \
+ "VIRTIO_BLK_F_FLUSH: Flush command supported"),
+ FEATURE_ENTRY(VIRTIO_BLK_F_CONFIG_WCE, \
+ "VIRTIO_BLK_F_CONFIG_WCE: Cache writeback and writethrough modes "
+ "supported"),
+#endif /* !VIRTIO_BLK_NO_LEGACY */
+ FEATURE_ENTRY(VHOST_F_LOG_ALL, \
+ "VHOST_F_LOG_ALL: Logging write descriptors supported"),
+ FEATURE_ENTRY(VHOST_USER_F_PROTOCOL_FEATURES, \
+ "VHOST_USER_F_PROTOCOL_FEATURES: Vhost-user protocol features "
+ "negotiation supported"),
+ { -1, "" }
+};
+
+/* virtio-serial features mapping */
+qmp_virtio_feature_map_t virtio_serial_feature_map[] = {
+ FEATURE_ENTRY(VIRTIO_CONSOLE_F_SIZE, \
+ "VIRTIO_CONSOLE_F_SIZE: Host providing console size"),
+ FEATURE_ENTRY(VIRTIO_CONSOLE_F_MULTIPORT, \
+ "VIRTIO_CONSOLE_F_MULTIPORT: Multiple ports for device supported"),
+ FEATURE_ENTRY(VIRTIO_CONSOLE_F_EMERG_WRITE, \
+ "VIRTIO_CONSOLE_F_EMERG_WRITE: Emergency write supported"),
+ { -1, "" }
+};
+
+/* virtio-gpu features mapping */
+qmp_virtio_feature_map_t virtio_gpu_feature_map[] = {
+ FEATURE_ENTRY(VIRTIO_GPU_F_VIRGL, \
+ "VIRTIO_GPU_F_VIRGL: Virgl 3D mode supported"),
+ FEATURE_ENTRY(VIRTIO_GPU_F_EDID, \
+ "VIRTIO_GPU_F_EDID: EDID metadata supported"),
+ FEATURE_ENTRY(VIRTIO_GPU_F_RESOURCE_UUID, \
+ "VIRTIO_GPU_F_RESOURCE_UUID: Resource UUID assigning supported"),
+ FEATURE_ENTRY(VIRTIO_GPU_F_RESOURCE_BLOB, \
+ "VIRTIO_GPU_F_RESOURCE_BLOB: Size-based blob resources supported"),
+ FEATURE_ENTRY(VIRTIO_GPU_F_CONTEXT_INIT, \
+ "VIRTIO_GPU_F_CONTEXT_INIT: Context types and synchronization "
+ "timelines supported"),
+ FEATURE_ENTRY(VHOST_F_LOG_ALL, \
+ "VHOST_F_LOG_ALL: Logging write descriptors supported"),
+ FEATURE_ENTRY(VHOST_USER_F_PROTOCOL_FEATURES, \
+ "VHOST_USER_F_PROTOCOL_FEATURES: Vhost-user protocol features "
+ "negotiation supported"),
+ { -1, "" }
+};
+
+/* virtio-input features mapping */
+qmp_virtio_feature_map_t virtio_input_feature_map[] = {
+ FEATURE_ENTRY(VHOST_F_LOG_ALL, \
+ "VHOST_F_LOG_ALL: Logging write descriptors supported"),
+ FEATURE_ENTRY(VHOST_USER_F_PROTOCOL_FEATURES, \
+ "VHOST_USER_F_PROTOCOL_FEATURES: Vhost-user protocol features "
+ "negotiation supported"),
+ { -1, "" }
+};
+
+/* virtio-net features mapping */
+qmp_virtio_feature_map_t virtio_net_feature_map[] = {
+ FEATURE_ENTRY(VIRTIO_NET_F_CSUM, \
+ "VIRTIO_NET_F_CSUM: Device handling packets with partial checksum "
+ "supported"),
+ FEATURE_ENTRY(VIRTIO_NET_F_GUEST_CSUM, \
+ "VIRTIO_NET_F_GUEST_CSUM: Driver handling packets with partial "
+ "checksum supported"),
+ FEATURE_ENTRY(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
+ "VIRTIO_NET_F_CTRL_GUEST_OFFLOADS: Control channel offloading "
+ "reconfig. supported"),
+ FEATURE_ENTRY(VIRTIO_NET_F_MTU, \
+ "VIRTIO_NET_F_MTU: Device max MTU reporting supported"),
+ FEATURE_ENTRY(VIRTIO_NET_F_MAC, \
+ "VIRTIO_NET_F_MAC: Device has given MAC address"),
+ FEATURE_ENTRY(VIRTIO_NET_F_GUEST_TSO4, \
+ "VIRTIO_NET_F_GUEST_TSO4: Driver can receive TSOv4"),
+ FEATURE_ENTRY(VIRTIO_NET_F_GUEST_TSO6, \
+ "VIRTIO_NET_F_GUEST_TSO6: Driver can receive TSOv6"),
+ FEATURE_ENTRY(VIRTIO_NET_F_GUEST_ECN, \
+ "VIRTIO_NET_F_GUEST_ECN: Driver can receive TSO with ECN"),
+ FEATURE_ENTRY(VIRTIO_NET_F_GUEST_UFO, \
+ "VIRTIO_NET_F_GUEST_UFO: Driver can receive UFO"),
+ FEATURE_ENTRY(VIRTIO_NET_F_HOST_TSO4, \
+ "VIRTIO_NET_F_HOST_TSO4: Device can receive TSOv4"),
+ FEATURE_ENTRY(VIRTIO_NET_F_HOST_TSO6, \
+ "VIRTIO_NET_F_HOST_TSO6: Device can receive TSOv6"),
+ FEATURE_ENTRY(VIRTIO_NET_F_HOST_ECN, \
+ "VIRTIO_NET_F_HOST_ECN: Device can receive TSO with ECN"),
+ FEATURE_ENTRY(VIRTIO_NET_F_HOST_UFO, \
+ "VIRTIO_NET_F_HOST_UFO: Device can receive UFO"),
+ FEATURE_ENTRY(VIRTIO_NET_F_MRG_RXBUF, \
+ "VIRTIO_NET_F_MRG_RXBUF: Driver can merge receive buffers"),
+ FEATURE_ENTRY(VIRTIO_NET_F_STATUS, \
+ "VIRTIO_NET_F_STATUS: Configuration status field available"),
+ FEATURE_ENTRY(VIRTIO_NET_F_CTRL_VQ, \
+ "VIRTIO_NET_F_CTRL_VQ: Control channel available"),
+ FEATURE_ENTRY(VIRTIO_NET_F_CTRL_RX, \
+ "VIRTIO_NET_F_CTRL_RX: Control channel RX mode supported"),
+ FEATURE_ENTRY(VIRTIO_NET_F_CTRL_VLAN, \
+ "VIRTIO_NET_F_CTRL_VLAN: Control channel VLAN filtering supported"),
+ FEATURE_ENTRY(VIRTIO_NET_F_CTRL_RX_EXTRA, \
+ "VIRTIO_NET_F_CTRL_RX_EXTRA: Extra RX mode control supported"),
+ FEATURE_ENTRY(VIRTIO_NET_F_GUEST_ANNOUNCE, \
+ "VIRTIO_NET_F_GUEST_ANNOUNCE: Driver sending gratuitous packets "
+ "supported"),
+ FEATURE_ENTRY(VIRTIO_NET_F_MQ, \
+ "VIRTIO_NET_F_MQ: Multiqueue with automatic receive steering "
+ "supported"),
+ FEATURE_ENTRY(VIRTIO_NET_F_CTRL_MAC_ADDR, \
+ "VIRTIO_NET_F_CTRL_MAC_ADDR: MAC address set through control "
+ "channel"),
+ FEATURE_ENTRY(VIRTIO_NET_F_HASH_REPORT, \
+ "VIRTIO_NET_F_HASH_REPORT: Hash reporting supported"),
+ FEATURE_ENTRY(VIRTIO_NET_F_RSS, \
+ "VIRTIO_NET_F_RSS: RSS RX steering supported"),
+ FEATURE_ENTRY(VIRTIO_NET_F_RSC_EXT, \
+ "VIRTIO_NET_F_RSC_EXT: Extended coalescing info supported"),
+ FEATURE_ENTRY(VIRTIO_NET_F_STANDBY, \
+ "VIRTIO_NET_F_STANDBY: Device acting as standby for primary "
+ "device with same MAC addr. supported"),
+ FEATURE_ENTRY(VIRTIO_NET_F_SPEED_DUPLEX, \
+ "VIRTIO_NET_F_SPEED_DUPLEX: Device set linkspeed and duplex"),
+#ifndef VIRTIO_NET_NO_LEGACY
+ FEATURE_ENTRY(VIRTIO_NET_F_GSO, \
+ "VIRTIO_NET_F_GSO: Handling GSO-type packets supported"),
+#endif /* !VIRTIO_NET_NO_LEGACY */
+ FEATURE_ENTRY(VHOST_NET_F_VIRTIO_NET_HDR, \
+ "VHOST_NET_F_VIRTIO_NET_HDR: Virtio-net headers for RX and TX "
+ "packets supported"),
+ FEATURE_ENTRY(VHOST_F_LOG_ALL, \
+ "VHOST_F_LOG_ALL: Logging write descriptors supported"),
+ FEATURE_ENTRY(VHOST_USER_F_PROTOCOL_FEATURES, \
+ "VHOST_USER_F_PROTOCOL_FEATURES: Vhost-user protocol features "
+ "negotiation supported"),
+ { -1, "" }
+};
+
+/* virtio-scsi features mapping */
+qmp_virtio_feature_map_t virtio_scsi_feature_map[] = {
+ FEATURE_ENTRY(VIRTIO_SCSI_F_INOUT, \
+ "VIRTIO_SCSI_F_INOUT: Requests including read and writable data "
+        "buffers supported"),
+ FEATURE_ENTRY(VIRTIO_SCSI_F_HOTPLUG, \
+ "VIRTIO_SCSI_F_HOTPLUG: Reporting and handling hot-plug events "
+ "supported"),
+ FEATURE_ENTRY(VIRTIO_SCSI_F_CHANGE, \
+ "VIRTIO_SCSI_F_CHANGE: Reporting and handling LUN changes "
+ "supported"),
+ FEATURE_ENTRY(VIRTIO_SCSI_F_T10_PI, \
+ "VIRTIO_SCSI_F_T10_PI: T10 info included in request header"),
+ FEATURE_ENTRY(VHOST_F_LOG_ALL, \
+ "VHOST_F_LOG_ALL: Logging write descriptors supported"),
+ FEATURE_ENTRY(VHOST_USER_F_PROTOCOL_FEATURES, \
+ "VHOST_USER_F_PROTOCOL_FEATURES: Vhost-user protocol features "
+ "negotiation supported"),
+ { -1, "" }
+};
+
+/* virtio/vhost-user-fs features mapping */
+qmp_virtio_feature_map_t virtio_fs_feature_map[] = {
+ FEATURE_ENTRY(VHOST_F_LOG_ALL, \
+ "VHOST_F_LOG_ALL: Logging write descriptors supported"),
+ FEATURE_ENTRY(VHOST_USER_F_PROTOCOL_FEATURES, \
+ "VHOST_USER_F_PROTOCOL_FEATURES: Vhost-user protocol features "
+ "negotiation supported"),
+ { -1, "" }
+};
+
+/* virtio/vhost-user-i2c features mapping */
+qmp_virtio_feature_map_t virtio_i2c_feature_map[] = {
+ FEATURE_ENTRY(VIRTIO_I2C_F_ZERO_LENGTH_REQUEST, \
+        "VIRTIO_I2C_F_ZERO_LENGTH_REQUEST: Zero length requests supported"),
+ FEATURE_ENTRY(VHOST_F_LOG_ALL, \
+ "VHOST_F_LOG_ALL: Logging write descriptors supported"),
+ FEATURE_ENTRY(VHOST_USER_F_PROTOCOL_FEATURES, \
+ "VHOST_USER_F_PROTOCOL_FEATURES: Vhost-user protocol features "
+ "negotiation supported"),
+ { -1, "" }
+};
+
+/* virtio/vhost-vsock features mapping */
+qmp_virtio_feature_map_t virtio_vsock_feature_map[] = {
+ FEATURE_ENTRY(VIRTIO_VSOCK_F_SEQPACKET, \
+ "VIRTIO_VSOCK_F_SEQPACKET: SOCK_SEQPACKET supported"),
+ FEATURE_ENTRY(VHOST_F_LOG_ALL, \
+ "VHOST_F_LOG_ALL: Logging write descriptors supported"),
+ FEATURE_ENTRY(VHOST_USER_F_PROTOCOL_FEATURES, \
+ "VHOST_USER_F_PROTOCOL_FEATURES: Vhost-user protocol features "
+ "negotiation supported"),
+ { -1, "" }
+};
+
+/* virtio-balloon features mapping */
+qmp_virtio_feature_map_t virtio_balloon_feature_map[] = {
+ FEATURE_ENTRY(VIRTIO_BALLOON_F_MUST_TELL_HOST, \
+ "VIRTIO_BALLOON_F_MUST_TELL_HOST: Tell host before reclaiming "
+ "pages"),
+ FEATURE_ENTRY(VIRTIO_BALLOON_F_STATS_VQ, \
+ "VIRTIO_BALLOON_F_STATS_VQ: Guest memory stats VQ available"),
+ FEATURE_ENTRY(VIRTIO_BALLOON_F_DEFLATE_ON_OOM, \
+ "VIRTIO_BALLOON_F_DEFLATE_ON_OOM: Deflate balloon when guest OOM"),
+ FEATURE_ENTRY(VIRTIO_BALLOON_F_FREE_PAGE_HINT, \
+ "VIRTIO_BALLOON_F_FREE_PAGE_HINT: VQ reporting free pages enabled"),
+ FEATURE_ENTRY(VIRTIO_BALLOON_F_PAGE_POISON, \
+ "VIRTIO_BALLOON_F_PAGE_POISON: Guest page poisoning enabled"),
+ FEATURE_ENTRY(VIRTIO_BALLOON_F_REPORTING, \
+ "VIRTIO_BALLOON_F_REPORTING: Page reporting VQ enabled"),
+ { -1, "" }
+};
+
+/* virtio-crypto features mapping */
+qmp_virtio_feature_map_t virtio_crypto_feature_map[] = {
+ FEATURE_ENTRY(VHOST_F_LOG_ALL, \
+ "VHOST_F_LOG_ALL: Logging write descriptors supported"),
+ { -1, "" }
+};
+
+/* virtio-iommu features mapping */
+qmp_virtio_feature_map_t virtio_iommu_feature_map[] = {
+ FEATURE_ENTRY(VIRTIO_IOMMU_F_INPUT_RANGE, \
+        "VIRTIO_IOMMU_F_INPUT_RANGE: Range of supported virtual addresses "
+        "available"),
+ FEATURE_ENTRY(VIRTIO_IOMMU_F_DOMAIN_RANGE, \
+ "VIRTIO_IOMMU_F_DOMAIN_RANGE: Number of supported domains "
+ "available"),
+ FEATURE_ENTRY(VIRTIO_IOMMU_F_MAP_UNMAP, \
+ "VIRTIO_IOMMU_F_MAP_UNMAP: Map and unmap requests available"),
+ FEATURE_ENTRY(VIRTIO_IOMMU_F_BYPASS, \
+ "VIRTIO_IOMMU_F_BYPASS: Endpoints not attached to domains are in "
+ "bypass mode"),
+ FEATURE_ENTRY(VIRTIO_IOMMU_F_PROBE, \
+ "VIRTIO_IOMMU_F_PROBE: Probe requests available"),
+ FEATURE_ENTRY(VIRTIO_IOMMU_F_MMIO, \
+ "VIRTIO_IOMMU_F_MMIO: VIRTIO_IOMMU_MAP_F_MMIO flag available"),
+ FEATURE_ENTRY(VIRTIO_IOMMU_F_BYPASS_CONFIG, \
+ "VIRTIO_IOMMU_F_BYPASS_CONFIG: Bypass field of IOMMU config "
+ "available"),
+ { -1, "" }
+};
+
+/* virtio-mem features mapping */
+qmp_virtio_feature_map_t virtio_mem_feature_map[] = {
+#ifndef CONFIG_ACPI
+ FEATURE_ENTRY(VIRTIO_MEM_F_ACPI_PXM, \
+ "VIRTIO_MEM_F_ACPI_PXM: node_id is an ACPI PXM and is valid"),
+#endif /* !CONFIG_ACPI */
+ FEATURE_ENTRY(VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE, \
+ "VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE: Unplugged memory cannot be "
+ "accessed"),
+ { -1, "" }
+};
+
+/* virtio-rng features mapping */
+qmp_virtio_feature_map_t virtio_rng_feature_map[] = {
+ FEATURE_ENTRY(VHOST_F_LOG_ALL, \
+ "VHOST_F_LOG_ALL: Logging write descriptors supported"),
+ FEATURE_ENTRY(VHOST_USER_F_PROTOCOL_FEATURES, \
+ "VHOST_USER_F_PROTOCOL_FEATURES: Vhost-user protocol features "
+ "negotiation supported"),
+ { -1, "" }
+};
+
+/*
+ * The alignment to use between consumer and producer parts of vring.
+ * This matches the x86 page size and is the default used by transports
+ * like PCI, which don't provide a means for the guest to tell the host
+ * the alignment.
+ */
+#define VIRTIO_PCI_VRING_ALIGN 4096
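+
+/*
+ * Worked example (editorial): for a split ring with num = 256, the
+ * descriptor table occupies 256 * 16 = 4096 bytes, followed by the avail
+ * ring of 4 + 256 * 2 = 516 bytes (518 with the event index). With the
+ * 4096-byte alignment above, the used ring then starts at the next page
+ * boundary (offset 8192), keeping the driver- and device-written parts on
+ * separate pages.
+ */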
+
+typedef struct VRingDesc
+{
+ uint64_t addr;
+ uint32_t len;
+ uint16_t flags;
+ uint16_t next;
+} VRingDesc;
+
+typedef struct VRingPackedDesc {
+ uint64_t addr;
+ uint32_t len;
+ uint16_t id;
+ uint16_t flags;
+} VRingPackedDesc;
+
+typedef struct VRingAvail
+{
+ uint16_t flags;
+ uint16_t idx;
+ uint16_t ring[];
+} VRingAvail;
+
+typedef struct VRingUsedElem
+{
+ uint32_t id;
+ uint32_t len;
+} VRingUsedElem;
+
+typedef struct VRingUsed
+{
+ uint16_t flags;
+ uint16_t idx;
+ VRingUsedElem ring[];
+} VRingUsed;
+
+typedef struct VRingMemoryRegionCaches {
+ struct rcu_head rcu;
+ MemoryRegionCache desc;
+ MemoryRegionCache avail;
+ MemoryRegionCache used;
+} VRingMemoryRegionCaches;
+
+typedef struct VRing
+{
+ unsigned int num;
+ unsigned int num_default;
+ unsigned int align;
+ hwaddr desc;
+ hwaddr avail;
+ hwaddr used;
+ VRingMemoryRegionCaches *caches;
+} VRing;
+
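+/*
+ * Driver/device event suppression area of a packed ring. The low 15 bits
+ * of off_wrap hold the ring offset at which an event is wanted; bit 15
+ * carries the wrap counter (see the shift by 15 in
+ * virtio_queue_packed_set_notification() below).
+ */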
+typedef struct VRingPackedDescEvent {
+ uint16_t off_wrap;
+ uint16_t flags;
+} VRingPackedDescEvent;
+
+struct VirtQueue
+{
+ VRing vring;
+ VirtQueueElement *used_elems;
+
+ /* Next head to pop */
+ uint16_t last_avail_idx;
+ bool last_avail_wrap_counter;
+
+ /* Last avail_idx read from VQ. */
+ uint16_t shadow_avail_idx;
+ bool shadow_avail_wrap_counter;
+
+ uint16_t used_idx;
+ bool used_wrap_counter;
+
+ /* Last used index value we have signalled on */
+ uint16_t signalled_used;
+
+ /* Whether signalled_used is valid */
+ bool signalled_used_valid;
+
+ /* Notification enabled? */
+ bool notification;
+
+ uint16_t queue_index;
+
+ unsigned int inuse;
+
+ uint16_t vector;
+ VirtIOHandleOutput handle_output;
+ VirtIODevice *vdev;
+ EventNotifier guest_notifier;
+ EventNotifier host_notifier;
+ bool host_notifier_enabled;
+ QLIST_ENTRY(VirtQueue) node;
+};
+
+const char *virtio_device_names[] = {
+ [VIRTIO_ID_NET] = "virtio-net",
+ [VIRTIO_ID_BLOCK] = "virtio-blk",
+ [VIRTIO_ID_CONSOLE] = "virtio-serial",
+ [VIRTIO_ID_RNG] = "virtio-rng",
+ [VIRTIO_ID_BALLOON] = "virtio-balloon",
+ [VIRTIO_ID_IOMEM] = "virtio-iomem",
+ [VIRTIO_ID_RPMSG] = "virtio-rpmsg",
+ [VIRTIO_ID_SCSI] = "virtio-scsi",
+ [VIRTIO_ID_9P] = "virtio-9p",
+ [VIRTIO_ID_MAC80211_WLAN] = "virtio-mac-wlan",
+ [VIRTIO_ID_RPROC_SERIAL] = "virtio-rproc-serial",
+ [VIRTIO_ID_CAIF] = "virtio-caif",
+ [VIRTIO_ID_MEMORY_BALLOON] = "virtio-mem-balloon",
+ [VIRTIO_ID_GPU] = "virtio-gpu",
+ [VIRTIO_ID_CLOCK] = "virtio-clk",
+ [VIRTIO_ID_INPUT] = "virtio-input",
+ [VIRTIO_ID_VSOCK] = "vhost-vsock",
+ [VIRTIO_ID_CRYPTO] = "virtio-crypto",
+ [VIRTIO_ID_SIGNAL_DIST] = "virtio-signal",
+ [VIRTIO_ID_PSTORE] = "virtio-pstore",
+ [VIRTIO_ID_IOMMU] = "virtio-iommu",
+ [VIRTIO_ID_MEM] = "virtio-mem",
+ [VIRTIO_ID_SOUND] = "virtio-sound",
+ [VIRTIO_ID_FS] = "virtio-user-fs",
+ [VIRTIO_ID_PMEM] = "virtio-pmem",
+ [VIRTIO_ID_RPMB] = "virtio-rpmb",
+ [VIRTIO_ID_MAC80211_HWSIM] = "virtio-mac-hwsim",
+ [VIRTIO_ID_VIDEO_ENCODER] = "virtio-vid-encoder",
+ [VIRTIO_ID_VIDEO_DECODER] = "virtio-vid-decoder",
+ [VIRTIO_ID_SCMI] = "virtio-scmi",
+ [VIRTIO_ID_NITRO_SEC_MOD] = "virtio-nitro-sec-mod",
+ [VIRTIO_ID_I2C_ADAPTER] = "vhost-user-i2c",
+ [VIRTIO_ID_WATCHDOG] = "virtio-watchdog",
+ [VIRTIO_ID_CAN] = "virtio-can",
+ [VIRTIO_ID_DMABUF] = "virtio-dmabuf",
+ [VIRTIO_ID_PARAM_SERV] = "virtio-param-serv",
+ [VIRTIO_ID_AUDIO_POLICY] = "virtio-audio-pol",
+ [VIRTIO_ID_BT] = "virtio-bluetooth",
+ [VIRTIO_ID_GPIO] = "virtio-gpio"
+};
+
+static const char *virtio_id_to_name(uint16_t device_id)
+{
+ assert(device_id < G_N_ELEMENTS(virtio_device_names));
+ const char *name = virtio_device_names[device_id];
+ assert(name != NULL);
+ return name;
+}
+
+/* Called within call_rcu(). */
+static void virtio_free_region_cache(VRingMemoryRegionCaches *caches)
+{
+ assert(caches != NULL);
+ address_space_cache_destroy(&caches->desc);
+ address_space_cache_destroy(&caches->avail);
+ address_space_cache_destroy(&caches->used);
+ g_free(caches);
+}
+
+static void virtio_virtqueue_reset_region_cache(struct VirtQueue *vq)
+{
+ VRingMemoryRegionCaches *caches;
+
+ caches = qatomic_read(&vq->vring.caches);
+ qatomic_rcu_set(&vq->vring.caches, NULL);
+ if (caches) {
+ call_rcu(caches, virtio_free_region_cache, rcu);
+ }
+}
+
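+/*
+ * (Re)build the MemoryRegionCaches for queue @n's descriptor, avail and
+ * used rings and publish them with RCU. On any mapping failure the caches
+ * are torn down again and the queue is left without a cache, which readers
+ * treat as "queue not ready".
+ */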
+static void virtio_init_region_cache(VirtIODevice *vdev, int n)
+{
+ VirtQueue *vq = &vdev->vq[n];
+ VRingMemoryRegionCaches *old = vq->vring.caches;
+ VRingMemoryRegionCaches *new = NULL;
+ hwaddr addr, size;
+ int64_t len;
+ bool packed;
+
+ addr = vq->vring.desc;
+ if (!addr) {
+ goto out_no_cache;
+ }
+ new = g_new0(VRingMemoryRegionCaches, 1);
+ size = virtio_queue_get_desc_size(vdev, n);
+ packed = virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED);
+ len = address_space_cache_init(&new->desc, vdev->dma_as,
+ addr, size, packed);
+ if (len < size) {
+ virtio_error(vdev, "Cannot map desc");
+ goto err_desc;
+ }
+
+ size = virtio_queue_get_used_size(vdev, n);
+ len = address_space_cache_init(&new->used, vdev->dma_as,
+ vq->vring.used, size, true);
+ if (len < size) {
+ virtio_error(vdev, "Cannot map used");
+ goto err_used;
+ }
+
+ size = virtio_queue_get_avail_size(vdev, n);
+ len = address_space_cache_init(&new->avail, vdev->dma_as,
+ vq->vring.avail, size, false);
+ if (len < size) {
+ virtio_error(vdev, "Cannot map avail");
+ goto err_avail;
+ }
+
+ qatomic_rcu_set(&vq->vring.caches, new);
+ if (old) {
+ call_rcu(old, virtio_free_region_cache, rcu);
+ }
+ return;
+
+err_avail:
+ address_space_cache_destroy(&new->avail);
+err_used:
+ address_space_cache_destroy(&new->used);
+err_desc:
+ address_space_cache_destroy(&new->desc);
+out_no_cache:
+ g_free(new);
+ virtio_virtqueue_reset_region_cache(vq);
+}
+
+/* virt queue functions */
+void virtio_queue_update_rings(VirtIODevice *vdev, int n)
+{
+ VRing *vring = &vdev->vq[n].vring;
+
+ if (!vring->num || !vring->desc || !vring->align) {
+ /* not yet set up -> nothing to do */
+ return;
+ }
+ vring->avail = vring->desc + vring->num * sizeof(VRingDesc);
+ vring->used = vring_align(vring->avail +
+ offsetof(VRingAvail, ring[vring->num]),
+ vring->align);
+ virtio_init_region_cache(vdev, n);
+}
+
+/* Called within rcu_read_lock(). */
+static void vring_split_desc_read(VirtIODevice *vdev, VRingDesc *desc,
+ MemoryRegionCache *cache, int i)
+{
+ address_space_read_cached(cache, i * sizeof(VRingDesc),
+ desc, sizeof(VRingDesc));
+ virtio_tswap64s(vdev, &desc->addr);
+ virtio_tswap32s(vdev, &desc->len);
+ virtio_tswap16s(vdev, &desc->flags);
+ virtio_tswap16s(vdev, &desc->next);
+}
+
+static void vring_packed_event_read(VirtIODevice *vdev,
+ MemoryRegionCache *cache,
+ VRingPackedDescEvent *e)
+{
+ hwaddr off_off = offsetof(VRingPackedDescEvent, off_wrap);
+ hwaddr off_flags = offsetof(VRingPackedDescEvent, flags);
+
+ e->flags = virtio_lduw_phys_cached(vdev, cache, off_flags);
+ /* Make sure flags is seen before off_wrap */
+ smp_rmb();
+ e->off_wrap = virtio_lduw_phys_cached(vdev, cache, off_off);
+}
+
+static void vring_packed_off_wrap_write(VirtIODevice *vdev,
+ MemoryRegionCache *cache,
+ uint16_t off_wrap)
+{
+ hwaddr off = offsetof(VRingPackedDescEvent, off_wrap);
+
+ virtio_stw_phys_cached(vdev, cache, off, off_wrap);
+ address_space_cache_invalidate(cache, off, sizeof(off_wrap));
+}
+
+static void vring_packed_flags_write(VirtIODevice *vdev,
+ MemoryRegionCache *cache, uint16_t flags)
+{
+ hwaddr off = offsetof(VRingPackedDescEvent, flags);
+
+ virtio_stw_phys_cached(vdev, cache, off, flags);
+ address_space_cache_invalidate(cache, off, sizeof(flags));
+}
+
+/* Called within rcu_read_lock(). */
+static VRingMemoryRegionCaches *vring_get_region_caches(struct VirtQueue *vq)
+{
+ return qatomic_rcu_read(&vq->vring.caches);
+}
+
+/* Called within rcu_read_lock(). */
+static inline uint16_t vring_avail_flags(VirtQueue *vq)
+{
+ VRingMemoryRegionCaches *caches = vring_get_region_caches(vq);
+ hwaddr pa = offsetof(VRingAvail, flags);
+
+ if (!caches) {
+ return 0;
+ }
+
+ return virtio_lduw_phys_cached(vq->vdev, &caches->avail, pa);
+}
+
+/* Called within rcu_read_lock(). */
+static inline uint16_t vring_avail_idx(VirtQueue *vq)
+{
+ VRingMemoryRegionCaches *caches = vring_get_region_caches(vq);
+ hwaddr pa = offsetof(VRingAvail, idx);
+
+ if (!caches) {
+ return 0;
+ }
+
+ vq->shadow_avail_idx = virtio_lduw_phys_cached(vq->vdev, &caches->avail, pa);
+ return vq->shadow_avail_idx;
+}
+
+/* Called within rcu_read_lock(). */
+static inline uint16_t vring_avail_ring(VirtQueue *vq, int i)
+{
+ VRingMemoryRegionCaches *caches = vring_get_region_caches(vq);
+ hwaddr pa = offsetof(VRingAvail, ring[i]);
+
+ if (!caches) {
+ return 0;
+ }
+
+ return virtio_lduw_phys_cached(vq->vdev, &caches->avail, pa);
+}
+
+/* Called within rcu_read_lock(). */
+static inline uint16_t vring_get_used_event(VirtQueue *vq)
+{
+ return vring_avail_ring(vq, vq->vring.num);
+}
+
+/* Called within rcu_read_lock(). */
+static inline void vring_used_write(VirtQueue *vq, VRingUsedElem *uelem,
+ int i)
+{
+ VRingMemoryRegionCaches *caches = vring_get_region_caches(vq);
+ hwaddr pa = offsetof(VRingUsed, ring[i]);
+
+ if (!caches) {
+ return;
+ }
+
+ virtio_tswap32s(vq->vdev, &uelem->id);
+ virtio_tswap32s(vq->vdev, &uelem->len);
+ address_space_write_cached(&caches->used, pa, uelem, sizeof(VRingUsedElem));
+ address_space_cache_invalidate(&caches->used, pa, sizeof(VRingUsedElem));
+}
+
+/* Called within rcu_read_lock(). */
+static inline uint16_t vring_used_flags(VirtQueue *vq)
+{
+ VRingMemoryRegionCaches *caches = vring_get_region_caches(vq);
+ hwaddr pa = offsetof(VRingUsed, flags);
+
+ if (!caches) {
+ return 0;
+ }
+
+ return virtio_lduw_phys_cached(vq->vdev, &caches->used, pa);
+}
+
+/* Called within rcu_read_lock(). */
+static uint16_t vring_used_idx(VirtQueue *vq)
+{
+ VRingMemoryRegionCaches *caches = vring_get_region_caches(vq);
+ hwaddr pa = offsetof(VRingUsed, idx);
+
+ if (!caches) {
+ return 0;
+ }
+
+ return virtio_lduw_phys_cached(vq->vdev, &caches->used, pa);
+}
+
+/* Called within rcu_read_lock(). */
+static inline void vring_used_idx_set(VirtQueue *vq, uint16_t val)
+{
+ VRingMemoryRegionCaches *caches = vring_get_region_caches(vq);
+ hwaddr pa = offsetof(VRingUsed, idx);
+
+ if (caches) {
+ virtio_stw_phys_cached(vq->vdev, &caches->used, pa, val);
+ address_space_cache_invalidate(&caches->used, pa, sizeof(val));
+ }
+
+ vq->used_idx = val;
+}
+
+/* Called within rcu_read_lock(). */
+static inline void vring_used_flags_set_bit(VirtQueue *vq, int mask)
+{
+ VRingMemoryRegionCaches *caches = vring_get_region_caches(vq);
+ VirtIODevice *vdev = vq->vdev;
+ hwaddr pa = offsetof(VRingUsed, flags);
+ uint16_t flags;
+
+ if (!caches) {
+ return;
+ }
+
+ flags = virtio_lduw_phys_cached(vq->vdev, &caches->used, pa);
+ virtio_stw_phys_cached(vdev, &caches->used, pa, flags | mask);
+ address_space_cache_invalidate(&caches->used, pa, sizeof(flags));
+}
+
+/* Called within rcu_read_lock(). */
+static inline void vring_used_flags_unset_bit(VirtQueue *vq, int mask)
+{
+ VRingMemoryRegionCaches *caches = vring_get_region_caches(vq);
+ VirtIODevice *vdev = vq->vdev;
+ hwaddr pa = offsetof(VRingUsed, flags);
+ uint16_t flags;
+
+ if (!caches) {
+ return;
+ }
+
+ flags = virtio_lduw_phys_cached(vq->vdev, &caches->used, pa);
+ virtio_stw_phys_cached(vdev, &caches->used, pa, flags & ~mask);
+ address_space_cache_invalidate(&caches->used, pa, sizeof(flags));
+}
+
+/* Called within rcu_read_lock(). */
+static inline void vring_set_avail_event(VirtQueue *vq, uint16_t val)
+{
+ VRingMemoryRegionCaches *caches;
+ hwaddr pa;
+ if (!vq->notification) {
+ return;
+ }
+
+ caches = vring_get_region_caches(vq);
+ if (!caches) {
+ return;
+ }
+
+ pa = offsetof(VRingUsed, ring[vq->vring.num]);
+ virtio_stw_phys_cached(vq->vdev, &caches->used, pa, val);
+ address_space_cache_invalidate(&caches->used, pa, sizeof(val));
+}
+
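+/*
+ * Enable or disable guest->host notifications on a split ring: either
+ * publish the next avail index we care about (VIRTIO_RING_F_EVENT_IDX)
+ * or toggle VRING_USED_F_NO_NOTIFY in the used ring flags.
+ */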
+static void virtio_queue_split_set_notification(VirtQueue *vq, int enable)
+{
+ RCU_READ_LOCK_GUARD();
+
+ if (virtio_vdev_has_feature(vq->vdev, VIRTIO_RING_F_EVENT_IDX)) {
+ vring_set_avail_event(vq, vring_avail_idx(vq));
+ } else if (enable) {
+ vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY);
+ } else {
+ vring_used_flags_set_bit(vq, VRING_USED_F_NO_NOTIFY);
+ }
+ if (enable) {
+ /* Expose avail event/used flags before caller checks the avail idx. */
+ smp_mb();
+ }
+}
+
+static void virtio_queue_packed_set_notification(VirtQueue *vq, int enable)
+{
+ uint16_t off_wrap;
+ VRingPackedDescEvent e;
+ VRingMemoryRegionCaches *caches;
+
+ RCU_READ_LOCK_GUARD();
+ caches = vring_get_region_caches(vq);
+ if (!caches) {
+ return;
+ }
+
+ vring_packed_event_read(vq->vdev, &caches->used, &e);
+
+ if (!enable) {
+ e.flags = VRING_PACKED_EVENT_FLAG_DISABLE;
+ } else if (virtio_vdev_has_feature(vq->vdev, VIRTIO_RING_F_EVENT_IDX)) {
+ off_wrap = vq->shadow_avail_idx | vq->shadow_avail_wrap_counter << 15;
+ vring_packed_off_wrap_write(vq->vdev, &caches->used, off_wrap);
+ /* Make sure off_wrap is written before flags */
+ smp_wmb();
+ e.flags = VRING_PACKED_EVENT_FLAG_DESC;
+ } else {
+ e.flags = VRING_PACKED_EVENT_FLAG_ENABLE;
+ }
+
+ vring_packed_flags_write(vq->vdev, &caches->used, e.flags);
+ if (enable) {
+ /* Expose avail event/used flags before caller checks the avail idx. */
+ smp_mb();
+ }
+}
+
+bool virtio_queue_get_notification(VirtQueue *vq)
+{
+ return vq->notification;
+}
+
+void virtio_queue_set_notification(VirtQueue *vq, int enable)
+{
+ vq->notification = enable;
+
+ if (!vq->vring.desc) {
+ return;
+ }
+
+ if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) {
+ virtio_queue_packed_set_notification(vq, enable);
+ } else {
+ virtio_queue_split_set_notification(vq, enable);
+ }
+}
+
+int virtio_queue_ready(VirtQueue *vq)
+{
+ return vq->vring.avail != 0;
+}
+
+static void vring_packed_desc_read_flags(VirtIODevice *vdev,
+ uint16_t *flags,
+ MemoryRegionCache *cache,
+ int i)
+{
+ hwaddr off = i * sizeof(VRingPackedDesc) + offsetof(VRingPackedDesc, flags);
+
+ *flags = virtio_lduw_phys_cached(vdev, cache, off);
+}
+
+static void vring_packed_desc_read(VirtIODevice *vdev,
+ VRingPackedDesc *desc,
+ MemoryRegionCache *cache,
+ int i, bool strict_order)
+{
+ hwaddr off = i * sizeof(VRingPackedDesc);
+
+ vring_packed_desc_read_flags(vdev, &desc->flags, cache, i);
+
+ if (strict_order) {
+ /* Make sure flags are read before the rest of the fields. */
+ smp_rmb();
+ }
+
+ address_space_read_cached(cache, off + offsetof(VRingPackedDesc, addr),
+ &desc->addr, sizeof(desc->addr));
+ address_space_read_cached(cache, off + offsetof(VRingPackedDesc, id),
+ &desc->id, sizeof(desc->id));
+ address_space_read_cached(cache, off + offsetof(VRingPackedDesc, len),
+ &desc->len, sizeof(desc->len));
+ virtio_tswap64s(vdev, &desc->addr);
+ virtio_tswap16s(vdev, &desc->id);
+ virtio_tswap32s(vdev, &desc->len);
+}
+
+static void vring_packed_desc_write_data(VirtIODevice *vdev,
+ VRingPackedDesc *desc,
+ MemoryRegionCache *cache,
+ int i)
+{
+ hwaddr off_id = i * sizeof(VRingPackedDesc) +
+ offsetof(VRingPackedDesc, id);
+ hwaddr off_len = i * sizeof(VRingPackedDesc) +
+ offsetof(VRingPackedDesc, len);
+
+ virtio_tswap32s(vdev, &desc->len);
+ virtio_tswap16s(vdev, &desc->id);
+ address_space_write_cached(cache, off_id, &desc->id, sizeof(desc->id));
+ address_space_cache_invalidate(cache, off_id, sizeof(desc->id));
+ address_space_write_cached(cache, off_len, &desc->len, sizeof(desc->len));
+ address_space_cache_invalidate(cache, off_len, sizeof(desc->len));
+}
+
+static void vring_packed_desc_write_flags(VirtIODevice *vdev,
+ VRingPackedDesc *desc,
+ MemoryRegionCache *cache,
+ int i)
+{
+ hwaddr off = i * sizeof(VRingPackedDesc) + offsetof(VRingPackedDesc, flags);
+
+ virtio_stw_phys_cached(vdev, cache, off, desc->flags);
+ address_space_cache_invalidate(cache, off, sizeof(desc->flags));
+}
+
+static void vring_packed_desc_write(VirtIODevice *vdev,
+ VRingPackedDesc *desc,
+ MemoryRegionCache *cache,
+ int i, bool strict_order)
+{
+ vring_packed_desc_write_data(vdev, desc, cache, i);
+ if (strict_order) {
+ /* Make sure data is written before flags. */
+ smp_wmb();
+ }
+ vring_packed_desc_write_flags(vdev, desc, cache, i);
+}
+
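+/*
+ * A packed-ring descriptor is available when its AVAIL and USED flag bits
+ * differ and AVAIL matches the ring's current wrap counter. For example,
+ * on the first pass through the ring (wrap counter = true) a descriptor
+ * with AVAIL = 1, USED = 0 is available; once the device writes it back
+ * with AVAIL = USED = 1, it no longer is.
+ */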
+static inline bool is_desc_avail(uint16_t flags, bool wrap_counter)
+{
+ bool avail, used;
+
+ avail = !!(flags & (1 << VRING_PACKED_DESC_F_AVAIL));
+ used = !!(flags & (1 << VRING_PACKED_DESC_F_USED));
+ return (avail != used) && (avail == wrap_counter);
+}
+
+/*
+ * Fetch avail_idx from VQ memory only when we really need to know if the
+ * guest has added some buffers.
+ * Called within rcu_read_lock().
+ */
+static int virtio_queue_empty_rcu(VirtQueue *vq)
+{
+ if (virtio_device_disabled(vq->vdev)) {
+ return 1;
+ }
+
+ if (unlikely(!vq->vring.avail)) {
+ return 1;
+ }
+
+ if (vq->shadow_avail_idx != vq->last_avail_idx) {
+ return 0;
+ }
+
+ return vring_avail_idx(vq) == vq->last_avail_idx;
+}
+
+static int virtio_queue_split_empty(VirtQueue *vq)
+{
+ bool empty;
+
+ if (virtio_device_disabled(vq->vdev)) {
+ return 1;
+ }
+
+ if (unlikely(!vq->vring.avail)) {
+ return 1;
+ }
+
+ if (vq->shadow_avail_idx != vq->last_avail_idx) {
+ return 0;
+ }
+
+ RCU_READ_LOCK_GUARD();
+ empty = vring_avail_idx(vq) == vq->last_avail_idx;
+ return empty;
+}
+
+/* Called within rcu_read_lock(). */
+static int virtio_queue_packed_empty_rcu(VirtQueue *vq)
+{
+ struct VRingPackedDesc desc;
+ VRingMemoryRegionCaches *cache;
+
+ if (unlikely(!vq->vring.desc)) {
+ return 1;
+ }
+
+ cache = vring_get_region_caches(vq);
+ if (!cache) {
+ return 1;
+ }
+
+ vring_packed_desc_read_flags(vq->vdev, &desc.flags, &cache->desc,
+ vq->last_avail_idx);
+
+ return !is_desc_avail(desc.flags, vq->last_avail_wrap_counter);
+}
+
+static int virtio_queue_packed_empty(VirtQueue *vq)
+{
+ RCU_READ_LOCK_GUARD();
+ return virtio_queue_packed_empty_rcu(vq);
+}
+
+int virtio_queue_empty(VirtQueue *vq)
+{
+ if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) {
+ return virtio_queue_packed_empty(vq);
+ } else {
+ return virtio_queue_split_empty(vq);
+ }
+}
+
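+/*
+ * Unmap a popped element's scatter/gather lists. Device-writable buffers
+ * (in_sg) are unmapped with the number of bytes actually written so dirty
+ * tracking stays accurate; device-readable buffers (out_sg) were not
+ * modified and are unmapped in full.
+ */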
+static void virtqueue_unmap_sg(VirtQueue *vq, const VirtQueueElement *elem,
+ unsigned int len)
+{
+ AddressSpace *dma_as = vq->vdev->dma_as;
+ unsigned int offset;
+ int i;
+
+ offset = 0;
+ for (i = 0; i < elem->in_num; i++) {
+ size_t size = MIN(len - offset, elem->in_sg[i].iov_len);
+
+ dma_memory_unmap(dma_as, elem->in_sg[i].iov_base,
+ elem->in_sg[i].iov_len,
+ DMA_DIRECTION_FROM_DEVICE, size);
+
+ offset += size;
+ }
+
+ for (i = 0; i < elem->out_num; i++) {
+ dma_memory_unmap(dma_as, elem->out_sg[i].iov_base,
+ elem->out_sg[i].iov_len,
+ DMA_DIRECTION_TO_DEVICE,
+ elem->out_sg[i].iov_len);
+ }
+}
+
+/* virtqueue_detach_element:
+ * @vq: The #VirtQueue
+ * @elem: The #VirtQueueElement
+ * @len: number of bytes written
+ *
+ * Detach the element from the virtqueue. This function is suitable for device
+ * reset or other situations where a #VirtQueueElement is simply freed and will
+ * not be pushed or discarded.
+ */
+void virtqueue_detach_element(VirtQueue *vq, const VirtQueueElement *elem,
+ unsigned int len)
+{
+ vq->inuse -= elem->ndescs;
+ virtqueue_unmap_sg(vq, elem, len);
+}
+
+static void virtqueue_split_rewind(VirtQueue *vq, unsigned int num)
+{
+ vq->last_avail_idx -= num;
+}
+
+static void virtqueue_packed_rewind(VirtQueue *vq, unsigned int num)
+{
+ if (vq->last_avail_idx < num) {
+ vq->last_avail_idx = vq->vring.num + vq->last_avail_idx - num;
+ vq->last_avail_wrap_counter ^= 1;
+ } else {
+ vq->last_avail_idx -= num;
+ }
+}
+
+/* virtqueue_unpop:
+ * @vq: The #VirtQueue
+ * @elem: The #VirtQueueElement
+ * @len: number of bytes written
+ *
+ * Pretend the most recent element wasn't popped from the virtqueue. The next
+ * call to virtqueue_pop() will refetch the element.
+ */
+void virtqueue_unpop(VirtQueue *vq, const VirtQueueElement *elem,
+ unsigned int len)
+{
+ if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) {
+ virtqueue_packed_rewind(vq, 1);
+ } else {
+ virtqueue_split_rewind(vq, 1);
+ }
+
+ virtqueue_detach_element(vq, elem, len);
+}
+
+/* virtqueue_rewind:
+ * @vq: The #VirtQueue
+ * @num: Number of elements to push back
+ *
+ * Pretend that elements weren't popped from the virtqueue. The next
+ * virtqueue_pop() will refetch the oldest element.
+ *
+ * Use virtqueue_unpop() instead if you have a VirtQueueElement.
+ *
+ * Returns: true on success, false if @num is greater than the number of in use
+ * elements.
+ */
+bool virtqueue_rewind(VirtQueue *vq, unsigned int num)
+{
+ if (num > vq->inuse) {
+ return false;
+ }
+
+ vq->inuse -= num;
+ if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) {
+ virtqueue_packed_rewind(vq, num);
+ } else {
+ virtqueue_split_rewind(vq, num);
+ }
+ return true;
+}
+
+static void virtqueue_split_fill(VirtQueue *vq, const VirtQueueElement *elem,
+ unsigned int len, unsigned int idx)
+{
+ VRingUsedElem uelem;
+
+ if (unlikely(!vq->vring.used)) {
+ return;
+ }
+
+ idx = (idx + vq->used_idx) % vq->vring.num;
+
+ uelem.id = elem->index;
+ uelem.len = len;
+ vring_used_write(vq, &uelem, idx);
+}
+
+static void virtqueue_packed_fill(VirtQueue *vq, const VirtQueueElement *elem,
+ unsigned int len, unsigned int idx)
+{
+ vq->used_elems[idx].index = elem->index;
+ vq->used_elems[idx].len = len;
+ vq->used_elems[idx].ndescs = elem->ndescs;
+}
+
+static void virtqueue_packed_fill_desc(VirtQueue *vq,
+ const VirtQueueElement *elem,
+ unsigned int idx,
+ bool strict_order)
+{
+ uint16_t head;
+ VRingMemoryRegionCaches *caches;
+ VRingPackedDesc desc = {
+ .id = elem->index,
+ .len = elem->len,
+ };
+ bool wrap_counter = vq->used_wrap_counter;
+
+ if (unlikely(!vq->vring.desc)) {
+ return;
+ }
+
+ head = vq->used_idx + idx;
+ if (head >= vq->vring.num) {
+ head -= vq->vring.num;
+ wrap_counter ^= 1;
+ }
+ if (wrap_counter) {
+ desc.flags |= (1 << VRING_PACKED_DESC_F_AVAIL);
+ desc.flags |= (1 << VRING_PACKED_DESC_F_USED);
+ } else {
+ desc.flags &= ~(1 << VRING_PACKED_DESC_F_AVAIL);
+ desc.flags &= ~(1 << VRING_PACKED_DESC_F_USED);
+ }
+
+ caches = vring_get_region_caches(vq);
+ if (!caches) {
+ return;
+ }
+
+ vring_packed_desc_write(vq->vdev, &desc, &caches->desc, head, strict_order);
+}
+
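+/*
+ * virtqueue_fill() records a completed element @idx slots past the current
+ * used index (split) or in the used_elems shadow array (packed); a later
+ * virtqueue_flush() makes a batch of filled elements visible to the guest.
+ */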
+/* Called within rcu_read_lock(). */
+void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem,
+ unsigned int len, unsigned int idx)
+{
+ trace_virtqueue_fill(vq, elem, len, idx);
+
+ virtqueue_unmap_sg(vq, elem, len);
+
+ if (virtio_device_disabled(vq->vdev)) {
+ return;
+ }
+
+ if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) {
+ virtqueue_packed_fill(vq, elem, len, idx);
+ } else {
+ virtqueue_split_fill(vq, elem, len, idx);
+ }
+}
+
+/* Called within rcu_read_lock(). */
+static void virtqueue_split_flush(VirtQueue *vq, unsigned int count)
+{
+ uint16_t old, new;
+
+ if (unlikely(!vq->vring.used)) {
+ return;
+ }
+
+ /* Make sure buffer is written before we update index. */
+ smp_wmb();
+ trace_virtqueue_flush(vq, count);
+ old = vq->used_idx;
+ new = old + count;
+ vring_used_idx_set(vq, new);
+ vq->inuse -= count;
+ if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
+ vq->signalled_used_valid = false;
+ }
+}
+
+static void virtqueue_packed_flush(VirtQueue *vq, unsigned int count)
+{
+ unsigned int i, ndescs = 0;
+
+ if (unlikely(!vq->vring.desc)) {
+ return;
+ }
+
+ for (i = 1; i < count; i++) {
+ virtqueue_packed_fill_desc(vq, &vq->used_elems[i], i, false);
+ ndescs += vq->used_elems[i].ndescs;
+ }
+ virtqueue_packed_fill_desc(vq, &vq->used_elems[0], 0, true);
+ ndescs += vq->used_elems[0].ndescs;
+
+ vq->inuse -= ndescs;
+ vq->used_idx += ndescs;
+ if (vq->used_idx >= vq->vring.num) {
+ vq->used_idx -= vq->vring.num;
+ vq->used_wrap_counter ^= 1;
+ vq->signalled_used_valid = false;
+ }
+}
+
+void virtqueue_flush(VirtQueue *vq, unsigned int count)
+{
+ if (virtio_device_disabled(vq->vdev)) {
+ vq->inuse -= count;
+ return;
+ }
+
+ if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) {
+ virtqueue_packed_flush(vq, count);
+ } else {
+ virtqueue_split_flush(vq, count);
+ }
+}
+
+void virtqueue_push(VirtQueue *vq, const VirtQueueElement *elem,
+ unsigned int len)
+{
+ RCU_READ_LOCK_GUARD();
+ virtqueue_fill(vq, elem, len, 0);
+ virtqueue_flush(vq, 1);
+}
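+
+/*
+ * Typical device-side usage (a minimal sketch; error handling omitted and
+ * process_request() is a hypothetical helper):
+ *
+ *     VirtQueueElement *elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
+ *     if (elem) {
+ *         size_t len = process_request(elem);
+ *         virtqueue_push(vq, elem, len);
+ *         virtio_notify(vdev, vq);
+ *         g_free(elem);
+ *     }
+ */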
+
+/* Called within rcu_read_lock(). */
+static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx)
+{
+ uint16_t num_heads = vring_avail_idx(vq) - idx;
+
+ /* Check it isn't doing very strange things with descriptor numbers. */
+ if (num_heads > vq->vring.num) {
+ virtio_error(vq->vdev, "Guest moved used index from %u to %u",
+ idx, vq->shadow_avail_idx);
+ return -EINVAL;
+ }
+ /* On success, callers read a descriptor at vq->last_avail_idx.
+ * Make sure descriptor read does not bypass avail index read. */
+ if (num_heads) {
+ smp_rmb();
+ }
+
+ return num_heads;
+}
+
+/* Called within rcu_read_lock(). */
+static bool virtqueue_get_head(VirtQueue *vq, unsigned int idx,
+ unsigned int *head)
+{
+ /* Grab the next descriptor number they're advertising, and increment
+ * the index we've seen. */
+ *head = vring_avail_ring(vq, idx % vq->vring.num);
+
+ /* If their number is silly, that's a fatal mistake. */
+ if (*head >= vq->vring.num) {
+ virtio_error(vq->vdev, "Guest says index %u is available", *head);
+ return false;
+ }
+
+ return true;
+}
+
+enum {
+ VIRTQUEUE_READ_DESC_ERROR = -1,
+ VIRTQUEUE_READ_DESC_DONE = 0, /* end of chain */
+ VIRTQUEUE_READ_DESC_MORE = 1, /* more buffers in chain */
+};
+
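+/*
+ * Follow the VRING_DESC_F_NEXT chain one step: read the next descriptor
+ * into @desc and return VIRTQUEUE_READ_DESC_MORE, or report end of chain
+ * or a corrupted next index.
+ */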
+static int virtqueue_split_read_next_desc(VirtIODevice *vdev, VRingDesc *desc,
+ MemoryRegionCache *desc_cache,
+ unsigned int max, unsigned int *next)
+{
+ /* If this descriptor says it doesn't chain, we're done. */
+ if (!(desc->flags & VRING_DESC_F_NEXT)) {
+ return VIRTQUEUE_READ_DESC_DONE;
+ }
+
+ /* Check they're not leading us off end of descriptors. */
+ *next = desc->next;
+ /* Make sure compiler knows to grab that: we don't want it changing! */
+ smp_wmb();
+
+ if (*next >= max) {
+ virtio_error(vdev, "Desc next is %u", *next);
+ return VIRTQUEUE_READ_DESC_ERROR;
+ }
+
+ vring_split_desc_read(vdev, desc, desc_cache, *next);
+ return VIRTQUEUE_READ_DESC_MORE;
+}
+
+/* Called within rcu_read_lock(). */
+static void virtqueue_split_get_avail_bytes(VirtQueue *vq,
+ unsigned int *in_bytes, unsigned int *out_bytes,
+ unsigned max_in_bytes, unsigned max_out_bytes,
+ VRingMemoryRegionCaches *caches)
+{
+ VirtIODevice *vdev = vq->vdev;
+ unsigned int max, idx;
+ unsigned int total_bufs, in_total, out_total;
+ MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
+ int64_t len = 0;
+ int rc;
+
+ idx = vq->last_avail_idx;
+ total_bufs = in_total = out_total = 0;
+
+ max = vq->vring.num;
+
+ while ((rc = virtqueue_num_heads(vq, idx)) > 0) {
+ MemoryRegionCache *desc_cache = &caches->desc;
+ unsigned int num_bufs;
+ VRingDesc desc;
+ unsigned int i;
+
+ num_bufs = total_bufs;
+
+ if (!virtqueue_get_head(vq, idx++, &i)) {
+ goto err;
+ }
+
+ vring_split_desc_read(vdev, &desc, desc_cache, i);
+
+ if (desc.flags & VRING_DESC_F_INDIRECT) {
+ if (!desc.len || (desc.len % sizeof(VRingDesc))) {
+ virtio_error(vdev, "Invalid size for indirect buffer table");
+ goto err;
+ }
+
+ /* If we've got too many, that implies a descriptor loop. */
+ if (num_bufs >= max) {
+ virtio_error(vdev, "Looped descriptor");
+ goto err;
+ }
+
+ /* loop over the indirect descriptor table */
+ len = address_space_cache_init(&indirect_desc_cache,
+ vdev->dma_as,
+ desc.addr, desc.len, false);
+ desc_cache = &indirect_desc_cache;
+ if (len < desc.len) {
+ virtio_error(vdev, "Cannot map indirect buffer");
+ goto err;
+ }
+
+ max = desc.len / sizeof(VRingDesc);
+ num_bufs = i = 0;
+ vring_split_desc_read(vdev, &desc, desc_cache, i);
+ }
+
+ do {
+ /* If we've got too many, that implies a descriptor loop. */
+ if (++num_bufs > max) {
+ virtio_error(vdev, "Looped descriptor");
+ goto err;
+ }
+
+ if (desc.flags & VRING_DESC_F_WRITE) {
+ in_total += desc.len;
+ } else {
+ out_total += desc.len;
+ }
+ if (in_total >= max_in_bytes && out_total >= max_out_bytes) {
+ goto done;
+ }
+
+ rc = virtqueue_split_read_next_desc(vdev, &desc, desc_cache, max, &i);
+ } while (rc == VIRTQUEUE_READ_DESC_MORE);
+
+ if (rc == VIRTQUEUE_READ_DESC_ERROR) {
+ goto err;
+ }
+
+ if (desc_cache == &indirect_desc_cache) {
+ address_space_cache_destroy(&indirect_desc_cache);
+ total_bufs++;
+ } else {
+ total_bufs = num_bufs;
+ }
+ }
+
+ if (rc < 0) {
+ goto err;
+ }
+
+done:
+ address_space_cache_destroy(&indirect_desc_cache);
+ if (in_bytes) {
+ *in_bytes = in_total;
+ }
+ if (out_bytes) {
+ *out_bytes = out_total;
+ }
+ return;
+
+err:
+ in_total = out_total = 0;
+ goto done;
+}
+
+static int virtqueue_packed_read_next_desc(VirtQueue *vq,
+ VRingPackedDesc *desc,
+ MemoryRegionCache *desc_cache,
+ unsigned int max,
+ unsigned int *next,
+ bool indirect)
+{
+ /* If this descriptor says it doesn't chain, we're done. */
+ if (!indirect && !(desc->flags & VRING_DESC_F_NEXT)) {
+ return VIRTQUEUE_READ_DESC_DONE;
+ }
+
+ ++*next;
+ if (*next == max) {
+ if (indirect) {
+ return VIRTQUEUE_READ_DESC_DONE;
+ } else {
+ (*next) -= vq->vring.num;
+ }
+ }
+
+ vring_packed_desc_read(vq->vdev, desc, desc_cache, *next, false);
+ return VIRTQUEUE_READ_DESC_MORE;
+}
+
+/* Called within rcu_read_lock(). */
+static void virtqueue_packed_get_avail_bytes(VirtQueue *vq,
+ unsigned int *in_bytes,
+ unsigned int *out_bytes,
+ unsigned max_in_bytes,
+ unsigned max_out_bytes,
+ VRingMemoryRegionCaches *caches)
+{
+ VirtIODevice *vdev = vq->vdev;
+ unsigned int max, idx;
+ unsigned int total_bufs, in_total, out_total;
+ MemoryRegionCache *desc_cache;
+ MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
+ int64_t len = 0;
+ VRingPackedDesc desc;
+ bool wrap_counter;
+
+ idx = vq->last_avail_idx;
+ wrap_counter = vq->last_avail_wrap_counter;
+ total_bufs = in_total = out_total = 0;
+
+ max = vq->vring.num;
+
+ for (;;) {
+ unsigned int num_bufs = total_bufs;
+ unsigned int i = idx;
+ int rc;
+
+ desc_cache = &caches->desc;
+ vring_packed_desc_read(vdev, &desc, desc_cache, idx, true);
+ if (!is_desc_avail(desc.flags, wrap_counter)) {
+ break;
+ }
+
+ if (desc.flags & VRING_DESC_F_INDIRECT) {
+ if (desc.len % sizeof(VRingPackedDesc)) {
+ virtio_error(vdev, "Invalid size for indirect buffer table");
+ goto err;
+ }
+
+ /* If we've got too many, that implies a descriptor loop. */
+ if (num_bufs >= max) {
+ virtio_error(vdev, "Looped descriptor");
+ goto err;
+ }
+
+ /* loop over the indirect descriptor table */
+ len = address_space_cache_init(&indirect_desc_cache,
+ vdev->dma_as,
+ desc.addr, desc.len, false);
+ desc_cache = &indirect_desc_cache;
+ if (len < desc.len) {
+ virtio_error(vdev, "Cannot map indirect buffer");
+ goto err;
+ }
+
+ max = desc.len / sizeof(VRingPackedDesc);
+ num_bufs = i = 0;
+ vring_packed_desc_read(vdev, &desc, desc_cache, i, false);
+ }
+
+ do {
+ /* If we've got too many, that implies a descriptor loop. */
+ if (++num_bufs > max) {
+ virtio_error(vdev, "Looped descriptor");
+ goto err;
+ }
+
+ if (desc.flags & VRING_DESC_F_WRITE) {
+ in_total += desc.len;
+ } else {
+ out_total += desc.len;
+ }
+ if (in_total >= max_in_bytes && out_total >= max_out_bytes) {
+ goto done;
+ }
+
+ rc = virtqueue_packed_read_next_desc(vq, &desc, desc_cache, max,
+ &i, desc_cache ==
+ &indirect_desc_cache);
+ } while (rc == VIRTQUEUE_READ_DESC_MORE);
+
+ if (desc_cache == &indirect_desc_cache) {
+ address_space_cache_destroy(&indirect_desc_cache);
+ total_bufs++;
+ idx++;
+ } else {
+ idx += num_bufs - total_bufs;
+ total_bufs = num_bufs;
+ }
+
+ if (idx >= vq->vring.num) {
+ idx -= vq->vring.num;
+ wrap_counter ^= 1;
+ }
+ }
+
+ /* Record the index and wrap counter for a kick we want */
+ vq->shadow_avail_idx = idx;
+ vq->shadow_avail_wrap_counter = wrap_counter;
+done:
+ address_space_cache_destroy(&indirect_desc_cache);
+ if (in_bytes) {
+ *in_bytes = in_total;
+ }
+ if (out_bytes) {
+ *out_bytes = out_total;
+ }
+ return;
+
+err:
+ in_total = out_total = 0;
+ goto done;
+}
+
+void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes,
+ unsigned int *out_bytes,
+ unsigned max_in_bytes, unsigned max_out_bytes)
+{
+ uint16_t desc_size;
+ VRingMemoryRegionCaches *caches;
+
+ RCU_READ_LOCK_GUARD();
+
+ if (unlikely(!vq->vring.desc)) {
+ goto err;
+ }
+
+ caches = vring_get_region_caches(vq);
+ if (!caches) {
+ goto err;
+ }
+
+ desc_size = virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED) ?
+ sizeof(VRingPackedDesc) : sizeof(VRingDesc);
+ if (caches->desc.len < vq->vring.num * desc_size) {
+ virtio_error(vq->vdev, "Cannot map descriptor ring");
+ goto err;
+ }
+
+ if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) {
+ virtqueue_packed_get_avail_bytes(vq, in_bytes, out_bytes,
+ max_in_bytes, max_out_bytes,
+ caches);
+ } else {
+ virtqueue_split_get_avail_bytes(vq, in_bytes, out_bytes,
+ max_in_bytes, max_out_bytes,
+ caches);
+ }
+
+ return;
+err:
+ if (in_bytes) {
+ *in_bytes = 0;
+ }
+ if (out_bytes) {
+ *out_bytes = 0;
+ }
+}
+
+int virtqueue_avail_bytes(VirtQueue *vq, unsigned int in_bytes,
+ unsigned int out_bytes)
+{
+ unsigned int in_total, out_total;
+
+ virtqueue_get_avail_bytes(vq, &in_total, &out_total, in_bytes, out_bytes);
+ return in_bytes <= in_total && out_bytes <= out_total;
+}
+
+static bool virtqueue_map_desc(VirtIODevice *vdev, unsigned int *p_num_sg,
+ hwaddr *addr, struct iovec *iov,
+ unsigned int max_num_sg, bool is_write,
+ hwaddr pa, size_t sz)
+{
+ bool ok = false;
+ unsigned num_sg = *p_num_sg;
+ assert(num_sg <= max_num_sg);
+
+ if (!sz) {
+ virtio_error(vdev, "virtio: zero sized buffers are not allowed");
+ goto out;
+ }
+
+ while (sz) {
+ hwaddr len = sz;
+
+ if (num_sg == max_num_sg) {
+ virtio_error(vdev, "virtio: too many write descriptors in "
+ "indirect table");
+ goto out;
+ }
+
+ iov[num_sg].iov_base = dma_memory_map(vdev->dma_as, pa, &len,
+ is_write ?
+ DMA_DIRECTION_FROM_DEVICE :
+ DMA_DIRECTION_TO_DEVICE,
+ MEMTXATTRS_UNSPECIFIED);
+ if (!iov[num_sg].iov_base) {
+ virtio_error(vdev, "virtio: bogus descriptor or out of resources");
+ goto out;
+ }
+
+ iov[num_sg].iov_len = len;
+ addr[num_sg] = pa;
+
+ sz -= len;
+ pa += len;
+ num_sg++;
+ }
+ ok = true;
+
+out:
+ *p_num_sg = num_sg;
+ return ok;
+}
+
+/* Only used by error code paths before we have a VirtQueueElement (therefore
+ * virtqueue_unmap_sg() can't be used). Assumes buffers weren't written to
+ * yet.
+ */
+static void virtqueue_undo_map_desc(unsigned int out_num, unsigned int in_num,
+ struct iovec *iov)
+{
+ unsigned int i;
+
+ for (i = 0; i < out_num + in_num; i++) {
+ int is_write = i >= out_num;
+
+ cpu_physical_memory_unmap(iov->iov_base, iov->iov_len, is_write, 0);
+ iov++;
+ }
+}
+
+static void virtqueue_map_iovec(VirtIODevice *vdev, struct iovec *sg,
+ hwaddr *addr, unsigned int num_sg,
+ bool is_write)
+{
+ unsigned int i;
+ hwaddr len;
+
+ for (i = 0; i < num_sg; i++) {
+ len = sg[i].iov_len;
+ sg[i].iov_base = dma_memory_map(vdev->dma_as,
+ addr[i], &len, is_write ?
+ DMA_DIRECTION_FROM_DEVICE :
+ DMA_DIRECTION_TO_DEVICE,
+ MEMTXATTRS_UNSPECIFIED);
+ if (!sg[i].iov_base) {
+ error_report("virtio: error trying to map MMIO memory");
+ exit(1);
+ }
+ if (len != sg[i].iov_len) {
+ error_report("virtio: unexpected memory split");
+ exit(1);
+ }
+ }
+}
+
+void virtqueue_map(VirtIODevice *vdev, VirtQueueElement *elem)
+{
+ virtqueue_map_iovec(vdev, elem->in_sg, elem->in_addr, elem->in_num, true);
+ virtqueue_map_iovec(vdev, elem->out_sg, elem->out_addr, elem->out_num,
+ false);
+}
+
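+/*
+ * Allocate a VirtQueueElement (or a device-specific struct of size @sz that
+ * embeds one at offset 0) together with its four variable-size arrays in a
+ * single allocation:
+ *
+ *     [element][in_addr][out_addr][in_sg][out_sg]
+ *
+ * with each array suitably aligned, so one g_free() releases everything.
+ */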
+static void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_num)
+{
+ VirtQueueElement *elem;
+ size_t in_addr_ofs = QEMU_ALIGN_UP(sz, __alignof__(elem->in_addr[0]));
+ size_t out_addr_ofs = in_addr_ofs + in_num * sizeof(elem->in_addr[0]);
+ size_t out_addr_end = out_addr_ofs + out_num * sizeof(elem->out_addr[0]);
+ size_t in_sg_ofs = QEMU_ALIGN_UP(out_addr_end, __alignof__(elem->in_sg[0]));
+ size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
+ size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);
+
+ assert(sz >= sizeof(VirtQueueElement));
+ elem = g_malloc(out_sg_end);
+ trace_virtqueue_alloc_element(elem, sz, in_num, out_num);
+ elem->out_num = out_num;
+ elem->in_num = in_num;
+ elem->in_addr = (void *)elem + in_addr_ofs;
+ elem->out_addr = (void *)elem + out_addr_ofs;
+ elem->in_sg = (void *)elem + in_sg_ofs;
+ elem->out_sg = (void *)elem + out_sg_ofs;
+ return elem;
+}
+
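+/*
+ * Pop the next available element from a split ring: read the (possibly
+ * indirect) descriptor chain, map all guest buffers, and return a freshly
+ * allocated VirtQueueElement, or NULL if the ring is empty or broken.
+ */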
+static void *virtqueue_split_pop(VirtQueue *vq, size_t sz)
+{
+ unsigned int i, head, max;
+ VRingMemoryRegionCaches *caches;
+ MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
+ MemoryRegionCache *desc_cache;
+ int64_t len;
+ VirtIODevice *vdev = vq->vdev;
+ VirtQueueElement *elem = NULL;
+ unsigned out_num, in_num, elem_entries;
+ hwaddr addr[VIRTQUEUE_MAX_SIZE];
+ struct iovec iov[VIRTQUEUE_MAX_SIZE];
+ VRingDesc desc;
+ int rc;
+
+ RCU_READ_LOCK_GUARD();
+ if (virtio_queue_empty_rcu(vq)) {
+ goto done;
+ }
+ /* Needed after virtio_queue_empty(), see comment in
+ * virtqueue_num_heads(). */
+ smp_rmb();
+
+ /* When we start there is neither input nor output. */
+ out_num = in_num = elem_entries = 0;
+
+ max = vq->vring.num;
+
+ if (vq->inuse >= vq->vring.num) {
+ virtio_error(vdev, "Virtqueue size exceeded");
+ goto done;
+ }
+
+ if (!virtqueue_get_head(vq, vq->last_avail_idx++, &head)) {
+ goto done;
+ }
+
+ if (virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
+ vring_set_avail_event(vq, vq->last_avail_idx);
+ }
+
+ i = head;
+
+ caches = vring_get_region_caches(vq);
+ if (!caches) {
+ virtio_error(vdev, "Region caches not initialized");
+ goto done;
+ }
+
+ if (caches->desc.len < max * sizeof(VRingDesc)) {
+ virtio_error(vdev, "Cannot map descriptor ring");
+ goto done;
+ }
+
+ desc_cache = &caches->desc;
+ vring_split_desc_read(vdev, &desc, desc_cache, i);
+ if (desc.flags & VRING_DESC_F_INDIRECT) {
+ if (!desc.len || (desc.len % sizeof(VRingDesc))) {
+ virtio_error(vdev, "Invalid size for indirect buffer table");
+ goto done;
+ }
+
+ /* loop over the indirect descriptor table */
+ len = address_space_cache_init(&indirect_desc_cache, vdev->dma_as,
+ desc.addr, desc.len, false);
+ desc_cache = &indirect_desc_cache;
+ if (len < desc.len) {
+ virtio_error(vdev, "Cannot map indirect buffer");
+ goto done;
+ }
+
+ max = desc.len / sizeof(VRingDesc);
+ i = 0;
+ vring_split_desc_read(vdev, &desc, desc_cache, i);
+ }
+
+ /* Collect all the descriptors */
+ do {
+ bool map_ok;
+
+ if (desc.flags & VRING_DESC_F_WRITE) {
+ map_ok = virtqueue_map_desc(vdev, &in_num, addr + out_num,
+ iov + out_num,
+ VIRTQUEUE_MAX_SIZE - out_num, true,
+ desc.addr, desc.len);
+ } else {
+ if (in_num) {
+ virtio_error(vdev, "Incorrect order for descriptors");
+ goto err_undo_map;
+ }
+ map_ok = virtqueue_map_desc(vdev, &out_num, addr, iov,
+ VIRTQUEUE_MAX_SIZE, false,
+ desc.addr, desc.len);
+ }
+ if (!map_ok) {
+ goto err_undo_map;
+ }
+
+ /* If we've got too many, that implies a descriptor loop. */
+ if (++elem_entries > max) {
+ virtio_error(vdev, "Looped descriptor");
+ goto err_undo_map;
+ }
+
+ rc = virtqueue_split_read_next_desc(vdev, &desc, desc_cache, max, &i);
+ } while (rc == VIRTQUEUE_READ_DESC_MORE);
+
+ if (rc == VIRTQUEUE_READ_DESC_ERROR) {
+ goto err_undo_map;
+ }
+
+ /* Now copy what we have collected and mapped */
+ elem = virtqueue_alloc_element(sz, out_num, in_num);
+ elem->index = head;
+ elem->ndescs = 1;
+ for (i = 0; i < out_num; i++) {
+ elem->out_addr[i] = addr[i];
+ elem->out_sg[i] = iov[i];
+ }
+ for (i = 0; i < in_num; i++) {
+ elem->in_addr[i] = addr[out_num + i];
+ elem->in_sg[i] = iov[out_num + i];
+ }
+
+ vq->inuse++;
+
+ trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num);
+done:
+ address_space_cache_destroy(&indirect_desc_cache);
+
+ return elem;
+
+err_undo_map:
+ virtqueue_undo_map_desc(out_num, in_num, iov);
+ goto done;
+}
+
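+/*
+ * Packed-ring counterpart of virtqueue_split_pop(). The buffer id comes
+ * from the descriptor itself, and last_avail_idx wraps modulo vring.num
+ * with its wrap counter toggled.
+ */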
+static void *virtqueue_packed_pop(VirtQueue *vq, size_t sz)
+{
+ unsigned int i, max;
+ VRingMemoryRegionCaches *caches;
+ MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
+ MemoryRegionCache *desc_cache;
+ int64_t len;
+ VirtIODevice *vdev = vq->vdev;
+ VirtQueueElement *elem = NULL;
+ unsigned out_num, in_num, elem_entries;
+ hwaddr addr[VIRTQUEUE_MAX_SIZE];
+ struct iovec iov[VIRTQUEUE_MAX_SIZE];
+ VRingPackedDesc desc;
+ uint16_t id;
+ int rc;
+
+ RCU_READ_LOCK_GUARD();
+ if (virtio_queue_packed_empty_rcu(vq)) {
+ goto done;
+ }
+
+ /* When we start there is neither input nor output. */
+ out_num = in_num = elem_entries = 0;
+
+ max = vq->vring.num;
+
+ if (vq->inuse >= vq->vring.num) {
+ virtio_error(vdev, "Virtqueue size exceeded");
+ goto done;
+ }
+
+ i = vq->last_avail_idx;
+
+ caches = vring_get_region_caches(vq);
+ if (!caches) {
+ virtio_error(vdev, "Region caches not initialized");
+ goto done;
+ }
+
+ if (caches->desc.len < max * sizeof(VRingDesc)) {
+ virtio_error(vdev, "Cannot map descriptor ring");
+ goto done;
+ }
+
+ desc_cache = &caches->desc;
+ vring_packed_desc_read(vdev, &desc, desc_cache, i, true);
+ id = desc.id;
+ if (desc.flags & VRING_DESC_F_INDIRECT) {
+ if (desc.len % sizeof(VRingPackedDesc)) {
+ virtio_error(vdev, "Invalid size for indirect buffer table");
+ goto done;
+ }
+
+ /* loop over the indirect descriptor table */
+ len = address_space_cache_init(&indirect_desc_cache, vdev->dma_as,
+ desc.addr, desc.len, false);
+ desc_cache = &indirect_desc_cache;
+ if (len < desc.len) {
+ virtio_error(vdev, "Cannot map indirect buffer");
+ goto done;
+ }
+
+ max = desc.len / sizeof(VRingPackedDesc);
+ i = 0;
+ vring_packed_desc_read(vdev, &desc, desc_cache, i, false);
+ }
+
+ /* Collect all the descriptors */
+ do {
+ bool map_ok;
+
+ if (desc.flags & VRING_DESC_F_WRITE) {
+ map_ok = virtqueue_map_desc(vdev, &in_num, addr + out_num,
+ iov + out_num,
+ VIRTQUEUE_MAX_SIZE - out_num, true,
+ desc.addr, desc.len);
+ } else {
+ if (in_num) {
+ virtio_error(vdev, "Incorrect order for descriptors");
+ goto err_undo_map;
+ }
+ map_ok = virtqueue_map_desc(vdev, &out_num, addr, iov,
+ VIRTQUEUE_MAX_SIZE, false,
+ desc.addr, desc.len);
+ }
+ if (!map_ok) {
+ goto err_undo_map;
+ }
+
+ /* If we've got too many, that implies a descriptor loop. */
+ if (++elem_entries > max) {
+ virtio_error(vdev, "Looped descriptor");
+ goto err_undo_map;
+ }
+
+ rc = virtqueue_packed_read_next_desc(vq, &desc, desc_cache, max, &i,
+ desc_cache ==
+ &indirect_desc_cache);
+ } while (rc == VIRTQUEUE_READ_DESC_MORE);
+
+ /* Now copy what we have collected and mapped */
+ elem = virtqueue_alloc_element(sz, out_num, in_num);
+ for (i = 0; i < out_num; i++) {
+ elem->out_addr[i] = addr[i];
+ elem->out_sg[i] = iov[i];
+ }
+ for (i = 0; i < in_num; i++) {
+ elem->in_addr[i] = addr[out_num + i];
+ elem->in_sg[i] = iov[out_num + i];
+ }
+
+ elem->index = id;
+ elem->ndescs = (desc_cache == &indirect_desc_cache) ? 1 : elem_entries;
+ vq->last_avail_idx += elem->ndescs;
+ vq->inuse += elem->ndescs;
+
+ if (vq->last_avail_idx >= vq->vring.num) {
+ vq->last_avail_idx -= vq->vring.num;
+ vq->last_avail_wrap_counter ^= 1;
+ }
+
+ vq->shadow_avail_idx = vq->last_avail_idx;
+ vq->shadow_avail_wrap_counter = vq->last_avail_wrap_counter;
+
+ trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num);
+done:
+ address_space_cache_destroy(&indirect_desc_cache);
+
+ return elem;
+
+err_undo_map:
+ virtqueue_undo_map_desc(out_num, in_num, iov);
+ goto done;
+}
+
+void *virtqueue_pop(VirtQueue *vq, size_t sz)
+{
+ if (virtio_device_disabled(vq->vdev)) {
+ return NULL;
+ }
+
+ if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) {
+ return virtqueue_packed_pop(vq, sz);
+ } else {
+ return virtqueue_split_pop(vq, sz);
+ }
+}
+
+static unsigned int virtqueue_packed_drop_all(VirtQueue *vq)
+{
+ VRingMemoryRegionCaches *caches;
+ MemoryRegionCache *desc_cache;
+ unsigned int dropped = 0;
+ VirtQueueElement elem = {};
+ VirtIODevice *vdev = vq->vdev;
+ VRingPackedDesc desc;
+
+ RCU_READ_LOCK_GUARD();
+
+ caches = vring_get_region_caches(vq);
+ if (!caches) {
+ return 0;
+ }
+
+ desc_cache = &caches->desc;
+
+ virtio_queue_set_notification(vq, 0);
+
+ while (vq->inuse < vq->vring.num) {
+ unsigned int idx = vq->last_avail_idx;
+ /*
+ * Works similarly to virtqueue_pop() but does not map buffers
+ * and does not allocate any memory.
+ */
+ vring_packed_desc_read(vdev, &desc, desc_cache,
+ vq->last_avail_idx, true);
+ if (!is_desc_avail(desc.flags, vq->last_avail_wrap_counter)) {
+ break;
+ }
+ elem.index = desc.id;
+ elem.ndescs = 1;
+ while (virtqueue_packed_read_next_desc(vq, &desc, desc_cache,
+ vq->vring.num, &idx, false)) {
+ ++elem.ndescs;
+ }
+ /*
+ * Immediately push the element; there is nothing to unmap
+ * as both in_num and out_num are set to 0.
+ */
+ virtqueue_push(vq, &elem, 0);
+ dropped++;
+ vq->last_avail_idx += elem.ndescs;
+ if (vq->last_avail_idx >= vq->vring.num) {
+ vq->last_avail_idx -= vq->vring.num;
+ vq->last_avail_wrap_counter ^= 1;
+ }
+ }
+
+ return dropped;
+}
+
+static unsigned int virtqueue_split_drop_all(VirtQueue *vq)
+{
+ unsigned int dropped = 0;
+ VirtQueueElement elem = {};
+ VirtIODevice *vdev = vq->vdev;
+ bool fEventIdx = virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
+
+ while (!virtio_queue_empty(vq) && vq->inuse < vq->vring.num) {
+ /* Works similarly to virtqueue_pop() but does not map buffers
+ * and does not allocate any memory */
+ smp_rmb();
+ if (!virtqueue_get_head(vq, vq->last_avail_idx, &elem.index)) {
+ break;
+ }
+ vq->inuse++;
+ vq->last_avail_idx++;
+ if (fEventIdx) {
+ vring_set_avail_event(vq, vq->last_avail_idx);
+ }
+ /* Immediately push the element; there is nothing to unmap
+ * as both in_num and out_num are set to 0 */
+ virtqueue_push(vq, &elem, 0);
+ dropped++;
+ }
+
+ return dropped;
+}
+
+/* virtqueue_drop_all:
+ * @vq: The #VirtQueue
+ * Drops all queued buffers and indicates them to the guest
+ * as if they are done. Useful when buffers cannot be
+ * processed but must be returned to the guest.
+ */
+unsigned int virtqueue_drop_all(VirtQueue *vq)
+{
+ struct VirtIODevice *vdev = vq->vdev;
+
+ if (virtio_device_disabled(vq->vdev)) {
+ return 0;
+ }
+
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) {
+ return virtqueue_packed_drop_all(vq);
+ } else {
+ return virtqueue_split_drop_all(vq);
+ }
+}
+
+/* Reading and writing a structure directly to QEMUFile is *awful*, but
+ * it is what QEMU has always done by mistake. We can change it sooner
+ * or later by bumping the version number of the affected vm states.
+ * In the meantime, since the in-memory layout of VirtQueueElement
+ * has changed, we need to marshal to and from the layout that was
+ * used before the change.
+ */
+typedef struct VirtQueueElementOld {
+ unsigned int index;
+ unsigned int out_num;
+ unsigned int in_num;
+ hwaddr in_addr[VIRTQUEUE_MAX_SIZE];
+ hwaddr out_addr[VIRTQUEUE_MAX_SIZE];
+ struct iovec in_sg[VIRTQUEUE_MAX_SIZE];
+ struct iovec out_sg[VIRTQUEUE_MAX_SIZE];
+} VirtQueueElementOld;
+
+void *qemu_get_virtqueue_element(VirtIODevice *vdev, QEMUFile *f, size_t sz)
+{
+ VirtQueueElement *elem;
+ VirtQueueElementOld data;
+ int i;
+
+ qemu_get_buffer(f, (uint8_t *)&data, sizeof(VirtQueueElementOld));
+
+ /* TODO: teach all callers that this can fail, and return failure instead
+ * of asserting here.
+ * This is just one thing (there are probably more) that must be
+ * fixed before we can allow NDEBUG compilation.
+ */
+ assert(ARRAY_SIZE(data.in_addr) >= data.in_num);
+ assert(ARRAY_SIZE(data.out_addr) >= data.out_num);
+
+ elem = virtqueue_alloc_element(sz, data.out_num, data.in_num);
+ elem->index = data.index;
+
+ for (i = 0; i < elem->in_num; i++) {
+ elem->in_addr[i] = data.in_addr[i];
+ }
+
+ for (i = 0; i < elem->out_num; i++) {
+ elem->out_addr[i] = data.out_addr[i];
+ }
+
+ for (i = 0; i < elem->in_num; i++) {
+ /* Base is overwritten by virtqueue_map. */
+ elem->in_sg[i].iov_base = 0;
+ elem->in_sg[i].iov_len = data.in_sg[i].iov_len;
+ }
+
+ for (i = 0; i < elem->out_num; i++) {
+ /* Base is overwritten by virtqueue_map. */
+ elem->out_sg[i].iov_base = 0;
+ elem->out_sg[i].iov_len = data.out_sg[i].iov_len;
+ }
+
+ if (virtio_host_has_feature(vdev, VIRTIO_F_RING_PACKED)) {
+ qemu_get_be32s(f, &elem->ndescs);
+ }
+
+ virtqueue_map(vdev, elem);
+ return elem;
+}
+
+void qemu_put_virtqueue_element(VirtIODevice *vdev, QEMUFile *f,
+ VirtQueueElement *elem)
+{
+ VirtQueueElementOld data;
+ int i;
+
+ memset(&data, 0, sizeof(data));
+ data.index = elem->index;
+ data.in_num = elem->in_num;
+ data.out_num = elem->out_num;
+
+ for (i = 0; i < elem->in_num; i++) {
+ data.in_addr[i] = elem->in_addr[i];
+ }
+
+ for (i = 0; i < elem->out_num; i++) {
+ data.out_addr[i] = elem->out_addr[i];
+ }
+
+ for (i = 0; i < elem->in_num; i++) {
+ /* Base is overwritten by virtqueue_map when loading. Do not
+ * save it, as it would leak the QEMU address space layout. */
+ data.in_sg[i].iov_len = elem->in_sg[i].iov_len;
+ }
+
+ for (i = 0; i < elem->out_num; i++) {
+ /* Do not save iov_base as above. */
+ data.out_sg[i].iov_len = elem->out_sg[i].iov_len;
+ }
+
+ if (virtio_host_has_feature(vdev, VIRTIO_F_RING_PACKED)) {
+ qemu_put_be32s(f, &elem->ndescs);
+ }
+
+ qemu_put_buffer(f, (uint8_t *)&data, sizeof(VirtQueueElementOld));
+}
+
+/* virtio device */
+static void virtio_notify_vector(VirtIODevice *vdev, uint16_t vector)
+{
+ BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+
+ if (virtio_device_disabled(vdev)) {
+ return;
+ }
+
+ if (k->notify) {
+ k->notify(qbus->parent, vector);
+ }
+}
+
+void virtio_update_irq(VirtIODevice *vdev)
+{
+ virtio_notify_vector(vdev, VIRTIO_NO_VECTOR);
+}
+
+static int virtio_validate_features(VirtIODevice *vdev)
+{
+ VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+
+ if (virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM) &&
+ !virtio_vdev_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM)) {
+ return -EFAULT;
+ }
+
+ if (k->validate_features) {
+ return k->validate_features(vdev);
+ } else {
+ return 0;
+ }
+}
+
+int virtio_set_status(VirtIODevice *vdev, uint8_t val)
+{
+ VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+ trace_virtio_set_status(vdev, val);
+
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+ if (!(vdev->status & VIRTIO_CONFIG_S_FEATURES_OK) &&
+ val & VIRTIO_CONFIG_S_FEATURES_OK) {
+ int ret = virtio_validate_features(vdev);
+
+ if (ret) {
+ return ret;
+ }
+ }
+ }
+
+ if ((vdev->status & VIRTIO_CONFIG_S_DRIVER_OK) !=
+ (val & VIRTIO_CONFIG_S_DRIVER_OK)) {
+ virtio_set_started(vdev, val & VIRTIO_CONFIG_S_DRIVER_OK);
+ }
+
+ if (k->set_status) {
+ k->set_status(vdev, val);
+ }
+ vdev->status = val;
+
+ return 0;
+}
+
+static enum virtio_device_endian virtio_default_endian(void)
+{
+ if (target_words_bigendian()) {
+ return VIRTIO_DEVICE_ENDIAN_BIG;
+ } else {
+ return VIRTIO_DEVICE_ENDIAN_LITTLE;
+ }
+}
+
+static enum virtio_device_endian virtio_current_cpu_endian(void)
+{
+ if (cpu_virtio_is_big_endian(current_cpu)) {
+ return VIRTIO_DEVICE_ENDIAN_BIG;
+ } else {
+ return VIRTIO_DEVICE_ENDIAN_LITTLE;
+ }
+}
+
+static void __virtio_queue_reset(VirtIODevice *vdev, uint32_t i)
+{
+ vdev->vq[i].vring.desc = 0;
+ vdev->vq[i].vring.avail = 0;
+ vdev->vq[i].vring.used = 0;
+ vdev->vq[i].last_avail_idx = 0;
+ vdev->vq[i].shadow_avail_idx = 0;
+ vdev->vq[i].used_idx = 0;
+ vdev->vq[i].last_avail_wrap_counter = true;
+ vdev->vq[i].shadow_avail_wrap_counter = true;
+ vdev->vq[i].used_wrap_counter = true;
+ virtio_queue_set_vector(vdev, i, VIRTIO_NO_VECTOR);
+ vdev->vq[i].signalled_used = 0;
+ vdev->vq[i].signalled_used_valid = false;
+ vdev->vq[i].notification = true;
+ vdev->vq[i].vring.num = vdev->vq[i].vring.num_default;
+ vdev->vq[i].inuse = 0;
+ virtio_virtqueue_reset_region_cache(&vdev->vq[i]);
+}
+
+void virtio_queue_reset(VirtIODevice *vdev, uint32_t queue_index)
+{
+ VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+
+ if (k->queue_reset) {
+ k->queue_reset(vdev, queue_index);
+ }
+
+ __virtio_queue_reset(vdev, queue_index);
+}
+
+void virtio_queue_enable(VirtIODevice *vdev, uint32_t queue_index)
+{
+ VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+
+ /*
+ * TODO: Seabios is currently out of spec and triggering this error.
+ * So this needs to be fixed in Seabios, then this can
+ * be re-enabled for new machine types only, and also after
+ * being converted to LOG_GUEST_ERROR.
+ *
+ if (!virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+ error_report("queue_enable is only suppported in devices of virtio "
+ "1.0 or later.");
+ }
+ */
+
+ if (k->queue_enable) {
+ k->queue_enable(vdev, queue_index);
+ }
+}
+
+void virtio_reset(void *opaque)
+{
+ VirtIODevice *vdev = opaque;
+ VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+ int i;
+
+ virtio_set_status(vdev, 0);
+ if (current_cpu) {
+ /* Guest initiated reset */
+ vdev->device_endian = virtio_current_cpu_endian();
+ } else {
+ /* System reset */
+ vdev->device_endian = virtio_default_endian();
+ }
+
+ if (k->reset) {
+ k->reset(vdev);
+ }
+
+ vdev->start_on_kick = false;
+ vdev->started = false;
+ vdev->broken = false;
+ vdev->guest_features = 0;
+ vdev->queue_sel = 0;
+ vdev->status = 0;
+ vdev->disabled = false;
+ qatomic_set(&vdev->isr, 0);
+ vdev->config_vector = VIRTIO_NO_VECTOR;
+ virtio_notify_vector(vdev, vdev->config_vector);
+
+ for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
+ __virtio_queue_reset(vdev, i);
+ }
+}
+
+uint32_t virtio_config_readb(VirtIODevice *vdev, uint32_t addr)
+{
+ VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+ uint8_t val;
+
+ if (addr + sizeof(val) > vdev->config_len) {
+ return (uint32_t)-1;
+ }
+
+ k->get_config(vdev, vdev->config);
+
+ val = ldub_p(vdev->config + addr);
+ return val;
+}
+
+uint32_t virtio_config_readw(VirtIODevice *vdev, uint32_t addr)
+{
+ VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+ uint16_t val;
+
+ if (addr + sizeof(val) > vdev->config_len) {
+ return (uint32_t)-1;
+ }
+
+ k->get_config(vdev, vdev->config);
+
+ val = lduw_p(vdev->config + addr);
+ return val;
+}
+
+uint32_t virtio_config_readl(VirtIODevice *vdev, uint32_t addr)
+{
+ VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+ uint32_t val;
+
+ if (addr + sizeof(val) > vdev->config_len) {
+ return (uint32_t)-1;
+ }
+
+ k->get_config(vdev, vdev->config);
+
+ val = ldl_p(vdev->config + addr);
+ return val;
+}
+
+void virtio_config_writeb(VirtIODevice *vdev, uint32_t addr, uint32_t data)
+{
+ VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+ uint8_t val = data;
+
+ if (addr + sizeof(val) > vdev->config_len) {
+ return;
+ }
+
+ stb_p(vdev->config + addr, val);
+
+ if (k->set_config) {
+ k->set_config(vdev, vdev->config);
+ }
+}
+
+void virtio_config_writew(VirtIODevice *vdev, uint32_t addr, uint32_t data)
+{
+ VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+ uint16_t val = data;
+
+ if (addr + sizeof(val) > vdev->config_len) {
+ return;
+ }
+
+ stw_p(vdev->config + addr, val);
+
+ if (k->set_config) {
+ k->set_config(vdev, vdev->config);
+ }
+}
+
+void virtio_config_writel(VirtIODevice *vdev, uint32_t addr, uint32_t data)
+{
+ VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+ uint32_t val = data;
+
+ if (addr + sizeof(val) > vdev->config_len) {
+ return;
+ }
+
+ stl_p(vdev->config + addr, val);
+
+ if (k->set_config) {
+ k->set_config(vdev, vdev->config);
+ }
+}
+
+uint32_t virtio_config_modern_readb(VirtIODevice *vdev, uint32_t addr)
+{
+ VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+ uint8_t val;
+
+ if (addr + sizeof(val) > vdev->config_len) {
+ return (uint32_t)-1;
+ }
+
+ k->get_config(vdev, vdev->config);
+
+ val = ldub_p(vdev->config + addr);
+ return val;
+}
+
+uint32_t virtio_config_modern_readw(VirtIODevice *vdev, uint32_t addr)
+{
+ VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+ uint16_t val;
+
+ if (addr + sizeof(val) > vdev->config_len) {
+ return (uint32_t)-1;
+ }
+
+ k->get_config(vdev, vdev->config);
+
+ val = lduw_le_p(vdev->config + addr);
+ return val;
+}
+
+uint32_t virtio_config_modern_readl(VirtIODevice *vdev, uint32_t addr)
+{
+ VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+ uint32_t val;
+
+ if (addr + sizeof(val) > vdev->config_len) {
+ return (uint32_t)-1;
+ }
+
+ k->get_config(vdev, vdev->config);
+
+ val = ldl_le_p(vdev->config + addr);
+ return val;
+}
+
+void virtio_config_modern_writeb(VirtIODevice *vdev,
+ uint32_t addr, uint32_t data)
+{
+ VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+ uint8_t val = data;
+
+ if (addr + sizeof(val) > vdev->config_len) {
+ return;
+ }
+
+ stb_p(vdev->config + addr, val);
+
+ if (k->set_config) {
+ k->set_config(vdev, vdev->config);
+ }
+}
+
+void virtio_config_modern_writew(VirtIODevice *vdev,
+ uint32_t addr, uint32_t data)
+{
+ VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+ uint16_t val = data;
+
+ if (addr + sizeof(val) > vdev->config_len) {
+ return;
+ }
+
+ stw_le_p(vdev->config + addr, val);
+
+ if (k->set_config) {
+ k->set_config(vdev, vdev->config);
+ }
+}
+
+void virtio_config_modern_writel(VirtIODevice *vdev,
+ uint32_t addr, uint32_t data)
+{
+ VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+ uint32_t val = data;
+
+ if (addr + sizeof(val) > vdev->config_len) {
+ return;
+ }
+
+ stl_le_p(vdev->config + addr, val);
+
+ if (k->set_config) {
+ k->set_config(vdev, vdev->config);
+ }
+}
+
+void virtio_queue_set_addr(VirtIODevice *vdev, int n, hwaddr addr)
+{
+ if (!vdev->vq[n].vring.num) {
+ return;
+ }
+ vdev->vq[n].vring.desc = addr;
+ virtio_queue_update_rings(vdev, n);
+}
+
+hwaddr virtio_queue_get_addr(VirtIODevice *vdev, int n)
+{
+ return vdev->vq[n].vring.desc;
+}
+
+void virtio_queue_set_rings(VirtIODevice *vdev, int n, hwaddr desc,
+ hwaddr avail, hwaddr used)
+{
+ if (!vdev->vq[n].vring.num) {
+ return;
+ }
+ vdev->vq[n].vring.desc = desc;
+ vdev->vq[n].vring.avail = avail;
+ vdev->vq[n].vring.used = used;
+ virtio_init_region_cache(vdev, n);
+}
+
+void virtio_queue_set_num(VirtIODevice *vdev, int n, int num)
+{
+ /* Don't allow guest to flip queue between existent and
+ * nonexistent states, or to set it to an invalid size.
+ */
+ if (!!num != !!vdev->vq[n].vring.num ||
+ num > VIRTQUEUE_MAX_SIZE ||
+ num < 0) {
+ return;
+ }
+ vdev->vq[n].vring.num = num;
+}
+
+VirtQueue *virtio_vector_first_queue(VirtIODevice *vdev, uint16_t vector)
+{
+ return QLIST_FIRST(&vdev->vector_queues[vector]);
+}
+
+VirtQueue *virtio_vector_next_queue(VirtQueue *vq)
+{
+ return QLIST_NEXT(vq, node);
+}
+
+int virtio_queue_get_num(VirtIODevice *vdev, int n)
+{
+ return vdev->vq[n].vring.num;
+}
+
+int virtio_queue_get_max_num(VirtIODevice *vdev, int n)
+{
+ return vdev->vq[n].vring.num_default;
+}
+
+int virtio_get_num_queues(VirtIODevice *vdev)
+{
+ int i;
+
+ for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
+ if (!virtio_queue_get_num(vdev, i)) {
+ break;
+ }
+ }
+
+ return i;
+}
+
+void virtio_queue_set_align(VirtIODevice *vdev, int n, int align)
+{
+ BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+
+ /* virtio-1 compliant devices cannot change the alignment */
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+ error_report("tried to modify queue alignment for virtio-1 device");
+ return;
+ }
+ /* Check that the transport told us it was going to do this
+ * (so a buggy transport will immediately assert rather than
+ * silently failing to migrate this state)
+ */
+ assert(k->has_variable_vring_alignment);
+
+ if (align) {
+ vdev->vq[n].vring.align = align;
+ virtio_queue_update_rings(vdev, n);
+ }
+}
+
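+/*
+ * Run the virtqueue handler directly. This is the host-notifier path:
+ * the caller (e.g. virtio_queue_host_notifier_read()) has already
+ * consumed the event notification, so only the handler needs to run.
+ */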
+static void virtio_queue_notify_vq(VirtQueue *vq)
+{
+ if (vq->vring.desc && vq->handle_output) {
+ VirtIODevice *vdev = vq->vdev;
+
+ if (unlikely(vdev->broken)) {
+ return;
+ }
+
+ trace_virtio_queue_notify(vdev, vq - vdev->vq, vq);
+ vq->handle_output(vdev, vq);
+
+ if (unlikely(vdev->start_on_kick)) {
+ virtio_set_started(vdev, true);
+ }
+ }
+}
+
+void virtio_queue_notify(VirtIODevice *vdev, int n)
+{
+ VirtQueue *vq = &vdev->vq[n];
+
+ if (unlikely(!vq->vring.desc || vdev->broken)) {
+ return;
+ }
+
+ trace_virtio_queue_notify(vdev, vq - vdev->vq, vq);
+ if (vq->host_notifier_enabled) {
+ event_notifier_set(&vq->host_notifier);
+ } else if (vq->handle_output) {
+ vq->handle_output(vdev, vq);
+
+ if (unlikely(vdev->start_on_kick)) {
+ virtio_set_started(vdev, true);
+ }
+ }
+}
+
+uint16_t virtio_queue_vector(VirtIODevice *vdev, int n)
+{
+ return n < VIRTIO_QUEUE_MAX ? vdev->vq[n].vector :
+ VIRTIO_NO_VECTOR;
+}
+
+void virtio_queue_set_vector(VirtIODevice *vdev, int n, uint16_t vector)
+{
+ VirtQueue *vq = &vdev->vq[n];
+
+ if (n < VIRTIO_QUEUE_MAX) {
+ if (vdev->vector_queues &&
+ vdev->vq[n].vector != VIRTIO_NO_VECTOR) {
+ QLIST_REMOVE(vq, node);
+ }
+ vdev->vq[n].vector = vector;
+ if (vdev->vector_queues &&
+ vector != VIRTIO_NO_VECTOR) {
+ QLIST_INSERT_HEAD(&vdev->vector_queues[vector], vq, node);
+ }
+ }
+}
+
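+/*
+ * Allocate the first free virtqueue slot. Typical use from a device's
+ * realize function (a sketch; field and handler names illustrative):
+ *
+ *     vdev->rx_vq = virtio_add_queue(vdev, 256, handle_rx);
+ *
+ * queue_size must not exceed VIRTQUEUE_MAX_SIZE, or the call aborts.
+ */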
+VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
+ VirtIOHandleOutput handle_output)
+{
+ int i;
+
+ for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
+        if (vdev->vq[i].vring.num == 0) {
+            break;
+        }
+    }
+
+    if (i == VIRTIO_QUEUE_MAX || queue_size > VIRTQUEUE_MAX_SIZE) {
+        abort();
+    }
+
+ vdev->vq[i].vring.num = queue_size;
+ vdev->vq[i].vring.num_default = queue_size;
+ vdev->vq[i].vring.align = VIRTIO_PCI_VRING_ALIGN;
+ vdev->vq[i].handle_output = handle_output;
+ vdev->vq[i].used_elems = g_new0(VirtQueueElement, queue_size);
+
+ return &vdev->vq[i];
+}
+
+void virtio_delete_queue(VirtQueue *vq)
+{
+ vq->vring.num = 0;
+ vq->vring.num_default = 0;
+ vq->handle_output = NULL;
+ g_free(vq->used_elems);
+ vq->used_elems = NULL;
+ virtio_virtqueue_reset_region_cache(vq);
+}
+
+void virtio_del_queue(VirtIODevice *vdev, int n)
+{
+ if (n < 0 || n >= VIRTIO_QUEUE_MAX) {
+ abort();
+ }
+
+ virtio_delete_queue(&vdev->vq[n]);
+}
+
+static void virtio_set_isr(VirtIODevice *vdev, int value)
+{
+ uint8_t old = qatomic_read(&vdev->isr);
+
+ /* Do not write ISR if it does not change, so that its cacheline remains
+ * shared in the common case where the guest does not read it.
+ */
+ if ((old & value) != value) {
+ qatomic_or(&vdev->isr, value);
+ }
+}
+
+/* Called within rcu_read_lock(). */
+static bool virtio_split_should_notify(VirtIODevice *vdev, VirtQueue *vq)
+{
+ uint16_t old, new;
+ bool v;
+ /* We need to expose used array entries before checking used event. */
+ smp_mb();
+    /* Always notify when the queue is empty, if the guest acknowledged the feature */
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
+ !vq->inuse && virtio_queue_empty(vq)) {
+ return true;
+ }
+
+ if (!virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
+ return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
+ }
+
+ v = vq->signalled_used_valid;
+ vq->signalled_used_valid = true;
+ old = vq->signalled_used;
+ new = vq->signalled_used = vq->used_idx;
+ return !v || vring_need_event(vring_get_used_event(vq), new, old);
+}
+
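+/*
+ * For packed rings, the driver's event suppression structure encodes the
+ * event offset in bits [14:0] of off_wrap and the expected wrap counter
+ * in bit 15. If our wrap counter differs from the event's, the event
+ * index refers to the previous lap of the ring, so shift it down by the
+ * ring size before handing it to vring_need_event().
+ */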
+static bool vring_packed_need_event(VirtQueue *vq, bool wrap,
+ uint16_t off_wrap, uint16_t new,
+ uint16_t old)
+{
+ int off = off_wrap & ~(1 << 15);
+
+ if (wrap != off_wrap >> 15) {
+ off -= vq->vring.num;
+ }
+
+ return vring_need_event(off, new, old);
+}
+
+/* Called within rcu_read_lock(). */
+static bool virtio_packed_should_notify(VirtIODevice *vdev, VirtQueue *vq)
+{
+ VRingPackedDescEvent e;
+ uint16_t old, new;
+ bool v;
+ VRingMemoryRegionCaches *caches;
+
+ caches = vring_get_region_caches(vq);
+ if (!caches) {
+ return false;
+ }
+
+ vring_packed_event_read(vdev, &caches->avail, &e);
+
+ old = vq->signalled_used;
+ new = vq->signalled_used = vq->used_idx;
+ v = vq->signalled_used_valid;
+ vq->signalled_used_valid = true;
+
+ if (e.flags == VRING_PACKED_EVENT_FLAG_DISABLE) {
+ return false;
+ } else if (e.flags == VRING_PACKED_EVENT_FLAG_ENABLE) {
+ return true;
+ }
+
+ return !v || vring_packed_need_event(vq, vq->used_wrap_counter,
+ e.off_wrap, new, old);
+}
+
+/* Called within rcu_read_lock(). */
+static bool virtio_should_notify(VirtIODevice *vdev, VirtQueue *vq)
+{
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) {
+ return virtio_packed_should_notify(vdev, vq);
+ } else {
+ return virtio_split_should_notify(vdev, vq);
+ }
+}
+
+void virtio_notify_irqfd(VirtIODevice *vdev, VirtQueue *vq)
+{
+ WITH_RCU_READ_LOCK_GUARD() {
+ if (!virtio_should_notify(vdev, vq)) {
+ return;
+ }
+ }
+
+ trace_virtio_notify_irqfd(vdev, vq);
+
+ /*
+     * virtio spec 1.0 says ISR bit 0 should be ignored with MSI, but
+     * Windows drivers included in virtio-win 1.8.0 (circa 2015) are
+     * incorrectly polling this bit during crashdump and hibernation
+     * in MSI mode, causing a hang if this bit is never updated.
+     * Recent releases of Windows do not really shut down, but rather
+     * log out and hibernate to make the next startup faster. Hence,
+     * this manifested as a more serious hang during shutdown with MSI
+     * enabled.
+     *
+     * The next driver release, from 2016, fixed this problem, so working
+     * around it is not a must, but it's easy to do, so let's do it here.
+ *
+ * Note: it's safe to update ISR from any thread as it was switched
+ * to an atomic operation.
+ */
+ virtio_set_isr(vq->vdev, 0x1);
+ event_notifier_set(&vq->guest_notifier);
+}
+
+static void virtio_irq(VirtQueue *vq)
+{
+ virtio_set_isr(vq->vdev, 0x1);
+ virtio_notify_vector(vq->vdev, vq->vector);
+}
+
+void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
+{
+ WITH_RCU_READ_LOCK_GUARD() {
+ if (!virtio_should_notify(vdev, vq)) {
+ return;
+ }
+ }
+
+ trace_virtio_notify(vdev, vq);
+ virtio_irq(vq);
+}
+
+void virtio_notify_config(VirtIODevice *vdev)
+{
+    if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
+        return;
+    }
+
+ virtio_set_isr(vdev, 0x3);
+ vdev->generation++;
+ virtio_notify_vector(vdev, vdev->config_vector);
+}
+
+static bool virtio_device_endian_needed(void *opaque)
+{
+ VirtIODevice *vdev = opaque;
+
+ assert(vdev->device_endian != VIRTIO_DEVICE_ENDIAN_UNKNOWN);
+ if (!virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+ return vdev->device_endian != virtio_default_endian();
+ }
+ /* Devices conforming to VIRTIO 1.0 or later are always LE. */
+ return vdev->device_endian != VIRTIO_DEVICE_ENDIAN_LITTLE;
+}
+
+static bool virtio_64bit_features_needed(void *opaque)
+{
+ VirtIODevice *vdev = opaque;
+
+ return (vdev->host_features >> 32) != 0;
+}
+
+static bool virtio_virtqueue_needed(void *opaque)
+{
+ VirtIODevice *vdev = opaque;
+
+ return virtio_host_has_feature(vdev, VIRTIO_F_VERSION_1);
+}
+
+static bool virtio_packed_virtqueue_needed(void *opaque)
+{
+ VirtIODevice *vdev = opaque;
+
+ return virtio_host_has_feature(vdev, VIRTIO_F_RING_PACKED);
+}
+
+static bool virtio_ringsize_needed(void *opaque)
+{
+ VirtIODevice *vdev = opaque;
+ int i;
+
+ for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
+ if (vdev->vq[i].vring.num != vdev->vq[i].vring.num_default) {
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool virtio_extra_state_needed(void *opaque)
+{
+ VirtIODevice *vdev = opaque;
+ BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+
+ return k->has_extra_state &&
+ k->has_extra_state(qbus->parent);
+}
+
+static bool virtio_broken_needed(void *opaque)
+{
+ VirtIODevice *vdev = opaque;
+
+ return vdev->broken;
+}
+
+static bool virtio_started_needed(void *opaque)
+{
+ VirtIODevice *vdev = opaque;
+
+ return vdev->started;
+}
+
+static bool virtio_disabled_needed(void *opaque)
+{
+ VirtIODevice *vdev = opaque;
+
+ return vdev->disabled;
+}
+
+static const VMStateDescription vmstate_virtqueue = {
+ .name = "virtqueue_state",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT64(vring.avail, struct VirtQueue),
+ VMSTATE_UINT64(vring.used, struct VirtQueue),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription vmstate_packed_virtqueue = {
+ .name = "packed_virtqueue_state",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT16(last_avail_idx, struct VirtQueue),
+ VMSTATE_BOOL(last_avail_wrap_counter, struct VirtQueue),
+ VMSTATE_UINT16(used_idx, struct VirtQueue),
+ VMSTATE_BOOL(used_wrap_counter, struct VirtQueue),
+ VMSTATE_UINT32(inuse, struct VirtQueue),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription vmstate_virtio_virtqueues = {
+ .name = "virtio/virtqueues",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = &virtio_virtqueue_needed,
+ .fields = (VMStateField[]) {
+ VMSTATE_STRUCT_VARRAY_POINTER_KNOWN(vq, struct VirtIODevice,
+ VIRTIO_QUEUE_MAX, 0, vmstate_virtqueue, VirtQueue),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription vmstate_virtio_packed_virtqueues = {
+ .name = "virtio/packed_virtqueues",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = &virtio_packed_virtqueue_needed,
+ .fields = (VMStateField[]) {
+ VMSTATE_STRUCT_VARRAY_POINTER_KNOWN(vq, struct VirtIODevice,
+ VIRTIO_QUEUE_MAX, 0, vmstate_packed_virtqueue, VirtQueue),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription vmstate_ringsize = {
+ .name = "ringsize_state",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT32(vring.num_default, struct VirtQueue),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription vmstate_virtio_ringsize = {
+ .name = "virtio/ringsize",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = &virtio_ringsize_needed,
+ .fields = (VMStateField[]) {
+ VMSTATE_STRUCT_VARRAY_POINTER_KNOWN(vq, struct VirtIODevice,
+ VIRTIO_QUEUE_MAX, 0, vmstate_ringsize, VirtQueue),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static int get_extra_state(QEMUFile *f, void *pv, size_t size,
+ const VMStateField *field)
+{
+ VirtIODevice *vdev = pv;
+ BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+
+ if (!k->load_extra_state) {
+ return -1;
+ } else {
+ return k->load_extra_state(qbus->parent, f);
+ }
+}
+
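+/*
+ * Unlike get_extra_state(), no NULL check is done here: the
+ * "virtio/extra_state" subsection is only written when
+ * virtio_extra_state_needed() accepted it, i.e. when the transport
+ * reports extra state support.
+ */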
+static int put_extra_state(QEMUFile *f, void *pv, size_t size,
+ const VMStateField *field, JSONWriter *vmdesc)
+{
+ VirtIODevice *vdev = pv;
+ BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+
+ k->save_extra_state(qbus->parent, f);
+ return 0;
+}
+
+static const VMStateInfo vmstate_info_extra_state = {
+ .name = "virtqueue_extra_state",
+ .get = get_extra_state,
+ .put = put_extra_state,
+};
+
+static const VMStateDescription vmstate_virtio_extra_state = {
+ .name = "virtio/extra_state",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = &virtio_extra_state_needed,
+ .fields = (VMStateField[]) {
+ {
+ .name = "extra_state",
+ .version_id = 0,
+ .field_exists = NULL,
+ .size = 0,
+ .info = &vmstate_info_extra_state,
+ .flags = VMS_SINGLE,
+ .offset = 0,
+ },
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription vmstate_virtio_device_endian = {
+ .name = "virtio/device_endian",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = &virtio_device_endian_needed,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT8(device_endian, VirtIODevice),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription vmstate_virtio_64bit_features = {
+ .name = "virtio/64bit_features",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = &virtio_64bit_features_needed,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT64(guest_features, VirtIODevice),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription vmstate_virtio_broken = {
+ .name = "virtio/broken",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = &virtio_broken_needed,
+ .fields = (VMStateField[]) {
+ VMSTATE_BOOL(broken, VirtIODevice),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription vmstate_virtio_started = {
+ .name = "virtio/started",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = &virtio_started_needed,
+ .fields = (VMStateField[]) {
+ VMSTATE_BOOL(started, VirtIODevice),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription vmstate_virtio_disabled = {
+ .name = "virtio/disabled",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = &virtio_disabled_needed,
+ .fields = (VMStateField[]) {
+ VMSTATE_BOOL(disabled, VirtIODevice),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription vmstate_virtio = {
+ .name = "virtio",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_END_OF_LIST()
+ },
+ .subsections = (const VMStateDescription*[]) {
+ &vmstate_virtio_device_endian,
+ &vmstate_virtio_64bit_features,
+ &vmstate_virtio_virtqueues,
+ &vmstate_virtio_ringsize,
+ &vmstate_virtio_broken,
+ &vmstate_virtio_extra_state,
+ &vmstate_virtio_started,
+ &vmstate_virtio_packed_virtqueues,
+ &vmstate_virtio_disabled,
+ NULL
+ }
+};
+
+int virtio_save(VirtIODevice *vdev, QEMUFile *f)
+{
+ BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
+ uint32_t guest_features_lo = (vdev->guest_features & 0xffffffff);
+ int i;
+
+ if (k->save_config) {
+ k->save_config(qbus->parent, f);
+ }
+
+ qemu_put_8s(f, &vdev->status);
+ qemu_put_8s(f, &vdev->isr);
+ qemu_put_be16s(f, &vdev->queue_sel);
+ qemu_put_be32s(f, &guest_features_lo);
+ qemu_put_be32(f, vdev->config_len);
+ qemu_put_buffer(f, vdev->config, vdev->config_len);
+
+ for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
+        if (vdev->vq[i].vring.num == 0) {
+            break;
+        }
+ }
+
+ qemu_put_be32(f, i);
+
+ for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
+        if (vdev->vq[i].vring.num == 0) {
+            break;
+        }
+
+ qemu_put_be32(f, vdev->vq[i].vring.num);
+ if (k->has_variable_vring_alignment) {
+ qemu_put_be32(f, vdev->vq[i].vring.align);
+ }
+ /*
+         * Save desc now; the rest of the ring addresses are saved in
+         * subsections for VIRTIO-1 devices.
+ */
+ qemu_put_be64(f, vdev->vq[i].vring.desc);
+ qemu_put_be16s(f, &vdev->vq[i].last_avail_idx);
+ if (k->save_queue) {
+ k->save_queue(qbus->parent, i, f);
+ }
+ }
+
+ if (vdc->save != NULL) {
+ vdc->save(vdev, f);
+ }
+
+ if (vdc->vmsd) {
+ int ret = vmstate_save_state(f, vdc->vmsd, vdev, NULL);
+ if (ret) {
+ return ret;
+ }
+ }
+
+ /* Subsections */
+ return vmstate_save_state(f, &vmstate_virtio, vdev, NULL);
+}
+
+/* A wrapper for use as a VMState .put function */
+static int virtio_device_put(QEMUFile *f, void *opaque, size_t size,
+ const VMStateField *field, JSONWriter *vmdesc)
+{
+ return virtio_save(VIRTIO_DEVICE(opaque), f);
+}
+
+/* A wrapper for use as a VMState .get function */
+static int virtio_device_get(QEMUFile *f, void *opaque, size_t size,
+ const VMStateField *field)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(opaque);
+ DeviceClass *dc = DEVICE_CLASS(VIRTIO_DEVICE_GET_CLASS(vdev));
+
+ return virtio_load(vdev, f, dc->vmsd->version_id);
+}
+
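+/*
+ * Devices pull this in through the VMSTATE_VIRTIO_DEVICE macro, letting
+ * a device's own vmsd delegate the common virtio state to
+ * virtio_save()/virtio_load().
+ */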
+const VMStateInfo virtio_vmstate_info = {
+ .name = "virtio",
+ .get = virtio_device_get,
+ .put = virtio_device_put,
+};
+
+static int virtio_set_features_nocheck(VirtIODevice *vdev, uint64_t val)
+{
+ VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+ bool bad = (val & ~(vdev->host_features)) != 0;
+
+ val &= vdev->host_features;
+ if (k->set_features) {
+ k->set_features(vdev, val);
+ }
+ vdev->guest_features = val;
+ return bad ? -1 : 0;
+}
+
+int virtio_set_features(VirtIODevice *vdev, uint64_t val)
+{
+ int ret;
+ /*
+ * The driver must not attempt to set features after feature negotiation
+ * has finished.
+ */
+ if (vdev->status & VIRTIO_CONFIG_S_FEATURES_OK) {
+ return -EINVAL;
+ }
+
+ if (val & (1ull << VIRTIO_F_BAD_FEATURE)) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "%s: guest driver for %s has enabled UNUSED(30) feature bit!\n",
+ __func__, vdev->name);
+ }
+
+ ret = virtio_set_features_nocheck(vdev, val);
+ if (virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
+ /* VIRTIO_RING_F_EVENT_IDX changes the size of the caches. */
+ int i;
+ for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
+ if (vdev->vq[i].vring.num != 0) {
+ virtio_init_region_cache(vdev, i);
+ }
+ }
+ }
+ if (!ret) {
+ if (!virtio_device_started(vdev, vdev->status) &&
+ !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+ vdev->start_on_kick = true;
+ }
+ }
+ return ret;
+}
+
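+/*
+ * Compute the device config space size implied by the negotiated
+ * features. Illustrative setup (a sketch; the struct and feature names
+ * are hypothetical):
+ *
+ *     static const VirtIOFeature sizes[] = {
+ *         { .flags = 1ULL << SOME_FEATURE,
+ *           .end = endof(struct some_config, some_field) },
+ *         {}
+ *     };
+ *     static const VirtIOConfigSizeParams params = {
+ *         .min_size = SOME_MIN_SIZE,
+ *         .max_size = sizeof(struct some_config),
+ *         .feature_sizes = sizes,
+ *     };
+ */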
+size_t virtio_get_config_size(const VirtIOConfigSizeParams *params,
+ uint64_t host_features)
+{
+ size_t config_size = params->min_size;
+ const VirtIOFeature *feature_sizes = params->feature_sizes;
+ size_t i;
+
+ for (i = 0; feature_sizes[i].flags != 0; i++) {
+ if (host_features & feature_sizes[i].flags) {
+ config_size = MAX(feature_sizes[i].end, config_size);
+ }
+ }
+
+ assert(config_size <= params->max_size);
+ return config_size;
+}
+
+int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id)
+{
+ int i, ret;
+ int32_t config_len;
+ uint32_t num;
+ uint32_t features;
+ BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
+
+ /*
+ * We poison the endianness to ensure it does not get used before
+ * subsections have been loaded.
+ */
+ vdev->device_endian = VIRTIO_DEVICE_ENDIAN_UNKNOWN;
+
+ if (k->load_config) {
+ ret = k->load_config(qbus->parent, f);
+        if (ret) {
+            return ret;
+        }
+ }
+
+ qemu_get_8s(f, &vdev->status);
+ qemu_get_8s(f, &vdev->isr);
+ qemu_get_be16s(f, &vdev->queue_sel);
+ if (vdev->queue_sel >= VIRTIO_QUEUE_MAX) {
+ return -1;
+ }
+ qemu_get_be32s(f, &features);
+
+ /*
+ * Temporarily set guest_features low bits - needed by
+ * virtio net load code testing for VIRTIO_NET_F_CTRL_GUEST_OFFLOADS
+ * VIRTIO_NET_F_GUEST_ANNOUNCE and VIRTIO_NET_F_CTRL_VQ.
+ *
+     * Note: devices should always test host features in the future; don't
+     * create new dependencies like this.
+ */
+ vdev->guest_features = features;
+
+ config_len = qemu_get_be32(f);
+
+ /*
+     * There are cases where the incoming config can be bigger or smaller
+     * than what we have, so load what we have space for and skip
+     * any excess that's in the stream.
+ */
+ qemu_get_buffer(f, vdev->config, MIN(config_len, vdev->config_len));
+
+ while (config_len > vdev->config_len) {
+ qemu_get_byte(f);
+ config_len--;
+ }
+
+ num = qemu_get_be32(f);
+
+ if (num > VIRTIO_QUEUE_MAX) {
+ error_report("Invalid number of virtqueues: 0x%x", num);
+ return -1;
+ }
+
+ for (i = 0; i < num; i++) {
+ vdev->vq[i].vring.num = qemu_get_be32(f);
+ if (k->has_variable_vring_alignment) {
+ vdev->vq[i].vring.align = qemu_get_be32(f);
+ }
+ vdev->vq[i].vring.desc = qemu_get_be64(f);
+ qemu_get_be16s(f, &vdev->vq[i].last_avail_idx);
+ vdev->vq[i].signalled_used_valid = false;
+ vdev->vq[i].notification = true;
+
+ if (!vdev->vq[i].vring.desc && vdev->vq[i].last_avail_idx) {
+ error_report("VQ %d address 0x0 "
+ "inconsistent with Host index 0x%x",
+ i, vdev->vq[i].last_avail_idx);
+ return -1;
+ }
+ if (k->load_queue) {
+ ret = k->load_queue(qbus->parent, i, f);
+            if (ret) {
+                return ret;
+            }
+ }
+ }
+
+ virtio_notify_vector(vdev, VIRTIO_NO_VECTOR);
+
+ if (vdc->load != NULL) {
+ ret = vdc->load(vdev, f, version_id);
+ if (ret) {
+ return ret;
+ }
+ }
+
+ if (vdc->vmsd) {
+ ret = vmstate_load_state(f, vdc->vmsd, vdev, version_id);
+ if (ret) {
+ return ret;
+ }
+ }
+
+ /* Subsections */
+ ret = vmstate_load_state(f, &vmstate_virtio, vdev, 1);
+ if (ret) {
+ return ret;
+ }
+
+ if (vdev->device_endian == VIRTIO_DEVICE_ENDIAN_UNKNOWN) {
+ vdev->device_endian = virtio_default_endian();
+ }
+
+ if (virtio_64bit_features_needed(vdev)) {
+ /*
+ * Subsection load filled vdev->guest_features. Run them
+ * through virtio_set_features to sanity-check them against
+ * host_features.
+ */
+ uint64_t features64 = vdev->guest_features;
+ if (virtio_set_features_nocheck(vdev, features64) < 0) {
+ error_report("Features 0x%" PRIx64 " unsupported. "
+ "Allowed features: 0x%" PRIx64,
+ features64, vdev->host_features);
+ return -1;
+ }
+ } else {
+ if (virtio_set_features_nocheck(vdev, features) < 0) {
+ error_report("Features 0x%x unsupported. "
+ "Allowed features: 0x%" PRIx64,
+ features, vdev->host_features);
+ return -1;
+ }
+ }
+
+ if (!virtio_device_started(vdev, vdev->status) &&
+ !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+ vdev->start_on_kick = true;
+ }
+
+ RCU_READ_LOCK_GUARD();
+ for (i = 0; i < num; i++) {
+ if (vdev->vq[i].vring.desc) {
+ uint16_t nheads;
+
+ /*
+ * VIRTIO-1 devices migrate desc, used, and avail ring addresses so
+ * only the region cache needs to be set up. Legacy devices need
+ * to calculate used and avail ring addresses based on the desc
+ * address.
+ */
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+ virtio_init_region_cache(vdev, i);
+ } else {
+ virtio_queue_update_rings(vdev, i);
+ }
+
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) {
+ vdev->vq[i].shadow_avail_idx = vdev->vq[i].last_avail_idx;
+ vdev->vq[i].shadow_avail_wrap_counter =
+ vdev->vq[i].last_avail_wrap_counter;
+ continue;
+ }
+
+ nheads = vring_avail_idx(&vdev->vq[i]) - vdev->vq[i].last_avail_idx;
+ /* Check it isn't doing strange things with descriptor numbers. */
+ if (nheads > vdev->vq[i].vring.num) {
+ virtio_error(vdev, "VQ %d size 0x%x Guest index 0x%x "
+ "inconsistent with Host index 0x%x: delta 0x%x",
+ i, vdev->vq[i].vring.num,
+ vring_avail_idx(&vdev->vq[i]),
+ vdev->vq[i].last_avail_idx, nheads);
+ vdev->vq[i].used_idx = 0;
+ vdev->vq[i].shadow_avail_idx = 0;
+ vdev->vq[i].inuse = 0;
+ continue;
+ }
+ vdev->vq[i].used_idx = vring_used_idx(&vdev->vq[i]);
+ vdev->vq[i].shadow_avail_idx = vring_avail_idx(&vdev->vq[i]);
+
+ /*
+ * Some devices migrate VirtQueueElements that have been popped
+ * from the avail ring but not yet returned to the used ring.
+ * Since max ring size < UINT16_MAX it's safe to use modulo
+ * UINT16_MAX + 1 subtraction.
+ */
+ vdev->vq[i].inuse = (uint16_t)(vdev->vq[i].last_avail_idx -
+ vdev->vq[i].used_idx);
+ if (vdev->vq[i].inuse > vdev->vq[i].vring.num) {
+ error_report("VQ %d size 0x%x < last_avail_idx 0x%x - "
+ "used_idx 0x%x",
+ i, vdev->vq[i].vring.num,
+ vdev->vq[i].last_avail_idx,
+ vdev->vq[i].used_idx);
+ return -1;
+ }
+ }
+ }
+
+ if (vdc->post_load) {
+ ret = vdc->post_load(vdev);
+ if (ret) {
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+void virtio_cleanup(VirtIODevice *vdev)
+{
+ qemu_del_vm_change_state_handler(vdev->vmstate);
+}
+
+static void virtio_vmstate_change(void *opaque, bool running, RunState state)
+{
+ VirtIODevice *vdev = opaque;
+ BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+ bool backend_run = running && virtio_device_started(vdev, vdev->status);
+ vdev->vm_running = running;
+
+ if (backend_run) {
+ virtio_set_status(vdev, vdev->status);
+ }
+
+ if (k->vmstate_change) {
+ k->vmstate_change(qbus->parent, backend_run);
+ }
+
+ if (!backend_run) {
+ virtio_set_status(vdev, vdev->status);
+ }
+}
+
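+/*
+ * Helper for transport proxies that embed the VirtIODevice. Typical use
+ * from a proxy's instance_init (sketch):
+ *
+ *     virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ *                                 TYPE_VIRTIO_BLK);
+ *
+ * This parents the backend object under the proxy and aliases its
+ * properties onto the proxy device.
+ */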
+void virtio_instance_init_common(Object *proxy_obj, void *data,
+ size_t vdev_size, const char *vdev_name)
+{
+ DeviceState *vdev = data;
+
+ object_initialize_child_with_props(proxy_obj, "virtio-backend", vdev,
+ vdev_size, vdev_name, &error_abort,
+ NULL);
+ qdev_alias_all_properties(vdev, proxy_obj);
+}
+
+void virtio_init(VirtIODevice *vdev, uint16_t device_id, size_t config_size)
+{
+ BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+ int i;
+ int nvectors = k->query_nvectors ? k->query_nvectors(qbus->parent) : 0;
+
+ if (nvectors) {
+ vdev->vector_queues =
+ g_malloc0(sizeof(*vdev->vector_queues) * nvectors);
+ }
+
+ vdev->start_on_kick = false;
+ vdev->started = false;
+ vdev->vhost_started = false;
+ vdev->device_id = device_id;
+ vdev->status = 0;
+ qatomic_set(&vdev->isr, 0);
+ vdev->queue_sel = 0;
+ vdev->config_vector = VIRTIO_NO_VECTOR;
+ vdev->vq = g_new0(VirtQueue, VIRTIO_QUEUE_MAX);
+ vdev->vm_running = runstate_is_running();
+ vdev->broken = false;
+ for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
+ vdev->vq[i].vector = VIRTIO_NO_VECTOR;
+ vdev->vq[i].vdev = vdev;
+ vdev->vq[i].queue_index = i;
+ vdev->vq[i].host_notifier_enabled = false;
+ }
+
+ vdev->name = virtio_id_to_name(device_id);
+ vdev->config_len = config_size;
+ if (vdev->config_len) {
+ vdev->config = g_malloc0(config_size);
+ } else {
+ vdev->config = NULL;
+ }
+ vdev->vmstate = qdev_add_vm_change_state_handler(DEVICE(vdev),
+ virtio_vmstate_change, vdev);
+ vdev->device_endian = virtio_default_endian();
+ vdev->use_guest_notifier_mask = true;
+}
+
+/*
+ * Only devices that have already been around prior to defining the virtio
+ * standard support legacy mode; this includes devices not specified in the
+ * standard. All newer devices conform to the virtio standard only.
+ */
+bool virtio_legacy_allowed(VirtIODevice *vdev)
+{
+ switch (vdev->device_id) {
+ case VIRTIO_ID_NET:
+ case VIRTIO_ID_BLOCK:
+ case VIRTIO_ID_CONSOLE:
+ case VIRTIO_ID_RNG:
+ case VIRTIO_ID_BALLOON:
+ case VIRTIO_ID_RPMSG:
+ case VIRTIO_ID_SCSI:
+ case VIRTIO_ID_9P:
+ case VIRTIO_ID_RPROC_SERIAL:
+ case VIRTIO_ID_CAIF:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool virtio_legacy_check_disabled(VirtIODevice *vdev)
+{
+ return vdev->disable_legacy_check;
+}
+
+hwaddr virtio_queue_get_desc_addr(VirtIODevice *vdev, int n)
+{
+ return vdev->vq[n].vring.desc;
+}
+
+bool virtio_queue_enabled_legacy(VirtIODevice *vdev, int n)
+{
+ return virtio_queue_get_desc_addr(vdev, n) != 0;
+}
+
+bool virtio_queue_enabled(VirtIODevice *vdev, int n)
+{
+ BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+
+ if (k->queue_enabled) {
+ return k->queue_enabled(qbus->parent, n);
+ }
+ return virtio_queue_enabled_legacy(vdev, n);
+}
+
+hwaddr virtio_queue_get_avail_addr(VirtIODevice *vdev, int n)
+{
+ return vdev->vq[n].vring.avail;
+}
+
+hwaddr virtio_queue_get_used_addr(VirtIODevice *vdev, int n)
+{
+ return vdev->vq[n].vring.used;
+}
+
+hwaddr virtio_queue_get_desc_size(VirtIODevice *vdev, int n)
+{
+ return sizeof(VRingDesc) * vdev->vq[n].vring.num;
+}
+
+hwaddr virtio_queue_get_avail_size(VirtIODevice *vdev, int n)
+{
+ int s;
+
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) {
+ return sizeof(struct VRingPackedDescEvent);
+ }
+
+ s = virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
+ return offsetof(VRingAvail, ring) +
+ sizeof(uint16_t) * vdev->vq[n].vring.num + s;
+}
+
+hwaddr virtio_queue_get_used_size(VirtIODevice *vdev, int n)
+{
+ int s;
+
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) {
+ return sizeof(struct VRingPackedDescEvent);
+ }
+
+ s = virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
+ return offsetof(VRingUsed, ring) +
+ sizeof(VRingUsedElem) * vdev->vq[n].vring.num + s;
+}
+
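+/*
+ * For packed rings, the migrated "last avail idx" is a 32-bit composite:
+ * bits [14:0] hold last_avail_idx, bit 15 its wrap counter, bits [30:16]
+ * hold used_idx, and bit 31 the used wrap counter. The split-ring
+ * variant below simply returns the 16-bit index.
+ */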
+static unsigned int virtio_queue_packed_get_last_avail_idx(VirtIODevice *vdev,
+ int n)
+{
+ unsigned int avail, used;
+
+ avail = vdev->vq[n].last_avail_idx;
+ avail |= ((uint16_t)vdev->vq[n].last_avail_wrap_counter) << 15;
+
+ used = vdev->vq[n].used_idx;
+ used |= ((uint16_t)vdev->vq[n].used_wrap_counter) << 15;
+
+ return avail | used << 16;
+}
+
+static uint16_t virtio_queue_split_get_last_avail_idx(VirtIODevice *vdev,
+ int n)
+{
+ return vdev->vq[n].last_avail_idx;
+}
+
+unsigned int virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n)
+{
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) {
+ return virtio_queue_packed_get_last_avail_idx(vdev, n);
+ } else {
+ return virtio_queue_split_get_last_avail_idx(vdev, n);
+ }
+}
+
+static void virtio_queue_packed_set_last_avail_idx(VirtIODevice *vdev,
+ int n, unsigned int idx)
+{
+ struct VirtQueue *vq = &vdev->vq[n];
+
+ vq->last_avail_idx = vq->shadow_avail_idx = idx & 0x7fff;
+ vq->last_avail_wrap_counter =
+ vq->shadow_avail_wrap_counter = !!(idx & 0x8000);
+ idx >>= 16;
+    vq->used_idx = idx & 0x7fff;
+ vq->used_wrap_counter = !!(idx & 0x8000);
+}
+
+static void virtio_queue_split_set_last_avail_idx(VirtIODevice *vdev,
+ int n, unsigned int idx)
+{
+ vdev->vq[n].last_avail_idx = idx;
+ vdev->vq[n].shadow_avail_idx = idx;
+}
+
+void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n,
+ unsigned int idx)
+{
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) {
+ virtio_queue_packed_set_last_avail_idx(vdev, n, idx);
+ } else {
+ virtio_queue_split_set_last_avail_idx(vdev, n, idx);
+ }
+}
+
+static void virtio_queue_packed_restore_last_avail_idx(VirtIODevice *vdev,
+ int n)
+{
+    /* The packed ring keeps no used-ring index in guest memory to
+     * restore from, so there is nothing to do here. */
+ return;
+}
+
+static void virtio_queue_split_restore_last_avail_idx(VirtIODevice *vdev,
+ int n)
+{
+ RCU_READ_LOCK_GUARD();
+ if (vdev->vq[n].vring.desc) {
+ vdev->vq[n].last_avail_idx = vring_used_idx(&vdev->vq[n]);
+ vdev->vq[n].shadow_avail_idx = vdev->vq[n].last_avail_idx;
+ }
+}
+
+void virtio_queue_restore_last_avail_idx(VirtIODevice *vdev, int n)
+{
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) {
+ virtio_queue_packed_restore_last_avail_idx(vdev, n);
+ } else {
+ virtio_queue_split_restore_last_avail_idx(vdev, n);
+ }
+}
+
+static void virtio_queue_packed_update_used_idx(VirtIODevice *vdev, int n)
+{
+ /* used idx was updated through set_last_avail_idx() */
+ return;
+}
+
+static void virtio_queue_split_update_used_idx(VirtIODevice *vdev, int n)
+{
+ RCU_READ_LOCK_GUARD();
+ if (vdev->vq[n].vring.desc) {
+ vdev->vq[n].used_idx = vring_used_idx(&vdev->vq[n]);
+ }
+}
+
+void virtio_queue_update_used_idx(VirtIODevice *vdev, int n)
+{
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) {
+ return virtio_queue_packed_update_used_idx(vdev, n);
+ } else {
+        return virtio_queue_split_update_used_idx(vdev, n);
+ }
+}
+
+void virtio_queue_invalidate_signalled_used(VirtIODevice *vdev, int n)
+{
+ vdev->vq[n].signalled_used_valid = false;
+}
+
+VirtQueue *virtio_get_queue(VirtIODevice *vdev, int n)
+{
+ return vdev->vq + n;
+}
+
+uint16_t virtio_get_queue_index(VirtQueue *vq)
+{
+ return vq->queue_index;
+}
+
+static void virtio_queue_guest_notifier_read(EventNotifier *n)
+{
+ VirtQueue *vq = container_of(n, VirtQueue, guest_notifier);
+ if (event_notifier_test_and_clear(n)) {
+ virtio_irq(vq);
+ }
+}
+
+void virtio_queue_set_guest_notifier_fd_handler(VirtQueue *vq, bool assign,
+ bool with_irqfd)
+{
+ if (assign && !with_irqfd) {
+ event_notifier_set_handler(&vq->guest_notifier,
+ virtio_queue_guest_notifier_read);
+ } else {
+ event_notifier_set_handler(&vq->guest_notifier, NULL);
+ }
+ if (!assign) {
+        /* Test and clear the notifier before closing it,
+         * in case the poll callback didn't have time to run. */
+ virtio_queue_guest_notifier_read(&vq->guest_notifier);
+ }
+}
+
+EventNotifier *virtio_queue_get_guest_notifier(VirtQueue *vq)
+{
+ return &vq->guest_notifier;
+}
+
+static void virtio_queue_host_notifier_aio_poll_begin(EventNotifier *n)
+{
+ VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
+
+ virtio_queue_set_notification(vq, 0);
+}
+
+static bool virtio_queue_host_notifier_aio_poll(void *opaque)
+{
+ EventNotifier *n = opaque;
+ VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
+
+ return vq->vring.desc && !virtio_queue_empty(vq);
+}
+
+static void virtio_queue_host_notifier_aio_poll_ready(EventNotifier *n)
+{
+ VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
+
+ virtio_queue_notify_vq(vq);
+}
+
+static void virtio_queue_host_notifier_aio_poll_end(EventNotifier *n)
+{
+ VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
+
+ /* Caller polls once more after this to catch requests that race with us */
+ virtio_queue_set_notification(vq, 1);
+}
+
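+/*
+ * Attach the host notifier to an AioContext with polling enabled. While
+ * the AioContext polls, guest kicks are suppressed (poll_begin) and
+ * re-enabled once polling stops (poll_end), because the poll handler
+ * observes the ring directly.
+ */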
+void virtio_queue_aio_attach_host_notifier(VirtQueue *vq, AioContext *ctx)
+{
+ aio_set_event_notifier(ctx, &vq->host_notifier, true,
+ virtio_queue_host_notifier_read,
+ virtio_queue_host_notifier_aio_poll,
+ virtio_queue_host_notifier_aio_poll_ready);
+ aio_set_event_notifier_poll(ctx, &vq->host_notifier,
+ virtio_queue_host_notifier_aio_poll_begin,
+ virtio_queue_host_notifier_aio_poll_end);
+}
+
+/*
+ * Same as virtio_queue_aio_attach_host_notifier() but without polling. Use
+ * this for rx virtqueues and similar cases where the virtqueue handler
+ * function does not pop all elements. When the virtqueue is left non-empty
+ * polling consumes CPU cycles and should not be used.
+ */
+void virtio_queue_aio_attach_host_notifier_no_poll(VirtQueue *vq, AioContext *ctx)
+{
+ aio_set_event_notifier(ctx, &vq->host_notifier, true,
+ virtio_queue_host_notifier_read,
+ NULL, NULL);
+}
+
+void virtio_queue_aio_detach_host_notifier(VirtQueue *vq, AioContext *ctx)
+{
+ aio_set_event_notifier(ctx, &vq->host_notifier, true, NULL, NULL, NULL);
+    /* Test and clear the notifier after disabling the event,
+     * in case the poll callback didn't have time to run. */
+ virtio_queue_host_notifier_read(&vq->host_notifier);
+}
+
+void virtio_queue_host_notifier_read(EventNotifier *n)
+{
+ VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
+ if (event_notifier_test_and_clear(n)) {
+ virtio_queue_notify_vq(vq);
+ }
+}
+
+EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq)
+{
+ return &vq->host_notifier;
+}
+
+void virtio_queue_set_host_notifier_enabled(VirtQueue *vq, bool enabled)
+{
+ vq->host_notifier_enabled = enabled;
+}
+
+int virtio_queue_set_host_notifier_mr(VirtIODevice *vdev, int n,
+ MemoryRegion *mr, bool assign)
+{
+ BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+
+ if (k->set_host_notifier_mr) {
+ return k->set_host_notifier_mr(qbus->parent, n, mr, assign);
+ }
+
+ return -1;
+}
+
+void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name)
+{
+ g_free(vdev->bus_name);
+ vdev->bus_name = g_strdup(bus_name);
+}
+
+void G_GNUC_PRINTF(2, 3) virtio_error(VirtIODevice *vdev, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ error_vreport(fmt, ap);
+ va_end(ap);
+
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+ vdev->status = vdev->status | VIRTIO_CONFIG_S_NEEDS_RESET;
+ virtio_notify_config(vdev);
+ }
+
+ vdev->broken = true;
+}
+
+static void virtio_memory_listener_commit(MemoryListener *listener)
+{
+ VirtIODevice *vdev = container_of(listener, VirtIODevice, listener);
+ int i;
+
+ for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
+ if (vdev->vq[i].vring.num == 0) {
+ break;
+ }
+ virtio_init_region_cache(vdev, i);
+ }
+}
+
+static void virtio_device_realize(DeviceState *dev, Error **errp)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(dev);
+ Error *err = NULL;
+
+ /* Devices should either use vmsd or the load/save methods */
+ assert(!vdc->vmsd || !vdc->load);
+
+ if (vdc->realize != NULL) {
+ vdc->realize(dev, &err);
+ if (err != NULL) {
+ error_propagate(errp, err);
+ return;
+ }
+ }
+
+ virtio_bus_device_plugged(vdev, &err);
+ if (err != NULL) {
+ error_propagate(errp, err);
+ vdc->unrealize(dev);
+ return;
+ }
+
+ vdev->listener.commit = virtio_memory_listener_commit;
+ vdev->listener.name = "virtio";
+ memory_listener_register(&vdev->listener, vdev->dma_as);
+ QTAILQ_INSERT_TAIL(&virtio_list, vdev, next);
+}
+
+static void virtio_device_unrealize(DeviceState *dev)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(dev);
+
+ memory_listener_unregister(&vdev->listener);
+ virtio_bus_device_unplugged(vdev);
+
+ if (vdc->unrealize != NULL) {
+ vdc->unrealize(dev);
+ }
+
+ QTAILQ_REMOVE(&virtio_list, vdev, next);
+ g_free(vdev->bus_name);
+ vdev->bus_name = NULL;
+}
+
+static void virtio_device_free_virtqueues(VirtIODevice *vdev)
+{
+ int i;
+ if (!vdev->vq) {
+ return;
+ }
+
+ for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
+ if (vdev->vq[i].vring.num == 0) {
+ break;
+ }
+ virtio_virtqueue_reset_region_cache(&vdev->vq[i]);
+ }
+ g_free(vdev->vq);
+}
+
+static void virtio_device_instance_finalize(Object *obj)
+{
+ VirtIODevice *vdev = VIRTIO_DEVICE(obj);
+
+ virtio_device_free_virtqueues(vdev);
+
+ g_free(vdev->config);
+ g_free(vdev->vector_queues);
+}
+
+static Property virtio_properties[] = {
+ DEFINE_VIRTIO_COMMON_FEATURES(VirtIODevice, host_features),
+ DEFINE_PROP_BOOL("use-started", VirtIODevice, use_started, true),
+ DEFINE_PROP_BOOL("use-disabled-flag", VirtIODevice, use_disabled_flag, true),
+ DEFINE_PROP_BOOL("x-disable-legacy-check", VirtIODevice,
+ disable_legacy_check, false),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static int virtio_device_start_ioeventfd_impl(VirtIODevice *vdev)
+{
+ VirtioBusState *qbus = VIRTIO_BUS(qdev_get_parent_bus(DEVICE(vdev)));
+ int i, n, r, err;
+
+ /*
+ * Batch all the host notifiers in a single transaction to avoid
+ * quadratic time complexity in address_space_update_ioeventfds().
+ */
+ memory_region_transaction_begin();
+ for (n = 0; n < VIRTIO_QUEUE_MAX; n++) {
+ VirtQueue *vq = &vdev->vq[n];
+ if (!virtio_queue_get_num(vdev, n)) {
+ continue;
+ }
+ r = virtio_bus_set_host_notifier(qbus, n, true);
+ if (r < 0) {
+ err = r;
+ goto assign_error;
+ }
+ event_notifier_set_handler(&vq->host_notifier,
+ virtio_queue_host_notifier_read);
+ }
+
+ for (n = 0; n < VIRTIO_QUEUE_MAX; n++) {
+ /* Kick right away to begin processing requests already in vring */
+ VirtQueue *vq = &vdev->vq[n];
+ if (!vq->vring.num) {
+ continue;
+ }
+ event_notifier_set(&vq->host_notifier);
+ }
+ memory_region_transaction_commit();
+ return 0;
+
+assign_error:
+ i = n; /* save n for a second iteration after transaction is committed. */
+ while (--n >= 0) {
+ VirtQueue *vq = &vdev->vq[n];
+ if (!virtio_queue_get_num(vdev, n)) {
+ continue;
+ }
+
+ event_notifier_set_handler(&vq->host_notifier, NULL);
+ r = virtio_bus_set_host_notifier(qbus, n, false);
+ assert(r >= 0);
+ }
+ /*
+ * The transaction expects the ioeventfds to be open when it
+ * commits. Do it now, before the cleanup loop.
+ */
+ memory_region_transaction_commit();
+
+ while (--i >= 0) {
+ if (!virtio_queue_get_num(vdev, i)) {
+ continue;
+ }
+ virtio_bus_cleanup_host_notifier(qbus, i);
+ }
+ return err;
+}
+
+int virtio_device_start_ioeventfd(VirtIODevice *vdev)
+{
+ BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+ VirtioBusState *vbus = VIRTIO_BUS(qbus);
+
+ return virtio_bus_start_ioeventfd(vbus);
+}
+
+static void virtio_device_stop_ioeventfd_impl(VirtIODevice *vdev)
+{
+ VirtioBusState *qbus = VIRTIO_BUS(qdev_get_parent_bus(DEVICE(vdev)));
+ int n, r;
+
+ /*
+ * Batch all the host notifiers in a single transaction to avoid
+ * quadratic time complexity in address_space_update_ioeventfds().
+ */
+ memory_region_transaction_begin();
+ for (n = 0; n < VIRTIO_QUEUE_MAX; n++) {
+ VirtQueue *vq = &vdev->vq[n];
+
+ if (!virtio_queue_get_num(vdev, n)) {
+ continue;
+ }
+ event_notifier_set_handler(&vq->host_notifier, NULL);
+ r = virtio_bus_set_host_notifier(qbus, n, false);
+ assert(r >= 0);
+ }
+ /*
+ * The transaction expects the ioeventfds to be open when it
+ * commits. Do it now, before the cleanup loop.
+ */
+ memory_region_transaction_commit();
+
+ for (n = 0; n < VIRTIO_QUEUE_MAX; n++) {
+ if (!virtio_queue_get_num(vdev, n)) {
+ continue;
+ }
+ virtio_bus_cleanup_host_notifier(qbus, n);
+ }
+}
+
+int virtio_device_grab_ioeventfd(VirtIODevice *vdev)
+{
+ BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+ VirtioBusState *vbus = VIRTIO_BUS(qbus);
+
+ return virtio_bus_grab_ioeventfd(vbus);
+}
+
+void virtio_device_release_ioeventfd(VirtIODevice *vdev)
+{
+ BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+ VirtioBusState *vbus = VIRTIO_BUS(qbus);
+
+ virtio_bus_release_ioeventfd(vbus);
+}
+
+static void virtio_device_class_init(ObjectClass *klass, void *data)
+{
+ /* Set the default value here. */
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+ DeviceClass *dc = DEVICE_CLASS(klass);
+
+ dc->realize = virtio_device_realize;
+ dc->unrealize = virtio_device_unrealize;
+ dc->bus_type = TYPE_VIRTIO_BUS;
+ device_class_set_props(dc, virtio_properties);
+ vdc->start_ioeventfd = virtio_device_start_ioeventfd_impl;
+ vdc->stop_ioeventfd = virtio_device_stop_ioeventfd_impl;
+
+ vdc->legacy_features |= VIRTIO_LEGACY_FEATURES;
+
+ QTAILQ_INIT(&virtio_list);
+}
+
+bool virtio_device_ioeventfd_enabled(VirtIODevice *vdev)
+{
+ BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+ VirtioBusState *vbus = VIRTIO_BUS(qbus);
+
+ return virtio_bus_ioeventfd_enabled(vbus);
+}
+
+VirtioInfoList *qmp_x_query_virtio(Error **errp)
+{
+ VirtioInfoList *list = NULL;
+ VirtioInfoList *node;
+ VirtIODevice *vdev;
+
+ QTAILQ_FOREACH(vdev, &virtio_list, next) {
+ DeviceState *dev = DEVICE(vdev);
+ Error *err = NULL;
+ QObject *obj = qmp_qom_get(dev->canonical_path, "realized", &err);
+
+ if (err == NULL) {
+ GString *is_realized = qobject_to_json_pretty(obj, true);
+ /* virtio device is NOT realized, remove it from list */
+            if (!strncmp(is_realized->str, "false", 5)) {
+ QTAILQ_REMOVE(&virtio_list, vdev, next);
+ } else {
+ node = g_new0(VirtioInfoList, 1);
+ node->value = g_new(VirtioInfo, 1);
+ node->value->path = g_strdup(dev->canonical_path);
+ node->value->name = g_strdup(vdev->name);
+ QAPI_LIST_PREPEND(list, node->value);
+ }
+ g_string_free(is_realized, true);
+ }
+ qobject_unref(obj);
+ }
+
+ return list;
+}
+
+static VirtIODevice *virtio_device_find(const char *path)
+{
+ VirtIODevice *vdev;
+
+ QTAILQ_FOREACH(vdev, &virtio_list, next) {
+ DeviceState *dev = DEVICE(vdev);
+
+ if (strcmp(dev->canonical_path, path) != 0) {
+ continue;
+ }
+
+ Error *err = NULL;
+ QObject *obj = qmp_qom_get(dev->canonical_path, "realized", &err);
+ if (err == NULL) {
+ GString *is_realized = qobject_to_json_pretty(obj, true);
+ /* virtio device is NOT realized, remove it from list */
+            if (!strncmp(is_realized->str, "false", 5)) {
+ g_string_free(is_realized, true);
+ qobject_unref(obj);
+ QTAILQ_REMOVE(&virtio_list, vdev, next);
+ return NULL;
+ }
+ g_string_free(is_realized, true);
+ } else {
+ /* virtio device doesn't exist in QOM tree */
+ QTAILQ_REMOVE(&virtio_list, vdev, next);
+ qobject_unref(obj);
+ return NULL;
+ }
+ /* device exists in QOM tree & is realized */
+ qobject_unref(obj);
+ return vdev;
+ }
+ return NULL;
+}
+
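+/*
+ * Build a list of human-readable names for the bits set in @bitmap,
+ * clearing each bit as it is decoded; whatever remains in @bitmap
+ * afterwards is reported by the callers as "unknown" bits.
+ */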
+#define CONVERT_FEATURES(type, map, is_status, bitmap) \
+ ({ \
+ type *list = NULL; \
+ type *node; \
+ for (i = 0; map[i].virtio_bit != -1; i++) { \
+ if (is_status) { \
+ bit = map[i].virtio_bit; \
+ } \
+ else { \
+ bit = 1ULL << map[i].virtio_bit; \
+ } \
+ if ((bitmap & bit) == 0) { \
+ continue; \
+ } \
+ node = g_new0(type, 1); \
+ node->value = g_strdup(map[i].feature_desc); \
+ node->next = list; \
+ list = node; \
+ bitmap ^= bit; \
+ } \
+ list; \
+ })
+
+static VirtioDeviceStatus *qmp_decode_status(uint8_t bitmap)
+{
+ VirtioDeviceStatus *status;
+ uint8_t bit;
+ int i;
+
+ status = g_new0(VirtioDeviceStatus, 1);
+ status->statuses = CONVERT_FEATURES(strList, virtio_config_status_map,
+ 1, bitmap);
+ status->has_unknown_statuses = bitmap != 0;
+ if (status->has_unknown_statuses) {
+ status->unknown_statuses = bitmap;
+ }
+
+ return status;
+}
+
+static VhostDeviceProtocols *qmp_decode_protocols(uint64_t bitmap)
+{
+ VhostDeviceProtocols *vhu_protocols;
+ uint64_t bit;
+ int i;
+
+ vhu_protocols = g_new0(VhostDeviceProtocols, 1);
+ vhu_protocols->protocols =
+ CONVERT_FEATURES(strList,
+ vhost_user_protocol_map, 0, bitmap);
+ vhu_protocols->has_unknown_protocols = bitmap != 0;
+ if (vhu_protocols->has_unknown_protocols) {
+ vhu_protocols->unknown_protocols = bitmap;
+ }
+
+ return vhu_protocols;
+}
+
+static VirtioDeviceFeatures *qmp_decode_features(uint16_t device_id,
+ uint64_t bitmap)
+{
+ VirtioDeviceFeatures *features;
+ uint64_t bit;
+ int i;
+
+ features = g_new0(VirtioDeviceFeatures, 1);
+ features->has_dev_features = true;
+
+ /* transport features */
+ features->transports = CONVERT_FEATURES(strList, virtio_transport_map, 0,
+ bitmap);
+
+ /* device features */
+ switch (device_id) {
+#ifdef CONFIG_VIRTIO_SERIAL
+ case VIRTIO_ID_CONSOLE:
+ features->dev_features =
+ CONVERT_FEATURES(strList, virtio_serial_feature_map, 0, bitmap);
+ break;
+#endif
+#ifdef CONFIG_VIRTIO_BLK
+ case VIRTIO_ID_BLOCK:
+ features->dev_features =
+ CONVERT_FEATURES(strList, virtio_blk_feature_map, 0, bitmap);
+ break;
+#endif
+#ifdef CONFIG_VIRTIO_GPU
+ case VIRTIO_ID_GPU:
+ features->dev_features =
+ CONVERT_FEATURES(strList, virtio_gpu_feature_map, 0, bitmap);
+ break;
+#endif
+#ifdef CONFIG_VIRTIO_NET
+ case VIRTIO_ID_NET:
+ features->dev_features =
+ CONVERT_FEATURES(strList, virtio_net_feature_map, 0, bitmap);
+ break;
+#endif
+#ifdef CONFIG_VIRTIO_SCSI
+ case VIRTIO_ID_SCSI:
+ features->dev_features =
+ CONVERT_FEATURES(strList, virtio_scsi_feature_map, 0, bitmap);
+ break;
+#endif
+#ifdef CONFIG_VIRTIO_BALLOON
+ case VIRTIO_ID_BALLOON:
+ features->dev_features =
+ CONVERT_FEATURES(strList, virtio_balloon_feature_map, 0, bitmap);
+ break;
+#endif
+#ifdef CONFIG_VIRTIO_IOMMU
+ case VIRTIO_ID_IOMMU:
+ features->dev_features =
+ CONVERT_FEATURES(strList, virtio_iommu_feature_map, 0, bitmap);
+ break;
+#endif
+#ifdef CONFIG_VIRTIO_INPUT
+ case VIRTIO_ID_INPUT:
+ features->dev_features =
+ CONVERT_FEATURES(strList, virtio_input_feature_map, 0, bitmap);
+ break;
+#endif
+#ifdef CONFIG_VHOST_USER_FS
+ case VIRTIO_ID_FS:
+ features->dev_features =
+ CONVERT_FEATURES(strList, virtio_fs_feature_map, 0, bitmap);
+ break;
+#endif
+#ifdef CONFIG_VHOST_VSOCK
+ case VIRTIO_ID_VSOCK:
+ features->dev_features =
+ CONVERT_FEATURES(strList, virtio_vsock_feature_map, 0, bitmap);
+ break;
+#endif
+#ifdef CONFIG_VIRTIO_CRYPTO
+ case VIRTIO_ID_CRYPTO:
+ features->dev_features =
+ CONVERT_FEATURES(strList, virtio_crypto_feature_map, 0, bitmap);
+ break;
+#endif
+#ifdef CONFIG_VIRTIO_MEM
+ case VIRTIO_ID_MEM:
+ features->dev_features =
+ CONVERT_FEATURES(strList, virtio_mem_feature_map, 0, bitmap);
+ break;
+#endif
+#ifdef CONFIG_VIRTIO_I2C_ADAPTER
+ case VIRTIO_ID_I2C_ADAPTER:
+ features->dev_features =
+ CONVERT_FEATURES(strList, virtio_i2c_feature_map, 0, bitmap);
+ break;
+#endif
+#ifdef CONFIG_VIRTIO_RNG
+ case VIRTIO_ID_RNG:
+ features->dev_features =
+ CONVERT_FEATURES(strList, virtio_rng_feature_map, 0, bitmap);
+ break;
+#endif
+ /* No features */
+ case VIRTIO_ID_9P:
+ case VIRTIO_ID_PMEM:
+ case VIRTIO_ID_IOMEM:
+ case VIRTIO_ID_RPMSG:
+ case VIRTIO_ID_CLOCK:
+ case VIRTIO_ID_MAC80211_WLAN:
+ case VIRTIO_ID_MAC80211_HWSIM:
+ case VIRTIO_ID_RPROC_SERIAL:
+ case VIRTIO_ID_MEMORY_BALLOON:
+ case VIRTIO_ID_CAIF:
+ case VIRTIO_ID_SIGNAL_DIST:
+ case VIRTIO_ID_PSTORE:
+ case VIRTIO_ID_SOUND:
+ case VIRTIO_ID_BT:
+ case VIRTIO_ID_RPMB:
+ case VIRTIO_ID_VIDEO_ENCODER:
+ case VIRTIO_ID_VIDEO_DECODER:
+ case VIRTIO_ID_SCMI:
+ case VIRTIO_ID_NITRO_SEC_MOD:
+ case VIRTIO_ID_WATCHDOG:
+ case VIRTIO_ID_CAN:
+ case VIRTIO_ID_DMABUF:
+ case VIRTIO_ID_PARAM_SERV:
+ case VIRTIO_ID_AUDIO_POLICY:
+ case VIRTIO_ID_GPIO:
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ features->has_unknown_dev_features = bitmap != 0;
+ if (features->has_unknown_dev_features) {
+ features->unknown_dev_features = bitmap;
+ }
+
+ return features;
+}
+
+VirtioStatus *qmp_x_query_virtio_status(const char *path, Error **errp)
+{
+ VirtIODevice *vdev;
+ VirtioStatus *status;
+
+ vdev = virtio_device_find(path);
+ if (vdev == NULL) {
+ error_setg(errp, "Path %s is not a VirtIODevice", path);
+ return NULL;
+ }
+
+ status = g_new0(VirtioStatus, 1);
+ status->name = g_strdup(vdev->name);
+ status->device_id = vdev->device_id;
+ status->vhost_started = vdev->vhost_started;
+ status->guest_features = qmp_decode_features(vdev->device_id,
+ vdev->guest_features);
+ status->host_features = qmp_decode_features(vdev->device_id,
+ vdev->host_features);
+ status->backend_features = qmp_decode_features(vdev->device_id,
+ vdev->backend_features);
+
+ switch (vdev->device_endian) {
+ case VIRTIO_DEVICE_ENDIAN_LITTLE:
+ status->device_endian = g_strdup("little");
+ break;
+ case VIRTIO_DEVICE_ENDIAN_BIG:
+ status->device_endian = g_strdup("big");
+ break;
+ default:
+ status->device_endian = g_strdup("unknown");
+ break;
+ }
+
+ status->num_vqs = virtio_get_num_queues(vdev);
+ status->status = qmp_decode_status(vdev->status);
+ status->isr = vdev->isr;
+ status->queue_sel = vdev->queue_sel;
+ status->vm_running = vdev->vm_running;
+ status->broken = vdev->broken;
+ status->disabled = vdev->disabled;
+ status->use_started = vdev->use_started;
+ status->started = vdev->started;
+ status->start_on_kick = vdev->start_on_kick;
+ status->disable_legacy_check = vdev->disable_legacy_check;
+ status->bus_name = g_strdup(vdev->bus_name);
+ status->use_guest_notifier_mask = vdev->use_guest_notifier_mask;
+ status->has_vhost_dev = vdev->vhost_started;
+
+ if (vdev->vhost_started) {
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
+ struct vhost_dev *hdev = vdc->get_vhost(vdev);
+
+ status->vhost_dev = g_new0(VhostStatus, 1);
+ status->vhost_dev->n_mem_sections = hdev->n_mem_sections;
+ status->vhost_dev->n_tmp_sections = hdev->n_tmp_sections;
+ status->vhost_dev->nvqs = hdev->nvqs;
+ status->vhost_dev->vq_index = hdev->vq_index;
+ status->vhost_dev->features =
+ qmp_decode_features(vdev->device_id, hdev->features);
+ status->vhost_dev->acked_features =
+ qmp_decode_features(vdev->device_id, hdev->acked_features);
+ status->vhost_dev->backend_features =
+ qmp_decode_features(vdev->device_id, hdev->backend_features);
+ status->vhost_dev->protocol_features =
+ qmp_decode_protocols(hdev->protocol_features);
+ status->vhost_dev->max_queues = hdev->max_queues;
+ status->vhost_dev->backend_cap = hdev->backend_cap;
+ status->vhost_dev->log_enabled = hdev->log_enabled;
+ status->vhost_dev->log_size = hdev->log_size;
+ }
+
+ return status;
+}
+
+VirtVhostQueueStatus *qmp_x_query_virtio_vhost_queue_status(const char *path,
+ uint16_t queue,
+ Error **errp)
+{
+ VirtIODevice *vdev;
+ VirtVhostQueueStatus *status;
+
+ vdev = virtio_device_find(path);
+ if (vdev == NULL) {
+ error_setg(errp, "Path %s is not a VirtIODevice", path);
+ return NULL;
+ }
+
+ if (!vdev->vhost_started) {
+ error_setg(errp, "Error: vhost device has not started yet");
+ return NULL;
+ }
+
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
+ struct vhost_dev *hdev = vdc->get_vhost(vdev);
+
+ if (queue < hdev->vq_index || queue >= hdev->vq_index + hdev->nvqs) {
+ error_setg(errp, "Invalid vhost virtqueue number %d", queue);
+ return NULL;
+ }
+
+ status = g_new0(VirtVhostQueueStatus, 1);
+ status->name = g_strdup(vdev->name);
+ status->kick = hdev->vqs[queue].kick;
+ status->call = hdev->vqs[queue].call;
+ status->desc = (uintptr_t)hdev->vqs[queue].desc;
+ status->avail = (uintptr_t)hdev->vqs[queue].avail;
+ status->used = (uintptr_t)hdev->vqs[queue].used;
+ status->num = hdev->vqs[queue].num;
+ status->desc_phys = hdev->vqs[queue].desc_phys;
+ status->desc_size = hdev->vqs[queue].desc_size;
+ status->avail_phys = hdev->vqs[queue].avail_phys;
+ status->avail_size = hdev->vqs[queue].avail_size;
+ status->used_phys = hdev->vqs[queue].used_phys;
+ status->used_size = hdev->vqs[queue].used_size;
+
+ return status;
+}
+
+VirtQueueStatus *qmp_x_query_virtio_queue_status(const char *path,
+ uint16_t queue,
+ Error **errp)
+{
+ VirtIODevice *vdev;
+ VirtQueueStatus *status;
+
+ vdev = virtio_device_find(path);
+ if (vdev == NULL) {
+ error_setg(errp, "Path %s is not a VirtIODevice", path);
+ return NULL;
+ }
+
+ if (queue >= VIRTIO_QUEUE_MAX || !virtio_queue_get_num(vdev, queue)) {
+ error_setg(errp, "Invalid virtqueue number %d", queue);
+ return NULL;
+ }
+
+ status = g_new0(VirtQueueStatus, 1);
+ status->name = g_strdup(vdev->name);
+ status->queue_index = vdev->vq[queue].queue_index;
+ status->inuse = vdev->vq[queue].inuse;
+ status->vring_num = vdev->vq[queue].vring.num;
+ status->vring_num_default = vdev->vq[queue].vring.num_default;
+ status->vring_align = vdev->vq[queue].vring.align;
+ status->vring_desc = vdev->vq[queue].vring.desc;
+ status->vring_avail = vdev->vq[queue].vring.avail;
+ status->vring_used = vdev->vq[queue].vring.used;
+ status->used_idx = vdev->vq[queue].used_idx;
+ status->signalled_used = vdev->vq[queue].signalled_used;
+ status->signalled_used_valid = vdev->vq[queue].signalled_used_valid;
+
+ if (vdev->vhost_started) {
+ VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
+ struct vhost_dev *hdev = vdc->get_vhost(vdev);
+
+ /* check if vq index exists for vhost as well */
+ if (queue >= hdev->vq_index && queue < hdev->vq_index + hdev->nvqs) {
+ status->has_last_avail_idx = true;
+
+ int vhost_vq_index =
+ hdev->vhost_ops->vhost_get_vq_index(hdev, queue);
+ struct vhost_vring_state state = {
+ .index = vhost_vq_index,
+ };
+
+ status->last_avail_idx =
+ hdev->vhost_ops->vhost_get_vring_base(hdev, &state);
+ }
+ } else {
+ status->has_shadow_avail_idx = true;
+ status->has_last_avail_idx = true;
+ status->last_avail_idx = vdev->vq[queue].last_avail_idx;
+ status->shadow_avail_idx = vdev->vq[queue].shadow_avail_idx;
+ }
+
+ return status;
+}
+
+static strList *qmp_decode_vring_desc_flags(uint16_t flags)
+{
+ strList *list = NULL;
+ strList *node;
+ int i;
+
+ struct {
+ uint16_t flag;
+ const char *value;
+ } map[] = {
+ { VRING_DESC_F_NEXT, "next" },
+ { VRING_DESC_F_WRITE, "write" },
+ { VRING_DESC_F_INDIRECT, "indirect" },
+ { 1 << VRING_PACKED_DESC_F_AVAIL, "avail" },
+ { 1 << VRING_PACKED_DESC_F_USED, "used" },
+ { 0, "" }
+ };
+
+ for (i = 0; map[i].flag; i++) {
+ if ((map[i].flag & flags) == 0) {
+ continue;
+ }
+        node = g_new0(strList, 1);
+ node->value = g_strdup(map[i].value);
+ node->next = list;
+ list = node;
+ }
+
+ return list;
+}
+
+VirtioQueueElement *qmp_x_query_virtio_queue_element(const char *path,
+ uint16_t queue,
+ bool has_index,
+ uint16_t index,
+ Error **errp)
+{
+ VirtIODevice *vdev;
+ VirtQueue *vq;
+ VirtioQueueElement *element = NULL;
+
+ vdev = virtio_device_find(path);
+ if (vdev == NULL) {
+ error_setg(errp, "Path %s is not a VirtIO device", path);
+ return NULL;
+ }
+
+ if (queue >= VIRTIO_QUEUE_MAX || !virtio_queue_get_num(vdev, queue)) {
+ error_setg(errp, "Invalid virtqueue number %d", queue);
+ return NULL;
+ }
+ vq = &vdev->vq[queue];
+
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) {
+ error_setg(errp, "Packed ring not supported");
+ return NULL;
+ } else {
+ unsigned int head, i, max;
+ VRingMemoryRegionCaches *caches;
+ MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
+ MemoryRegionCache *desc_cache;
+ VRingDesc desc;
+ VirtioRingDescList *list = NULL;
+ VirtioRingDescList *node;
+        int rc;
+        int ndescs;
+
+ RCU_READ_LOCK_GUARD();
+
+ max = vq->vring.num;
+
+ if (!has_index) {
+ head = vring_avail_ring(vq, vq->last_avail_idx % vq->vring.num);
+ } else {
+ head = vring_avail_ring(vq, index % vq->vring.num);
+ }
+ i = head;
+
+ caches = vring_get_region_caches(vq);
+ if (!caches) {
+ error_setg(errp, "Region caches not initialized");
+ return NULL;
+ }
+ if (caches->desc.len < max * sizeof(VRingDesc)) {
+ error_setg(errp, "Cannot map descriptor ring");
+ return NULL;
+ }
+
+ desc_cache = &caches->desc;
+ vring_split_desc_read(vdev, &desc, desc_cache, i);
+ if (desc.flags & VRING_DESC_F_INDIRECT) {
+ int64_t len;
+ len = address_space_cache_init(&indirect_desc_cache, vdev->dma_as,
+ desc.addr, desc.len, false);
+ desc_cache = &indirect_desc_cache;
+ if (len < desc.len) {
+ error_setg(errp, "Cannot map indirect buffer");
+ goto done;
+ }
+
+ max = desc.len / sizeof(VRingDesc);
+ i = 0;
+ vring_split_desc_read(vdev, &desc, desc_cache, i);
+ }
+
+ element = g_new0(VirtioQueueElement, 1);
+ element->avail = g_new0(VirtioRingAvail, 1);
+ element->used = g_new0(VirtioRingUsed, 1);
+ element->name = g_strdup(vdev->name);
+ element->index = head;
+ element->avail->flags = vring_avail_flags(vq);
+ element->avail->idx = vring_avail_idx(vq);
+ element->avail->ring = head;
+ element->used->flags = vring_used_flags(vq);
+ element->used->idx = vring_used_idx(vq);
+ ndescs = 0;
+
+ do {
+ /* A buggy driver may produce an infinite loop */
+ if (ndescs >= max) {
+ break;
+ }
+ node = g_new0(VirtioRingDescList, 1);
+ node->value = g_new0(VirtioRingDesc, 1);
+ node->value->addr = desc.addr;
+ node->value->len = desc.len;
+ node->value->flags = qmp_decode_vring_desc_flags(desc.flags);
+ node->next = list;
+ list = node;
+
+ ndescs++;
+ rc = virtqueue_split_read_next_desc(vdev, &desc, desc_cache,
+ max, &i);
+ } while (rc == VIRTQUEUE_READ_DESC_MORE);
+ element->descs = list;
+done:
+ address_space_cache_destroy(&indirect_desc_cache);
+ }
+
+ return element;
+}
+
+static const TypeInfo virtio_device_info = {
+ .name = TYPE_VIRTIO_DEVICE,
+ .parent = TYPE_DEVICE,
+ .instance_size = sizeof(VirtIODevice),
+ .class_init = virtio_device_class_init,
+ .instance_finalize = virtio_device_instance_finalize,
+ .abstract = true,
+ .class_size = sizeof(VirtioDeviceClass),
+};
+
+static void virtio_register_types(void)
+{
+ type_register_static(&virtio_device_info);
+}
+
+type_init(virtio_register_types)