diff options
Diffstat (limited to 'hw/virtio')
56 files changed, 26429 insertions, 0 deletions
diff --git a/hw/virtio/Kconfig b/hw/virtio/Kconfig new file mode 100644 index 00000000..cbfd8c71 --- /dev/null +++ b/hw/virtio/Kconfig @@ -0,0 +1,87 @@ +config VIRTIO + bool + +config VIRTIO_RNG + bool + default y + depends on VIRTIO + +config VIRTIO_IOMMU + bool + default y + depends on PCI && VIRTIO + +config VIRTIO_PCI + bool + default y if PCI_DEVICES + depends on PCI + select VIRTIO + +config VIRTIO_MMIO + bool + select VIRTIO + +config VIRTIO_CCW + bool + select VIRTIO + +config VIRTIO_BALLOON + bool + default y + depends on VIRTIO + +config VIRTIO_CRYPTO + bool + default y + depends on VIRTIO + +config VIRTIO_PMEM_SUPPORTED + bool + +config VIRTIO_PMEM + bool + default y + depends on VIRTIO + depends on VIRTIO_PMEM_SUPPORTED + select MEM_DEVICE + +config VIRTIO_MEM_SUPPORTED + bool + +config VIRTIO_MEM + bool + default y + depends on VIRTIO + depends on LINUX + depends on VIRTIO_MEM_SUPPORTED + select MEM_DEVICE + +config VHOST_VSOCK + bool + default y + depends on VIRTIO && VHOST_KERNEL + +config VHOST_USER_VSOCK + bool + default y + depends on VIRTIO && VHOST_USER + +config VHOST_USER_I2C + bool + default y + depends on VIRTIO && VHOST_USER + +config VHOST_USER_RNG + bool + default y + depends on VIRTIO && VHOST_USER + +config VHOST_USER_FS + bool + default y + depends on VIRTIO && VHOST_USER + +config VHOST_USER_GPIO + bool + default y + depends on VIRTIO && VHOST_USER diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build new file mode 100644 index 00000000..dfed1e7a --- /dev/null +++ b/hw/virtio/meson.build @@ -0,0 +1,67 @@ +softmmu_virtio_ss = ss.source_set() +softmmu_virtio_ss.add(files('virtio-bus.c')) +softmmu_virtio_ss.add(when: 'CONFIG_VIRTIO_PCI', if_true: files('virtio-pci.c')) +softmmu_virtio_ss.add(when: 'CONFIG_VIRTIO_MMIO', if_true: files('virtio-mmio.c')) + +virtio_ss = ss.source_set() +virtio_ss.add(files('virtio.c')) + +if have_vhost + virtio_ss.add(files('vhost.c', 'vhost-backend.c', 'vhost-iova-tree.c')) + if have_vhost_user + virtio_ss.add(files('vhost-user.c')) + endif + if have_vhost_vdpa + virtio_ss.add(files('vhost-vdpa.c', 'vhost-shadow-virtqueue.c')) + endif +else + softmmu_virtio_ss.add(files('vhost-stub.c')) +endif + +virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c')) +virtio_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('virtio-crypto.c')) +virtio_ss.add(when: 'CONFIG_VHOST_USER_FS', if_true: files('vhost-user-fs.c')) +virtio_ss.add(when: 'CONFIG_VIRTIO_PMEM', if_true: files('virtio-pmem.c')) +virtio_ss.add(when: 'CONFIG_VHOST_VSOCK', if_true: files('vhost-vsock.c', 'vhost-vsock-common.c')) +virtio_ss.add(when: 'CONFIG_VHOST_USER_VSOCK', if_true: files('vhost-user-vsock.c', 'vhost-vsock-common.c')) +virtio_ss.add(when: 'CONFIG_VIRTIO_RNG', if_true: files('virtio-rng.c')) +virtio_ss.add(when: 'CONFIG_VIRTIO_IOMMU', if_true: files('virtio-iommu.c')) +virtio_ss.add(when: 'CONFIG_VIRTIO_MEM', if_true: files('virtio-mem.c')) +virtio_ss.add(when: 'CONFIG_VHOST_USER_I2C', if_true: files('vhost-user-i2c.c')) +virtio_ss.add(when: 'CONFIG_VHOST_USER_RNG', if_true: files('vhost-user-rng.c')) +virtio_ss.add(when: 'CONFIG_VHOST_USER_GPIO', if_true: files('vhost-user-gpio.c')) +virtio_ss.add(when: ['CONFIG_VIRTIO_PCI', 'CONFIG_VHOST_USER_GPIO'], if_true: files('vhost-user-gpio-pci.c')) + +virtio_pci_ss = ss.source_set() +virtio_pci_ss.add(when: 'CONFIG_VHOST_VSOCK', if_true: files('vhost-vsock-pci.c')) +virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_VSOCK', if_true: files('vhost-user-vsock-pci.c')) +virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_BLK', if_true: files('vhost-user-blk-pci.c')) +virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_I2C', if_true: files('vhost-user-i2c-pci.c')) +virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_INPUT', if_true: files('vhost-user-input-pci.c')) +virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_RNG', if_true: files('vhost-user-rng-pci.c')) +virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_SCSI', if_true: files('vhost-user-scsi-pci.c')) +virtio_pci_ss.add(when: 'CONFIG_VHOST_SCSI', if_true: files('vhost-scsi-pci.c')) +virtio_pci_ss.add(when: 'CONFIG_VHOST_USER_FS', if_true: files('vhost-user-fs-pci.c')) + +virtio_pci_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('virtio-crypto-pci.c')) +virtio_pci_ss.add(when: 'CONFIG_VIRTIO_INPUT_HOST', if_true: files('virtio-input-host-pci.c')) +virtio_pci_ss.add(when: 'CONFIG_VIRTIO_INPUT', if_true: files('virtio-input-pci.c')) +virtio_pci_ss.add(when: 'CONFIG_VIRTIO_RNG', if_true: files('virtio-rng-pci.c')) +virtio_pci_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon-pci.c')) +virtio_pci_ss.add(when: 'CONFIG_VIRTIO_9P', if_true: files('virtio-9p-pci.c')) +virtio_pci_ss.add(when: 'CONFIG_VIRTIO_SCSI', if_true: files('virtio-scsi-pci.c')) +virtio_pci_ss.add(when: 'CONFIG_VIRTIO_BLK', if_true: files('virtio-blk-pci.c')) +virtio_pci_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('virtio-net-pci.c')) +virtio_pci_ss.add(when: 'CONFIG_VIRTIO_SERIAL', if_true: files('virtio-serial-pci.c')) +virtio_pci_ss.add(when: 'CONFIG_VIRTIO_PMEM', if_true: files('virtio-pmem-pci.c')) +virtio_pci_ss.add(when: 'CONFIG_VIRTIO_IOMMU', if_true: files('virtio-iommu-pci.c')) +virtio_pci_ss.add(when: 'CONFIG_VIRTIO_MEM', if_true: files('virtio-mem-pci.c')) + +virtio_ss.add_all(when: 'CONFIG_VIRTIO_PCI', if_true: virtio_pci_ss) + +specific_ss.add_all(when: 'CONFIG_VIRTIO', if_true: virtio_ss) +softmmu_ss.add_all(when: 'CONFIG_VIRTIO', if_true: softmmu_virtio_ss) +softmmu_ss.add(when: 'CONFIG_VIRTIO', if_false: files('vhost-stub.c')) +softmmu_ss.add(when: 'CONFIG_VIRTIO', if_false: files('virtio-stub.c')) +softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c')) +softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('virtio-stub.c')) diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events new file mode 100644 index 00000000..14fc5b9b --- /dev/null +++ b/hw/virtio/trace-events @@ -0,0 +1,151 @@ +# See docs/devel/tracing.rst for syntax documentation. + +# vhost.c +vhost_commit(bool started, bool changed) "Started: %d Changed: %d" +vhost_region_add_section(const char *name, uint64_t gpa, uint64_t size, uint64_t host) "%s: 0x%"PRIx64"+0x%"PRIx64" @ 0x%"PRIx64 +vhost_region_add_section_merge(const char *name, uint64_t new_size, uint64_t gpa, uint64_t owr) "%s: size: 0x%"PRIx64 " gpa: 0x%"PRIx64 " owr: 0x%"PRIx64 +vhost_region_add_section_aligned(const char *name, uint64_t gpa, uint64_t size, uint64_t host) "%s: 0x%"PRIx64"+0x%"PRIx64" @ 0x%"PRIx64 +vhost_section(const char *name) "%s" +vhost_reject_section(const char *name, int d) "%s:%d" +vhost_iotlb_miss(void *dev, int step) "%p step %d" +vhost_dev_cleanup(void *dev) "%p" +vhost_dev_start(void *dev, const char *name, bool vrings) "%p:%s vrings:%d" +vhost_dev_stop(void *dev, const char *name, bool vrings) "%p:%s vrings:%d" + + +# vhost-user.c +vhost_user_postcopy_end_entry(void) "" +vhost_user_postcopy_end_exit(void) "" +vhost_user_postcopy_fault_handler(const char *name, uint64_t fault_address, int nregions) "%s: @0x%"PRIx64" nregions:%d" +vhost_user_postcopy_fault_handler_loop(int i, uint64_t client_base, uint64_t size) "%d: client 0x%"PRIx64" +0x%"PRIx64 +vhost_user_postcopy_fault_handler_found(int i, uint64_t region_offset, uint64_t rb_offset) "%d: region_offset: 0x%"PRIx64" rb_offset:0x%"PRIx64 +vhost_user_postcopy_listen(void) "" +vhost_user_set_mem_table_postcopy(uint64_t client_addr, uint64_t qhva, int reply_i, int region_i) "client:0x%"PRIx64" for hva: 0x%"PRIx64" reply %d region %d" +vhost_user_set_mem_table_withfd(int index, const char *name, uint64_t memory_size, uint64_t guest_phys_addr, uint64_t userspace_addr, uint64_t offset) "%d:%s: size:0x%"PRIx64" GPA:0x%"PRIx64" QVA/userspace:0x%"PRIx64" RB offset:0x%"PRIx64 +vhost_user_postcopy_waker(const char *rb, uint64_t rb_offset) "%s + 0x%"PRIx64 +vhost_user_postcopy_waker_found(uint64_t client_addr) "0x%"PRIx64 +vhost_user_postcopy_waker_nomatch(const char *rb, uint64_t rb_offset) "%s + 0x%"PRIx64 +vhost_user_read(uint32_t req, uint32_t flags) "req:%d flags:0x%"PRIx32"" +vhost_user_write(uint32_t req, uint32_t flags) "req:%d flags:0x%"PRIx32"" +vhost_user_create_notifier(int idx, void *n) "idx:%d n:%p" + +# vhost-vdpa.c +vhost_vdpa_dma_map(void *vdpa, int fd, uint32_t msg_type, uint64_t iova, uint64_t size, uint64_t uaddr, uint8_t perm, uint8_t type) "vdpa:%p fd: %d msg_type: %"PRIu32" iova: 0x%"PRIx64" size: 0x%"PRIx64" uaddr: 0x%"PRIx64" perm: 0x%"PRIx8" type: %"PRIu8 +vhost_vdpa_dma_unmap(void *vdpa, int fd, uint32_t msg_type, uint64_t iova, uint64_t size, uint8_t type) "vdpa:%p fd: %d msg_type: %"PRIu32" iova: 0x%"PRIx64" size: 0x%"PRIx64" type: %"PRIu8 +vhost_vdpa_listener_begin_batch(void *v, int fd, uint32_t msg_type, uint8_t type) "vdpa:%p fd: %d msg_type: %"PRIu32" type: %"PRIu8 +vhost_vdpa_listener_commit(void *v, int fd, uint32_t msg_type, uint8_t type) "vdpa:%p fd: %d msg_type: %"PRIu32" type: %"PRIu8 +vhost_vdpa_listener_region_add(void *vdpa, uint64_t iova, uint64_t llend, void *vaddr, bool readonly) "vdpa: %p iova 0x%"PRIx64" llend 0x%"PRIx64" vaddr: %p read-only: %d" +vhost_vdpa_listener_region_del(void *vdpa, uint64_t iova, uint64_t llend) "vdpa: %p iova 0x%"PRIx64" llend 0x%"PRIx64 +vhost_vdpa_add_status(void *dev, uint8_t status) "dev: %p status: 0x%"PRIx8 +vhost_vdpa_init(void *dev, void *vdpa) "dev: %p vdpa: %p" +vhost_vdpa_cleanup(void *dev, void *vdpa) "dev: %p vdpa: %p" +vhost_vdpa_memslots_limit(void *dev, int ret) "dev: %p = 0x%x" +vhost_vdpa_set_mem_table(void *dev, uint32_t nregions, uint32_t padding) "dev: %p nregions: %"PRIu32" padding: 0x%"PRIx32 +vhost_vdpa_dump_regions(void *dev, int i, uint64_t guest_phys_addr, uint64_t memory_size, uint64_t userspace_addr, uint64_t flags_padding) "dev: %p %d: guest_phys_addr: 0x%"PRIx64" memory_size: 0x%"PRIx64" userspace_addr: 0x%"PRIx64" flags_padding: 0x%"PRIx64 +vhost_vdpa_set_features(void *dev, uint64_t features) "dev: %p features: 0x%"PRIx64 +vhost_vdpa_get_device_id(void *dev, uint32_t device_id) "dev: %p device_id %"PRIu32 +vhost_vdpa_reset_device(void *dev, uint8_t status) "dev: %p status: 0x%"PRIx8 +vhost_vdpa_get_vq_index(void *dev, int idx, int vq_idx) "dev: %p idx: %d vq idx: %d" +vhost_vdpa_set_vring_ready(void *dev) "dev: %p" +vhost_vdpa_dump_config(void *dev, const char *line) "dev: %p %s" +vhost_vdpa_set_config(void *dev, uint32_t offset, uint32_t size, uint32_t flags) "dev: %p offset: %"PRIu32" size: %"PRIu32" flags: 0x%"PRIx32 +vhost_vdpa_get_config(void *dev, void *config, uint32_t config_len) "dev: %p config: %p config_len: %"PRIu32 +vhost_vdpa_dev_start(void *dev, bool started) "dev: %p started: %d" +vhost_vdpa_set_log_base(void *dev, uint64_t base, unsigned long long size, int refcnt, int fd, void *log) "dev: %p base: 0x%"PRIx64" size: %llu refcnt: %d fd: %d log: %p" +vhost_vdpa_set_vring_addr(void *dev, unsigned int index, unsigned int flags, uint64_t desc_user_addr, uint64_t used_user_addr, uint64_t avail_user_addr, uint64_t log_guest_addr) "dev: %p index: %u flags: 0x%x desc_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" log_guest_addr: 0x%"PRIx64 +vhost_vdpa_set_vring_num(void *dev, unsigned int index, unsigned int num) "dev: %p index: %u num: %u" +vhost_vdpa_set_vring_base(void *dev, unsigned int index, unsigned int num) "dev: %p index: %u num: %u" +vhost_vdpa_get_vring_base(void *dev, unsigned int index, unsigned int num) "dev: %p index: %u num: %u" +vhost_vdpa_set_vring_kick(void *dev, unsigned int index, int fd) "dev: %p index: %u fd: %d" +vhost_vdpa_set_vring_call(void *dev, unsigned int index, int fd) "dev: %p index: %u fd: %d" +vhost_vdpa_get_features(void *dev, uint64_t features) "dev: %p features: 0x%"PRIx64 +vhost_vdpa_set_owner(void *dev) "dev: %p" +vhost_vdpa_vq_get_addr(void *dev, void *vq, uint64_t desc_user_addr, uint64_t avail_user_addr, uint64_t used_user_addr) "dev: %p vq: %p desc_user_addr: 0x%"PRIx64" avail_user_addr: 0x%"PRIx64" used_user_addr: 0x%"PRIx64 +vhost_vdpa_get_iova_range(void *dev, uint64_t first, uint64_t last) "dev: %p first: 0x%"PRIx64" last: 0x%"PRIx64 + +# virtio.c +virtqueue_alloc_element(void *elem, size_t sz, unsigned in_num, unsigned out_num) "elem %p size %zd in_num %u out_num %u" +virtqueue_fill(void *vq, const void *elem, unsigned int len, unsigned int idx) "vq %p elem %p len %u idx %u" +virtqueue_flush(void *vq, unsigned int count) "vq %p count %u" +virtqueue_pop(void *vq, void *elem, unsigned int in_num, unsigned int out_num) "vq %p elem %p in_num %u out_num %u" +virtio_queue_notify(void *vdev, int n, void *vq) "vdev %p n %d vq %p" +virtio_notify_irqfd(void *vdev, void *vq) "vdev %p vq %p" +virtio_notify(void *vdev, void *vq) "vdev %p vq %p" +virtio_set_status(void *vdev, uint8_t val) "vdev %p val %u" + +# virtio-rng.c +virtio_rng_guest_not_ready(void *rng) "rng %p: guest not ready" +virtio_rng_cpu_is_stopped(void *rng, int size) "rng %p: cpu is stopped, dropping %d bytes" +virtio_rng_popped(void *rng) "rng %p: elem popped" +virtio_rng_pushed(void *rng, size_t len) "rng %p: %zd bytes pushed" +virtio_rng_request(void *rng, size_t size, unsigned quota) "rng %p: %zd bytes requested, %u bytes quota left" +virtio_rng_vm_state_change(void *rng, int running, int state) "rng %p: state change to running %d state %d" + +# virtio-balloon.c +# +virtio_balloon_bad_addr(uint64_t gpa) "0x%"PRIx64 +virtio_balloon_handle_output(const char *name, uint64_t gpa) "section name: %s gpa: 0x%"PRIx64 +virtio_balloon_get_config(uint32_t num_pages, uint32_t actual) "num_pages: %d actual: %d" +virtio_balloon_set_config(uint32_t actual, uint32_t oldactual) "actual: %d oldactual: %d" +virtio_balloon_to_target(uint64_t target, uint32_t num_pages) "balloon target: 0x%"PRIx64" num_pages: %d" + +# virtio-mmio.c +virtio_mmio_read(uint64_t offset) "virtio_mmio_read offset 0x%" PRIx64 +virtio_mmio_write_offset(uint64_t offset, uint64_t value) "virtio_mmio_write offset 0x%" PRIx64 " value 0x%" PRIx64 +virtio_mmio_guest_page(uint64_t size, int shift) "guest page size 0x%" PRIx64 " shift %d" +virtio_mmio_queue_write(uint64_t value, int max_size) "mmio_queue write 0x%" PRIx64 " max %d" +virtio_mmio_setting_irq(int level) "virtio_mmio setting IRQ %d" + +# virtio-pci.c +virtio_pci_notify(uint16_t vector) "virtio_pci_notify vec 0x%x" +virtio_pci_notify_write(uint64_t addr, uint64_t val, unsigned int size) "0x%" PRIx64" = 0x%" PRIx64 " (%d)" +virtio_pci_notify_write_pio(uint64_t addr, uint64_t val, unsigned int size) "0x%" PRIx64" = 0x%" PRIx64 " (%d)" + +# hw/virtio/virtio-iommu.c +virtio_iommu_device_reset(void) "reset!" +virtio_iommu_system_reset(void) "system reset!" +virtio_iommu_get_features(uint64_t features) "device supports features=0x%"PRIx64 +virtio_iommu_device_status(uint8_t status) "driver status = %d" +virtio_iommu_get_config(uint64_t page_size_mask, uint64_t start, uint64_t end, uint32_t domain_start, uint32_t domain_end, uint32_t probe_size, uint8_t bypass) "page_size_mask=0x%"PRIx64" input range start=0x%"PRIx64" input range end=0x%"PRIx64" domain range start=%d domain range end=%d probe_size=0x%x bypass=0x%x" +virtio_iommu_set_config(uint8_t bypass) "bypass=0x%x" +virtio_iommu_attach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d" +virtio_iommu_detach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d" +virtio_iommu_map(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end, uint64_t phys_start, uint32_t flags) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64 " phys_start=0x%"PRIx64" flags=%d" +virtio_iommu_unmap(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64 +virtio_iommu_unmap_done(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64 +virtio_iommu_translate(const char *name, uint32_t rid, uint64_t iova, int flag) "mr=%s rid=%d addr=0x%"PRIx64" flag=%d" +virtio_iommu_init_iommu_mr(char *iommu_mr) "init %s" +virtio_iommu_get_endpoint(uint32_t ep_id) "Alloc endpoint=%d" +virtio_iommu_put_endpoint(uint32_t ep_id) "Free endpoint=%d" +virtio_iommu_get_domain(uint32_t domain_id) "Alloc domain=%d" +virtio_iommu_put_domain(uint32_t domain_id) "Free domain=%d" +virtio_iommu_translate_out(uint64_t virt_addr, uint64_t phys_addr, uint32_t sid) "0x%"PRIx64" -> 0x%"PRIx64 " for sid=%d" +virtio_iommu_report_fault(uint8_t reason, uint32_t flags, uint32_t endpoint, uint64_t addr) "FAULT reason=%d flags=%d endpoint=%d address =0x%"PRIx64 +virtio_iommu_fill_resv_property(uint32_t devid, uint8_t subtype, uint64_t start, uint64_t end) "dev= %d, type=%d start=0x%"PRIx64" end=0x%"PRIx64 +virtio_iommu_notify_map(const char *name, uint64_t virt_start, uint64_t virt_end, uint64_t phys_start, uint32_t flags) "mr=%s virt_start=0x%"PRIx64" virt_end=0x%"PRIx64" phys_start=0x%"PRIx64" flags=%d" +virtio_iommu_notify_unmap(const char *name, uint64_t virt_start, uint64_t virt_end) "mr=%s virt_start=0x%"PRIx64" virt_end=0x%"PRIx64 +virtio_iommu_remap(const char *name, uint64_t virt_start, uint64_t virt_end, uint64_t phys_start) "mr=%s virt_start=0x%"PRIx64" virt_end=0x%"PRIx64" phys_start=0x%"PRIx64 +virtio_iommu_set_page_size_mask(const char *name, uint64_t old, uint64_t new) "mr=%s old_mask=0x%"PRIx64" new_mask=0x%"PRIx64 +virtio_iommu_notify_flag_add(const char *name) "add notifier to mr %s" +virtio_iommu_notify_flag_del(const char *name) "del notifier from mr %s" +virtio_iommu_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" + +# virtio-mem.c +virtio_mem_send_response(uint16_t type) "type=%" PRIu16 +virtio_mem_plug_request(uint64_t addr, uint16_t nb_blocks) "addr=0x%" PRIx64 " nb_blocks=%" PRIu16 +virtio_mem_unplug_request(uint64_t addr, uint16_t nb_blocks) "addr=0x%" PRIx64 " nb_blocks=%" PRIu16 +virtio_mem_unplugged_all(void) "" +virtio_mem_unplug_all_request(void) "" +virtio_mem_resized_usable_region(uint64_t old_size, uint64_t new_size) "old_size=0x%" PRIx64 "new_size=0x%" PRIx64 +virtio_mem_state_request(uint64_t addr, uint16_t nb_blocks) "addr=0x%" PRIx64 " nb_blocks=%" PRIu16 +virtio_mem_state_response(uint16_t state) "state=%" PRIu16 + +# virtio-pmem.c +virtio_pmem_flush_request(void) "flush request" +virtio_pmem_response(void) "flush response" +virtio_pmem_flush_done(int type) "fsync return=%d" + +# virtio-gpio.c +virtio_gpio_start(void) "start" +virtio_gpio_stop(void) "stop" +virtio_gpio_set_status(uint8_t status) "0x%x" diff --git a/hw/virtio/trace.h b/hw/virtio/trace.h new file mode 100644 index 00000000..5d709706 --- /dev/null +++ b/hw/virtio/trace.h @@ -0,0 +1 @@ +#include "trace/trace-hw_virtio.h" diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c new file mode 100644 index 00000000..8e581575 --- /dev/null +++ b/hw/virtio/vhost-backend.c @@ -0,0 +1,410 @@ +/* + * vhost-backend + * + * Copyright (c) 2013 Virtual Open Systems Sarl. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "hw/virtio/vhost.h" +#include "hw/virtio/vhost-backend.h" +#include "qemu/error-report.h" +#include "qemu/main-loop.h" +#include "standard-headers/linux/vhost_types.h" + +#include "hw/virtio/vhost-vdpa.h" +#ifdef CONFIG_VHOST_KERNEL +#include <linux/vhost.h> +#include <sys/ioctl.h> + +static int vhost_kernel_call(struct vhost_dev *dev, unsigned long int request, + void *arg) +{ + int fd = (uintptr_t) dev->opaque; + int ret; + + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_KERNEL); + + ret = ioctl(fd, request, arg); + return ret < 0 ? -errno : ret; +} + +static int vhost_kernel_init(struct vhost_dev *dev, void *opaque, Error **errp) +{ + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_KERNEL); + + dev->opaque = opaque; + + return 0; +} + +static int vhost_kernel_cleanup(struct vhost_dev *dev) +{ + int fd = (uintptr_t) dev->opaque; + + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_KERNEL); + + return close(fd) < 0 ? -errno : 0; +} + +static int vhost_kernel_memslots_limit(struct vhost_dev *dev) +{ + int limit = 64; + char *s; + + if (g_file_get_contents("/sys/module/vhost/parameters/max_mem_regions", + &s, NULL, NULL)) { + uint64_t val = g_ascii_strtoull(s, NULL, 10); + if (val < INT_MAX && val > 0) { + g_free(s); + return val; + } + error_report("ignoring invalid max_mem_regions value in vhost module:" + " %s", s); + } + g_free(s); + return limit; +} + +static int vhost_kernel_net_set_backend(struct vhost_dev *dev, + struct vhost_vring_file *file) +{ + return vhost_kernel_call(dev, VHOST_NET_SET_BACKEND, file); +} + +static int vhost_kernel_scsi_set_endpoint(struct vhost_dev *dev, + struct vhost_scsi_target *target) +{ + return vhost_kernel_call(dev, VHOST_SCSI_SET_ENDPOINT, target); +} + +static int vhost_kernel_scsi_clear_endpoint(struct vhost_dev *dev, + struct vhost_scsi_target *target) +{ + return vhost_kernel_call(dev, VHOST_SCSI_CLEAR_ENDPOINT, target); +} + +static int vhost_kernel_scsi_get_abi_version(struct vhost_dev *dev, int *version) +{ + return vhost_kernel_call(dev, VHOST_SCSI_GET_ABI_VERSION, version); +} + +static int vhost_kernel_set_log_base(struct vhost_dev *dev, uint64_t base, + struct vhost_log *log) +{ + return vhost_kernel_call(dev, VHOST_SET_LOG_BASE, &base); +} + +static int vhost_kernel_set_mem_table(struct vhost_dev *dev, + struct vhost_memory *mem) +{ + return vhost_kernel_call(dev, VHOST_SET_MEM_TABLE, mem); +} + +static int vhost_kernel_set_vring_addr(struct vhost_dev *dev, + struct vhost_vring_addr *addr) +{ + return vhost_kernel_call(dev, VHOST_SET_VRING_ADDR, addr); +} + +static int vhost_kernel_set_vring_endian(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + return vhost_kernel_call(dev, VHOST_SET_VRING_ENDIAN, ring); +} + +static int vhost_kernel_set_vring_num(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + return vhost_kernel_call(dev, VHOST_SET_VRING_NUM, ring); +} + +static int vhost_kernel_set_vring_base(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + return vhost_kernel_call(dev, VHOST_SET_VRING_BASE, ring); +} + +static int vhost_kernel_get_vring_base(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + return vhost_kernel_call(dev, VHOST_GET_VRING_BASE, ring); +} + +static int vhost_kernel_set_vring_kick(struct vhost_dev *dev, + struct vhost_vring_file *file) +{ + return vhost_kernel_call(dev, VHOST_SET_VRING_KICK, file); +} + +static int vhost_kernel_set_vring_call(struct vhost_dev *dev, + struct vhost_vring_file *file) +{ + return vhost_kernel_call(dev, VHOST_SET_VRING_CALL, file); +} + +static int vhost_kernel_set_vring_err(struct vhost_dev *dev, + struct vhost_vring_file *file) +{ + return vhost_kernel_call(dev, VHOST_SET_VRING_ERR, file); +} + +static int vhost_kernel_set_vring_busyloop_timeout(struct vhost_dev *dev, + struct vhost_vring_state *s) +{ + return vhost_kernel_call(dev, VHOST_SET_VRING_BUSYLOOP_TIMEOUT, s); +} + +static int vhost_kernel_set_features(struct vhost_dev *dev, + uint64_t features) +{ + return vhost_kernel_call(dev, VHOST_SET_FEATURES, &features); +} + +static int vhost_kernel_set_backend_cap(struct vhost_dev *dev) +{ + uint64_t features; + uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2; + int r; + + if (vhost_kernel_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) { + return 0; + } + + features &= f; + r = vhost_kernel_call(dev, VHOST_SET_BACKEND_FEATURES, + &features); + if (r) { + return 0; + } + + dev->backend_cap = features; + + return 0; +} + +static int vhost_kernel_get_features(struct vhost_dev *dev, + uint64_t *features) +{ + return vhost_kernel_call(dev, VHOST_GET_FEATURES, features); +} + +static int vhost_kernel_set_owner(struct vhost_dev *dev) +{ + return vhost_kernel_call(dev, VHOST_SET_OWNER, NULL); +} + +static int vhost_kernel_reset_device(struct vhost_dev *dev) +{ + return vhost_kernel_call(dev, VHOST_RESET_OWNER, NULL); +} + +static int vhost_kernel_get_vq_index(struct vhost_dev *dev, int idx) +{ + assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs); + + return idx - dev->vq_index; +} + +static int vhost_kernel_vsock_set_guest_cid(struct vhost_dev *dev, + uint64_t guest_cid) +{ + return vhost_kernel_call(dev, VHOST_VSOCK_SET_GUEST_CID, &guest_cid); +} + +static int vhost_kernel_vsock_set_running(struct vhost_dev *dev, int start) +{ + return vhost_kernel_call(dev, VHOST_VSOCK_SET_RUNNING, &start); +} + +static void vhost_kernel_iotlb_read(void *opaque) +{ + struct vhost_dev *dev = opaque; + ssize_t len; + + if (dev->backend_cap & + (0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2)) { + struct vhost_msg_v2 msg; + + while ((len = read((uintptr_t)dev->opaque, &msg, sizeof msg)) > 0) { + if (len < sizeof msg) { + error_report("Wrong vhost message len: %d", (int)len); + break; + } + if (msg.type != VHOST_IOTLB_MSG_V2) { + error_report("Unknown vhost iotlb message type"); + break; + } + + vhost_backend_handle_iotlb_msg(dev, &msg.iotlb); + } + } else { + struct vhost_msg msg; + + while ((len = read((uintptr_t)dev->opaque, &msg, sizeof msg)) > 0) { + if (len < sizeof msg) { + error_report("Wrong vhost message len: %d", (int)len); + break; + } + if (msg.type != VHOST_IOTLB_MSG) { + error_report("Unknown vhost iotlb message type"); + break; + } + + vhost_backend_handle_iotlb_msg(dev, &msg.iotlb); + } + } +} + +static int vhost_kernel_send_device_iotlb_msg(struct vhost_dev *dev, + struct vhost_iotlb_msg *imsg) +{ + if (dev->backend_cap & (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2)) { + struct vhost_msg_v2 msg = {}; + + msg.type = VHOST_IOTLB_MSG_V2; + msg.iotlb = *imsg; + + if (write((uintptr_t)dev->opaque, &msg, sizeof msg) != sizeof msg) { + error_report("Fail to update device iotlb"); + return -EFAULT; + } + } else { + struct vhost_msg msg = {}; + + msg.type = VHOST_IOTLB_MSG; + msg.iotlb = *imsg; + + if (write((uintptr_t)dev->opaque, &msg, sizeof msg) != sizeof msg) { + error_report("Fail to update device iotlb"); + return -EFAULT; + } + } + + return 0; +} + +static void vhost_kernel_set_iotlb_callback(struct vhost_dev *dev, + int enabled) +{ + if (enabled) + qemu_set_fd_handler((uintptr_t)dev->opaque, + vhost_kernel_iotlb_read, NULL, dev); + else + qemu_set_fd_handler((uintptr_t)dev->opaque, NULL, NULL, NULL); +} + +const VhostOps kernel_ops = { + .backend_type = VHOST_BACKEND_TYPE_KERNEL, + .vhost_backend_init = vhost_kernel_init, + .vhost_backend_cleanup = vhost_kernel_cleanup, + .vhost_backend_memslots_limit = vhost_kernel_memslots_limit, + .vhost_net_set_backend = vhost_kernel_net_set_backend, + .vhost_scsi_set_endpoint = vhost_kernel_scsi_set_endpoint, + .vhost_scsi_clear_endpoint = vhost_kernel_scsi_clear_endpoint, + .vhost_scsi_get_abi_version = vhost_kernel_scsi_get_abi_version, + .vhost_set_log_base = vhost_kernel_set_log_base, + .vhost_set_mem_table = vhost_kernel_set_mem_table, + .vhost_set_vring_addr = vhost_kernel_set_vring_addr, + .vhost_set_vring_endian = vhost_kernel_set_vring_endian, + .vhost_set_vring_num = vhost_kernel_set_vring_num, + .vhost_set_vring_base = vhost_kernel_set_vring_base, + .vhost_get_vring_base = vhost_kernel_get_vring_base, + .vhost_set_vring_kick = vhost_kernel_set_vring_kick, + .vhost_set_vring_call = vhost_kernel_set_vring_call, + .vhost_set_vring_err = vhost_kernel_set_vring_err, + .vhost_set_vring_busyloop_timeout = + vhost_kernel_set_vring_busyloop_timeout, + .vhost_set_features = vhost_kernel_set_features, + .vhost_get_features = vhost_kernel_get_features, + .vhost_set_backend_cap = vhost_kernel_set_backend_cap, + .vhost_set_owner = vhost_kernel_set_owner, + .vhost_reset_device = vhost_kernel_reset_device, + .vhost_get_vq_index = vhost_kernel_get_vq_index, + .vhost_vsock_set_guest_cid = vhost_kernel_vsock_set_guest_cid, + .vhost_vsock_set_running = vhost_kernel_vsock_set_running, + .vhost_set_iotlb_callback = vhost_kernel_set_iotlb_callback, + .vhost_send_device_iotlb_msg = vhost_kernel_send_device_iotlb_msg, +}; +#endif + +int vhost_backend_update_device_iotlb(struct vhost_dev *dev, + uint64_t iova, uint64_t uaddr, + uint64_t len, + IOMMUAccessFlags perm) +{ + struct vhost_iotlb_msg imsg; + + imsg.iova = iova; + imsg.uaddr = uaddr; + imsg.size = len; + imsg.type = VHOST_IOTLB_UPDATE; + + switch (perm) { + case IOMMU_RO: + imsg.perm = VHOST_ACCESS_RO; + break; + case IOMMU_WO: + imsg.perm = VHOST_ACCESS_WO; + break; + case IOMMU_RW: + imsg.perm = VHOST_ACCESS_RW; + break; + default: + return -EINVAL; + } + + if (dev->vhost_ops && dev->vhost_ops->vhost_send_device_iotlb_msg) + return dev->vhost_ops->vhost_send_device_iotlb_msg(dev, &imsg); + + return -ENODEV; +} + +int vhost_backend_invalidate_device_iotlb(struct vhost_dev *dev, + uint64_t iova, uint64_t len) +{ + struct vhost_iotlb_msg imsg; + + imsg.iova = iova; + imsg.size = len; + imsg.type = VHOST_IOTLB_INVALIDATE; + + if (dev->vhost_ops && dev->vhost_ops->vhost_send_device_iotlb_msg) + return dev->vhost_ops->vhost_send_device_iotlb_msg(dev, &imsg); + + return -ENODEV; +} + +int vhost_backend_handle_iotlb_msg(struct vhost_dev *dev, + struct vhost_iotlb_msg *imsg) +{ + int ret = 0; + + if (unlikely(!dev->vdev)) { + error_report("Unexpected IOTLB message when virtio device is stopped"); + return -EINVAL; + } + + switch (imsg->type) { + case VHOST_IOTLB_MISS: + ret = vhost_device_iotlb_miss(dev, imsg->iova, + imsg->perm != VHOST_ACCESS_RO); + break; + case VHOST_IOTLB_ACCESS_FAIL: + /* FIXME: report device iotlb error */ + error_report("Access failure IOTLB message type not supported"); + ret = -ENOTSUP; + break; + case VHOST_IOTLB_UPDATE: + case VHOST_IOTLB_INVALIDATE: + default: + error_report("Unexpected IOTLB message type"); + ret = -EINVAL; + break; + } + + return ret; +} diff --git a/hw/virtio/vhost-iova-tree.c b/hw/virtio/vhost-iova-tree.c new file mode 100644 index 00000000..3d03395a --- /dev/null +++ b/hw/virtio/vhost-iova-tree.c @@ -0,0 +1,110 @@ +/* + * vhost software live migration iova tree + * + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qemu/iova-tree.h" +#include "vhost-iova-tree.h" + +#define iova_min_addr qemu_real_host_page_size() + +/** + * VhostIOVATree, able to: + * - Translate iova address + * - Reverse translate iova address (from translated to iova) + * - Allocate IOVA regions for translated range (linear operation) + */ +struct VhostIOVATree { + /* First addressable iova address in the device */ + uint64_t iova_first; + + /* Last addressable iova address in the device */ + uint64_t iova_last; + + /* IOVA address to qemu memory maps. */ + IOVATree *iova_taddr_map; +}; + +/** + * Create a new IOVA tree + * + * Returns the new IOVA tree + */ +VhostIOVATree *vhost_iova_tree_new(hwaddr iova_first, hwaddr iova_last) +{ + VhostIOVATree *tree = g_new(VhostIOVATree, 1); + + /* Some devices do not like 0 addresses */ + tree->iova_first = MAX(iova_first, iova_min_addr); + tree->iova_last = iova_last; + + tree->iova_taddr_map = iova_tree_new(); + return tree; +} + +/** + * Delete an iova tree + */ +void vhost_iova_tree_delete(VhostIOVATree *iova_tree) +{ + iova_tree_destroy(iova_tree->iova_taddr_map); + g_free(iova_tree); +} + +/** + * Find the IOVA address stored from a memory address + * + * @tree: The iova tree + * @map: The map with the memory address + * + * Return the stored mapping, or NULL if not found. + */ +const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *tree, + const DMAMap *map) +{ + return iova_tree_find_iova(tree->iova_taddr_map, map); +} + +/** + * Allocate a new mapping + * + * @tree: The iova tree + * @map: The iova map + * + * Returns: + * - IOVA_OK if the map fits in the container + * - IOVA_ERR_INVALID if the map does not make sense (like size overflow) + * - IOVA_ERR_NOMEM if tree cannot allocate more space. + * + * It returns assignated iova in map->iova if return value is VHOST_DMA_MAP_OK. + */ +int vhost_iova_tree_map_alloc(VhostIOVATree *tree, DMAMap *map) +{ + /* Some vhost devices do not like addr 0. Skip first page */ + hwaddr iova_first = tree->iova_first ?: qemu_real_host_page_size(); + + if (map->translated_addr + map->size < map->translated_addr || + map->perm == IOMMU_NONE) { + return IOVA_ERR_INVALID; + } + + /* Allocate a node in IOVA address */ + return iova_tree_alloc_map(tree->iova_taddr_map, map, iova_first, + tree->iova_last); +} + +/** + * Remove existing mappings from iova tree + * + * @iova_tree: The vhost iova tree + * @map: The map to remove + */ +void vhost_iova_tree_remove(VhostIOVATree *iova_tree, DMAMap map) +{ + iova_tree_remove(iova_tree->iova_taddr_map, map); +} diff --git a/hw/virtio/vhost-iova-tree.h b/hw/virtio/vhost-iova-tree.h new file mode 100644 index 00000000..4adfd79f --- /dev/null +++ b/hw/virtio/vhost-iova-tree.h @@ -0,0 +1,27 @@ +/* + * vhost software live migration iova tree + * + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef HW_VIRTIO_VHOST_IOVA_TREE_H +#define HW_VIRTIO_VHOST_IOVA_TREE_H + +#include "qemu/iova-tree.h" +#include "exec/memory.h" + +typedef struct VhostIOVATree VhostIOVATree; + +VhostIOVATree *vhost_iova_tree_new(uint64_t iova_first, uint64_t iova_last); +void vhost_iova_tree_delete(VhostIOVATree *iova_tree); +G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostIOVATree, vhost_iova_tree_delete); + +const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *iova_tree, + const DMAMap *map); +int vhost_iova_tree_map_alloc(VhostIOVATree *iova_tree, DMAMap *map); +void vhost_iova_tree_remove(VhostIOVATree *iova_tree, DMAMap map); + +#endif diff --git a/hw/virtio/vhost-scsi-pci.c b/hw/virtio/vhost-scsi-pci.c new file mode 100644 index 00000000..08980bc2 --- /dev/null +++ b/hw/virtio/vhost-scsi-pci.c @@ -0,0 +1,104 @@ +/* + * Vhost scsi PCI bindings + * + * Copyright IBM, Corp. 2011 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * + * Changes for QEMU mainline + tcm_vhost kernel upstream: + * Nicholas Bellinger <nab@risingtidesystems.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qemu/osdep.h" + +#include "standard-headers/linux/virtio_pci.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/vhost-scsi.h" +#include "qapi/error.h" +#include "qemu/module.h" +#include "hw/virtio/virtio-pci.h" +#include "qom/object.h" + +typedef struct VHostSCSIPCI VHostSCSIPCI; + +/* + * vhost-scsi-pci: This extends VirtioPCIProxy. + */ +#define TYPE_VHOST_SCSI_PCI "vhost-scsi-pci-base" +DECLARE_INSTANCE_CHECKER(VHostSCSIPCI, VHOST_SCSI_PCI, + TYPE_VHOST_SCSI_PCI) + +struct VHostSCSIPCI { + VirtIOPCIProxy parent_obj; + VHostSCSI vdev; +}; + +static Property vhost_scsi_pci_properties[] = { + DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, + DEV_NVECTORS_UNSPECIFIED), + DEFINE_PROP_END_OF_LIST(), +}; + +static void vhost_scsi_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VHostSCSIPCI *dev = VHOST_SCSI_PCI(vpci_dev); + DeviceState *vdev = DEVICE(&dev->vdev); + VirtIOSCSIConf *conf = &dev->vdev.parent_obj.parent_obj.conf; + + if (conf->num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) { + conf->num_queues = + virtio_pci_optimal_num_queues(VIRTIO_SCSI_VQ_NUM_FIXED); + } + + if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) { + vpci_dev->nvectors = conf->num_queues + VIRTIO_SCSI_VQ_NUM_FIXED + 1; + } + + qdev_realize(vdev, BUS(&vpci_dev->bus), errp); +} + +static void vhost_scsi_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + k->realize = vhost_scsi_pci_realize; + set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); + device_class_set_props(dc, vhost_scsi_pci_properties); + pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; + pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_SCSI; + pcidev_k->revision = 0x00; + pcidev_k->class_id = PCI_CLASS_STORAGE_SCSI; +} + +static void vhost_scsi_pci_instance_init(Object *obj) +{ + VHostSCSIPCI *dev = VHOST_SCSI_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VHOST_SCSI); + object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev), + "bootindex"); +} + +static const VirtioPCIDeviceTypeInfo vhost_scsi_pci_info = { + .base_name = TYPE_VHOST_SCSI_PCI, + .generic_name = "vhost-scsi-pci", + .transitional_name = "vhost-scsi-pci-transitional", + .non_transitional_name = "vhost-scsi-pci-non-transitional", + .instance_size = sizeof(VHostSCSIPCI), + .instance_init = vhost_scsi_pci_instance_init, + .class_init = vhost_scsi_pci_class_init, +}; + +static void vhost_scsi_pci_register(void) +{ + virtio_pci_types_register(&vhost_scsi_pci_info); +} + +type_init(vhost_scsi_pci_register) diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c new file mode 100644 index 00000000..5bd14cad --- /dev/null +++ b/hw/virtio/vhost-shadow-virtqueue.c @@ -0,0 +1,769 @@ +/* + * vhost shadow virtqueue + * + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "hw/virtio/vhost-shadow-virtqueue.h" + +#include "qemu/error-report.h" +#include "qapi/error.h" +#include "qemu/main-loop.h" +#include "qemu/log.h" +#include "qemu/memalign.h" +#include "linux-headers/linux/vhost.h" + +/** + * Validate the transport device features that both guests can use with the SVQ + * and SVQs can use with the device. + * + * @dev_features: The features + * @errp: Error pointer + */ +bool vhost_svq_valid_features(uint64_t features, Error **errp) +{ + bool ok = true; + uint64_t svq_features = features; + + for (uint64_t b = VIRTIO_TRANSPORT_F_START; b <= VIRTIO_TRANSPORT_F_END; + ++b) { + switch (b) { + case VIRTIO_F_ANY_LAYOUT: + case VIRTIO_RING_F_EVENT_IDX: + continue; + + case VIRTIO_F_ACCESS_PLATFORM: + /* SVQ trust in the host's IOMMU to translate addresses */ + case VIRTIO_F_VERSION_1: + /* SVQ trust that the guest vring is little endian */ + if (!(svq_features & BIT_ULL(b))) { + svq_features |= BIT_ULL(b); + ok = false; + } + continue; + + default: + if (svq_features & BIT_ULL(b)) { + svq_features &= ~BIT_ULL(b); + ok = false; + } + } + } + + if (!ok) { + error_setg(errp, "SVQ Invalid device feature flags, offer: 0x%"PRIx64 + ", ok: 0x%"PRIx64, features, svq_features); + } + return ok; +} + +/** + * Number of descriptors that the SVQ can make available from the guest. + * + * @svq: The svq + */ +static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq) +{ + return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx); +} + +/** + * Translate addresses between the qemu's virtual address and the SVQ IOVA + * + * @svq: Shadow VirtQueue + * @vaddr: Translated IOVA addresses + * @iovec: Source qemu's VA addresses + * @num: Length of iovec and minimum length of vaddr + */ +static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq, + hwaddr *addrs, const struct iovec *iovec, + size_t num) +{ + if (num == 0) { + return true; + } + + for (size_t i = 0; i < num; ++i) { + DMAMap needle = { + .translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base, + .size = iovec[i].iov_len, + }; + Int128 needle_last, map_last; + size_t off; + + const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle); + /* + * Map cannot be NULL since iova map contains all guest space and + * qemu already has a physical address mapped + */ + if (unlikely(!map)) { + qemu_log_mask(LOG_GUEST_ERROR, + "Invalid address 0x%"HWADDR_PRIx" given by guest", + needle.translated_addr); + return false; + } + + off = needle.translated_addr - map->translated_addr; + addrs[i] = map->iova + off; + + needle_last = int128_add(int128_make64(needle.translated_addr), + int128_make64(iovec[i].iov_len)); + map_last = int128_make64(map->translated_addr + map->size); + if (unlikely(int128_gt(needle_last, map_last))) { + qemu_log_mask(LOG_GUEST_ERROR, + "Guest buffer expands over iova range"); + return false; + } + } + + return true; +} + +/** + * Write descriptors to SVQ vring + * + * @svq: The shadow virtqueue + * @sg: Cache for hwaddr + * @iovec: The iovec from the guest + * @num: iovec length + * @more_descs: True if more descriptors come in the chain + * @write: True if they are writeable descriptors + * + * Return true if success, false otherwise and print error. + */ +static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg, + const struct iovec *iovec, size_t num, + bool more_descs, bool write) +{ + uint16_t i = svq->free_head, last = svq->free_head; + unsigned n; + uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0; + vring_desc_t *descs = svq->vring.desc; + bool ok; + + if (num == 0) { + return true; + } + + ok = vhost_svq_translate_addr(svq, sg, iovec, num); + if (unlikely(!ok)) { + return false; + } + + for (n = 0; n < num; n++) { + if (more_descs || (n + 1 < num)) { + descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT); + descs[i].next = cpu_to_le16(svq->desc_next[i]); + } else { + descs[i].flags = flags; + } + descs[i].addr = cpu_to_le64(sg[n]); + descs[i].len = cpu_to_le32(iovec[n].iov_len); + + last = i; + i = cpu_to_le16(svq->desc_next[i]); + } + + svq->free_head = le16_to_cpu(svq->desc_next[last]); + return true; +} + +static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, + const struct iovec *out_sg, size_t out_num, + const struct iovec *in_sg, size_t in_num, + unsigned *head) +{ + unsigned avail_idx; + vring_avail_t *avail = svq->vring.avail; + bool ok; + g_autofree hwaddr *sgs = g_new(hwaddr, MAX(out_num, in_num)); + + *head = svq->free_head; + + /* We need some descriptors here */ + if (unlikely(!out_num && !in_num)) { + qemu_log_mask(LOG_GUEST_ERROR, + "Guest provided element with no descriptors"); + return false; + } + + ok = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, in_num > 0, + false); + if (unlikely(!ok)) { + return false; + } + + ok = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, false, true); + if (unlikely(!ok)) { + return false; + } + + /* + * Put the entry in the available array (but don't update avail->idx until + * they do sync). + */ + avail_idx = svq->shadow_avail_idx & (svq->vring.num - 1); + avail->ring[avail_idx] = cpu_to_le16(*head); + svq->shadow_avail_idx++; + + /* Update the avail index after write the descriptor */ + smp_wmb(); + avail->idx = cpu_to_le16(svq->shadow_avail_idx); + + return true; +} + +static void vhost_svq_kick(VhostShadowVirtqueue *svq) +{ + bool needs_kick; + + /* + * We need to expose the available array entries before checking the used + * flags + */ + smp_mb(); + + if (virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_EVENT_IDX)) { + uint16_t avail_event = *(uint16_t *)(&svq->vring.used->ring[svq->vring.num]); + needs_kick = vring_need_event(avail_event, svq->shadow_avail_idx, svq->shadow_avail_idx - 1); + } else { + needs_kick = !(svq->vring.used->flags & VRING_USED_F_NO_NOTIFY); + } + + if (!needs_kick) { + return; + } + + event_notifier_set(&svq->hdev_kick); +} + +/** + * Add an element to a SVQ. + * + * Return -EINVAL if element is invalid, -ENOSPC if dev queue is full + */ +int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg, + size_t out_num, const struct iovec *in_sg, size_t in_num, + VirtQueueElement *elem) +{ + unsigned qemu_head; + unsigned ndescs = in_num + out_num; + bool ok; + + if (unlikely(ndescs > vhost_svq_available_slots(svq))) { + return -ENOSPC; + } + + ok = vhost_svq_add_split(svq, out_sg, out_num, in_sg, in_num, &qemu_head); + if (unlikely(!ok)) { + return -EINVAL; + } + + svq->desc_state[qemu_head].elem = elem; + svq->desc_state[qemu_head].ndescs = ndescs; + vhost_svq_kick(svq); + return 0; +} + +/* Convenience wrapper to add a guest's element to SVQ */ +static int vhost_svq_add_element(VhostShadowVirtqueue *svq, + VirtQueueElement *elem) +{ + return vhost_svq_add(svq, elem->out_sg, elem->out_num, elem->in_sg, + elem->in_num, elem); +} + +/** + * Forward available buffers. + * + * @svq: Shadow VirtQueue + * + * Note that this function does not guarantee that all guest's available + * buffers are available to the device in SVQ avail ring. The guest may have + * exposed a GPA / GIOVA contiguous buffer, but it may not be contiguous in + * qemu vaddr. + * + * If that happens, guest's kick notifications will be disabled until the + * device uses some buffers. + */ +static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq) +{ + /* Clear event notifier */ + event_notifier_test_and_clear(&svq->svq_kick); + + /* Forward to the device as many available buffers as possible */ + do { + virtio_queue_set_notification(svq->vq, false); + + while (true) { + g_autofree VirtQueueElement *elem = NULL; + int r; + + if (svq->next_guest_avail_elem) { + elem = g_steal_pointer(&svq->next_guest_avail_elem); + } else { + elem = virtqueue_pop(svq->vq, sizeof(*elem)); + } + + if (!elem) { + break; + } + + if (svq->ops) { + r = svq->ops->avail_handler(svq, elem, svq->ops_opaque); + } else { + r = vhost_svq_add_element(svq, elem); + } + if (unlikely(r != 0)) { + if (r == -ENOSPC) { + /* + * This condition is possible since a contiguous buffer in + * GPA does not imply a contiguous buffer in qemu's VA + * scatter-gather segments. If that happens, the buffer + * exposed to the device needs to be a chain of descriptors + * at this moment. + * + * SVQ cannot hold more available buffers if we are here: + * queue the current guest descriptor and ignore kicks + * until some elements are used. + */ + svq->next_guest_avail_elem = g_steal_pointer(&elem); + } + + /* VQ is full or broken, just return and ignore kicks */ + return; + } + /* elem belongs to SVQ or external caller now */ + elem = NULL; + } + + virtio_queue_set_notification(svq->vq, true); + } while (!virtio_queue_empty(svq->vq)); +} + +/** + * Handle guest's kick. + * + * @n: guest kick event notifier, the one that guest set to notify svq. + */ +static void vhost_handle_guest_kick_notifier(EventNotifier *n) +{ + VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, svq_kick); + event_notifier_test_and_clear(n); + vhost_handle_guest_kick(svq); +} + +static bool vhost_svq_more_used(VhostShadowVirtqueue *svq) +{ + uint16_t *used_idx = &svq->vring.used->idx; + if (svq->last_used_idx != svq->shadow_used_idx) { + return true; + } + + svq->shadow_used_idx = cpu_to_le16(*(volatile uint16_t *)used_idx); + + return svq->last_used_idx != svq->shadow_used_idx; +} + +/** + * Enable vhost device calls after disable them. + * + * @svq: The svq + * + * It returns false if there are pending used buffers from the vhost device, + * avoiding the possible races between SVQ checking for more work and enabling + * callbacks. True if SVQ used vring has no more pending buffers. + */ +static bool vhost_svq_enable_notification(VhostShadowVirtqueue *svq) +{ + if (virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_EVENT_IDX)) { + uint16_t *used_event = (uint16_t *)&svq->vring.avail->ring[svq->vring.num]; + *used_event = svq->shadow_used_idx; + } else { + svq->vring.avail->flags &= ~cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT); + } + + /* Make sure the event is enabled before the read of used_idx */ + smp_mb(); + return !vhost_svq_more_used(svq); +} + +static void vhost_svq_disable_notification(VhostShadowVirtqueue *svq) +{ + /* + * No need to disable notification in the event idx case, since used event + * index is already an index too far away. + */ + if (!virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_EVENT_IDX)) { + svq->vring.avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT); + } +} + +static uint16_t vhost_svq_last_desc_of_chain(const VhostShadowVirtqueue *svq, + uint16_t num, uint16_t i) +{ + for (uint16_t j = 0; j < (num - 1); ++j) { + i = le16_to_cpu(svq->desc_next[i]); + } + + return i; +} + +static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq, + uint32_t *len) +{ + const vring_used_t *used = svq->vring.used; + vring_used_elem_t used_elem; + uint16_t last_used, last_used_chain, num; + + if (!vhost_svq_more_used(svq)) { + return NULL; + } + + /* Only get used array entries after they have been exposed by dev */ + smp_rmb(); + last_used = svq->last_used_idx & (svq->vring.num - 1); + used_elem.id = le32_to_cpu(used->ring[last_used].id); + used_elem.len = le32_to_cpu(used->ring[last_used].len); + + svq->last_used_idx++; + if (unlikely(used_elem.id >= svq->vring.num)) { + qemu_log_mask(LOG_GUEST_ERROR, "Device %s says index %u is used", + svq->vdev->name, used_elem.id); + return NULL; + } + + if (unlikely(!svq->desc_state[used_elem.id].ndescs)) { + qemu_log_mask(LOG_GUEST_ERROR, + "Device %s says index %u is used, but it was not available", + svq->vdev->name, used_elem.id); + return NULL; + } + + num = svq->desc_state[used_elem.id].ndescs; + svq->desc_state[used_elem.id].ndescs = 0; + last_used_chain = vhost_svq_last_desc_of_chain(svq, num, used_elem.id); + svq->desc_next[last_used_chain] = svq->free_head; + svq->free_head = used_elem.id; + + *len = used_elem.len; + return g_steal_pointer(&svq->desc_state[used_elem.id].elem); +} + +/** + * Push an element to SVQ, returning it to the guest. + */ +void vhost_svq_push_elem(VhostShadowVirtqueue *svq, + const VirtQueueElement *elem, uint32_t len) +{ + virtqueue_push(svq->vq, elem, len); + if (svq->next_guest_avail_elem) { + /* + * Avail ring was full when vhost_svq_flush was called, so it's a + * good moment to make more descriptors available if possible. + */ + vhost_handle_guest_kick(svq); + } +} + +static void vhost_svq_flush(VhostShadowVirtqueue *svq, + bool check_for_avail_queue) +{ + VirtQueue *vq = svq->vq; + + /* Forward as many used buffers as possible. */ + do { + unsigned i = 0; + + vhost_svq_disable_notification(svq); + while (true) { + uint32_t len; + g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq, &len); + if (!elem) { + break; + } + + if (unlikely(i >= svq->vring.num)) { + qemu_log_mask(LOG_GUEST_ERROR, + "More than %u used buffers obtained in a %u size SVQ", + i, svq->vring.num); + virtqueue_fill(vq, elem, len, i); + virtqueue_flush(vq, i); + return; + } + virtqueue_fill(vq, elem, len, i++); + } + + virtqueue_flush(vq, i); + event_notifier_set(&svq->svq_call); + + if (check_for_avail_queue && svq->next_guest_avail_elem) { + /* + * Avail ring was full when vhost_svq_flush was called, so it's a + * good moment to make more descriptors available if possible. + */ + vhost_handle_guest_kick(svq); + } + } while (!vhost_svq_enable_notification(svq)); +} + +/** + * Poll the SVQ for one device used buffer. + * + * This function race with main event loop SVQ polling, so extra + * synchronization is needed. + * + * Return the length written by the device. + */ +size_t vhost_svq_poll(VhostShadowVirtqueue *svq) +{ + int64_t start_us = g_get_monotonic_time(); + uint32_t len; + + do { + if (vhost_svq_more_used(svq)) { + break; + } + + if (unlikely(g_get_monotonic_time() - start_us > 10e6)) { + return 0; + } + } while (true); + + vhost_svq_get_buf(svq, &len); + return len; +} + +/** + * Forward used buffers. + * + * @n: hdev call event notifier, the one that device set to notify svq. + * + * Note that we are not making any buffers available in the loop, there is no + * way that it runs more than virtqueue size times. + */ +static void vhost_svq_handle_call(EventNotifier *n) +{ + VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, + hdev_call); + event_notifier_test_and_clear(n); + vhost_svq_flush(svq, true); +} + +/** + * Set the call notifier for the SVQ to call the guest + * + * @svq: Shadow virtqueue + * @call_fd: call notifier + * + * Called on BQL context. + */ +void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd) +{ + if (call_fd == VHOST_FILE_UNBIND) { + /* + * Fail event_notifier_set if called handling device call. + * + * SVQ still needs device notifications, since it needs to keep + * forwarding used buffers even with the unbind. + */ + memset(&svq->svq_call, 0, sizeof(svq->svq_call)); + } else { + event_notifier_init_fd(&svq->svq_call, call_fd); + } +} + +/** + * Get the shadow vq vring address. + * @svq: Shadow virtqueue + * @addr: Destination to store address + */ +void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, + struct vhost_vring_addr *addr) +{ + addr->desc_user_addr = (uint64_t)(uintptr_t)svq->vring.desc; + addr->avail_user_addr = (uint64_t)(uintptr_t)svq->vring.avail; + addr->used_user_addr = (uint64_t)(uintptr_t)svq->vring.used; +} + +size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq) +{ + size_t desc_size = sizeof(vring_desc_t) * svq->vring.num; + size_t avail_size = offsetof(vring_avail_t, ring[svq->vring.num]) + + sizeof(uint16_t); + + return ROUND_UP(desc_size + avail_size, qemu_real_host_page_size()); +} + +size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq) +{ + size_t used_size = offsetof(vring_used_t, ring[svq->vring.num]) + + sizeof(uint16_t); + return ROUND_UP(used_size, qemu_real_host_page_size()); +} + +/** + * Set a new file descriptor for the guest to kick the SVQ and notify for avail + * + * @svq: The svq + * @svq_kick_fd: The svq kick fd + * + * Note that the SVQ will never close the old file descriptor. + */ +void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd) +{ + EventNotifier *svq_kick = &svq->svq_kick; + bool poll_stop = VHOST_FILE_UNBIND != event_notifier_get_fd(svq_kick); + bool poll_start = svq_kick_fd != VHOST_FILE_UNBIND; + + if (poll_stop) { + event_notifier_set_handler(svq_kick, NULL); + } + + event_notifier_init_fd(svq_kick, svq_kick_fd); + /* + * event_notifier_set_handler already checks for guest's notifications if + * they arrive at the new file descriptor in the switch, so there is no + * need to explicitly check for them. + */ + if (poll_start) { + event_notifier_set(svq_kick); + event_notifier_set_handler(svq_kick, vhost_handle_guest_kick_notifier); + } +} + +/** + * Start the shadow virtqueue operation. + * + * @svq: Shadow Virtqueue + * @vdev: VirtIO device + * @vq: Virtqueue to shadow + */ +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, + VirtQueue *vq) +{ + size_t desc_size, driver_size, device_size; + + svq->next_guest_avail_elem = NULL; + svq->shadow_avail_idx = 0; + svq->shadow_used_idx = 0; + svq->last_used_idx = 0; + svq->vdev = vdev; + svq->vq = vq; + + svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq)); + driver_size = vhost_svq_driver_area_size(svq); + device_size = vhost_svq_device_area_size(svq); + svq->vring.desc = qemu_memalign(qemu_real_host_page_size(), driver_size); + desc_size = sizeof(vring_desc_t) * svq->vring.num; + svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size); + memset(svq->vring.desc, 0, driver_size); + svq->vring.used = qemu_memalign(qemu_real_host_page_size(), device_size); + memset(svq->vring.used, 0, device_size); + svq->desc_state = g_new0(SVQDescState, svq->vring.num); + svq->desc_next = g_new0(uint16_t, svq->vring.num); + for (unsigned i = 0; i < svq->vring.num - 1; i++) { + svq->desc_next[i] = cpu_to_le16(i + 1); + } +} + +/** + * Stop the shadow virtqueue operation. + * @svq: Shadow Virtqueue + */ +void vhost_svq_stop(VhostShadowVirtqueue *svq) +{ + vhost_svq_set_svq_kick_fd(svq, VHOST_FILE_UNBIND); + g_autofree VirtQueueElement *next_avail_elem = NULL; + + if (!svq->vq) { + return; + } + + /* Send all pending used descriptors to guest */ + vhost_svq_flush(svq, false); + + for (unsigned i = 0; i < svq->vring.num; ++i) { + g_autofree VirtQueueElement *elem = NULL; + elem = g_steal_pointer(&svq->desc_state[i].elem); + if (elem) { + virtqueue_detach_element(svq->vq, elem, 0); + } + } + + next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem); + if (next_avail_elem) { + virtqueue_detach_element(svq->vq, next_avail_elem, 0); + } + svq->vq = NULL; + g_free(svq->desc_next); + g_free(svq->desc_state); + qemu_vfree(svq->vring.desc); + qemu_vfree(svq->vring.used); +} + +/** + * Creates vhost shadow virtqueue, and instructs the vhost device to use the + * shadow methods and file descriptors. + * + * @iova_tree: Tree to perform descriptors translations + * @ops: SVQ owner callbacks + * @ops_opaque: ops opaque pointer + * + * Returns the new virtqueue or NULL. + * + * In case of error, reason is reported through error_report. + */ +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree, + const VhostShadowVirtqueueOps *ops, + void *ops_opaque) +{ + g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1); + int r; + + r = event_notifier_init(&svq->hdev_kick, 0); + if (r != 0) { + error_report("Couldn't create kick event notifier: %s (%d)", + g_strerror(errno), errno); + goto err_init_hdev_kick; + } + + r = event_notifier_init(&svq->hdev_call, 0); + if (r != 0) { + error_report("Couldn't create call event notifier: %s (%d)", + g_strerror(errno), errno); + goto err_init_hdev_call; + } + + event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND); + event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call); + svq->iova_tree = iova_tree; + svq->ops = ops; + svq->ops_opaque = ops_opaque; + return g_steal_pointer(&svq); + +err_init_hdev_call: + event_notifier_cleanup(&svq->hdev_kick); + +err_init_hdev_kick: + return NULL; +} + +/** + * Free the resources of the shadow virtqueue. + * + * @pvq: gpointer to SVQ so it can be used by autofree functions. + */ +void vhost_svq_free(gpointer pvq) +{ + VhostShadowVirtqueue *vq = pvq; + vhost_svq_stop(vq); + event_notifier_cleanup(&vq->hdev_kick); + event_notifier_set_handler(&vq->hdev_call, NULL); + event_notifier_cleanup(&vq->hdev_call); + g_free(vq); +} diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h new file mode 100644 index 00000000..d04c34a5 --- /dev/null +++ b/hw/virtio/vhost-shadow-virtqueue.h @@ -0,0 +1,139 @@ +/* + * vhost shadow virtqueue + * + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef VHOST_SHADOW_VIRTQUEUE_H +#define VHOST_SHADOW_VIRTQUEUE_H + +#include "qemu/event_notifier.h" +#include "hw/virtio/virtio.h" +#include "standard-headers/linux/vhost_types.h" +#include "hw/virtio/vhost-iova-tree.h" + +typedef struct SVQDescState { + VirtQueueElement *elem; + + /* + * Number of descriptors exposed to the device. May or may not match + * guest's + */ + unsigned int ndescs; +} SVQDescState; + +typedef struct VhostShadowVirtqueue VhostShadowVirtqueue; + +/** + * Callback to handle an avail buffer. + * + * @svq: Shadow virtqueue + * @elem: Element placed in the queue by the guest + * @vq_callback_opaque: Opaque + * + * Returns 0 if the vq is running as expected. + * + * Note that ownership of elem is transferred to the callback. + */ +typedef int (*VirtQueueAvailCallback)(VhostShadowVirtqueue *svq, + VirtQueueElement *elem, + void *vq_callback_opaque); + +typedef struct VhostShadowVirtqueueOps { + VirtQueueAvailCallback avail_handler; +} VhostShadowVirtqueueOps; + +/* Shadow virtqueue to relay notifications */ +typedef struct VhostShadowVirtqueue { + /* Shadow vring */ + struct vring vring; + + /* Shadow kick notifier, sent to vhost */ + EventNotifier hdev_kick; + /* Shadow call notifier, sent to vhost */ + EventNotifier hdev_call; + + /* + * Borrowed virtqueue's guest to host notifier. To borrow it in this event + * notifier allows to recover the VhostShadowVirtqueue from the event loop + * easily. If we use the VirtQueue's one, we don't have an easy way to + * retrieve VhostShadowVirtqueue. + * + * So shadow virtqueue must not clean it, or we would lose VirtQueue one. + */ + EventNotifier svq_kick; + + /* Guest's call notifier, where the SVQ calls guest. */ + EventNotifier svq_call; + + /* Virtio queue shadowing */ + VirtQueue *vq; + + /* Virtio device */ + VirtIODevice *vdev; + + /* IOVA mapping */ + VhostIOVATree *iova_tree; + + /* SVQ vring descriptors state */ + SVQDescState *desc_state; + + /* Next VirtQueue element that guest made available */ + VirtQueueElement *next_guest_avail_elem; + + /* + * Backup next field for each descriptor so we can recover securely, not + * needing to trust the device access. + */ + uint16_t *desc_next; + + /* Caller callbacks */ + const VhostShadowVirtqueueOps *ops; + + /* Caller callbacks opaque */ + void *ops_opaque; + + /* Next head to expose to the device */ + uint16_t shadow_avail_idx; + + /* Next free descriptor */ + uint16_t free_head; + + /* Last seen used idx */ + uint16_t shadow_used_idx; + + /* Next head to consume from the device */ + uint16_t last_used_idx; +} VhostShadowVirtqueue; + +bool vhost_svq_valid_features(uint64_t features, Error **errp); + +void vhost_svq_push_elem(VhostShadowVirtqueue *svq, + const VirtQueueElement *elem, uint32_t len); +int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg, + size_t out_num, const struct iovec *in_sg, size_t in_num, + VirtQueueElement *elem); +size_t vhost_svq_poll(VhostShadowVirtqueue *svq); + +void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); +void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd); +void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, + struct vhost_vring_addr *addr); +size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq); +size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq); + +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, + VirtQueue *vq); +void vhost_svq_stop(VhostShadowVirtqueue *svq); + +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree, + const VhostShadowVirtqueueOps *ops, + void *ops_opaque); + +void vhost_svq_free(gpointer vq); +G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free); + +#endif diff --git a/hw/virtio/vhost-stub.c b/hw/virtio/vhost-stub.c new file mode 100644 index 00000000..c175148f --- /dev/null +++ b/hw/virtio/vhost-stub.c @@ -0,0 +1,17 @@ +#include "qemu/osdep.h" +#include "hw/virtio/vhost.h" +#include "hw/virtio/vhost-user.h" + +bool vhost_has_free_slot(void) +{ + return true; +} + +bool vhost_user_init(VhostUserState *user, CharBackend *chr, Error **errp) +{ + return false; +} + +void vhost_user_cleanup(VhostUserState *user) +{ +} diff --git a/hw/virtio/vhost-user-blk-pci.c b/hw/virtio/vhost-user-blk-pci.c new file mode 100644 index 00000000..eef8641a --- /dev/null +++ b/hw/virtio/vhost-user-blk-pci.c @@ -0,0 +1,109 @@ +/* + * Vhost user blk PCI Bindings + * + * Copyright(C) 2017 Intel Corporation. + * + * Authors: + * Changpeng Liu <changpeng.liu@intel.com> + * + * Largely based on the "vhost-user-scsi.c" and "vhost-scsi.c" implemented by: + * Felipe Franciosi <felipe@nutanix.com> + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * Nicholas Bellinger <nab@risingtidesystems.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qemu/osdep.h" + +#include "standard-headers/linux/virtio_pci.h" +#include "hw/virtio/virtio.h" +#include "hw/virtio/vhost-user-blk.h" +#include "hw/pci/pci.h" +#include "hw/qdev-properties.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qemu/module.h" +#include "hw/virtio/virtio-pci.h" +#include "qom/object.h" + +typedef struct VHostUserBlkPCI VHostUserBlkPCI; + +/* + * vhost-user-blk-pci: This extends VirtioPCIProxy. + */ +#define TYPE_VHOST_USER_BLK_PCI "vhost-user-blk-pci-base" +DECLARE_INSTANCE_CHECKER(VHostUserBlkPCI, VHOST_USER_BLK_PCI, + TYPE_VHOST_USER_BLK_PCI) + +struct VHostUserBlkPCI { + VirtIOPCIProxy parent_obj; + VHostUserBlk vdev; +}; + +static Property vhost_user_blk_pci_properties[] = { + DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0), + DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, + DEV_NVECTORS_UNSPECIFIED), + DEFINE_PROP_END_OF_LIST(), +}; + +static void vhost_user_blk_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VHostUserBlkPCI *dev = VHOST_USER_BLK_PCI(vpci_dev); + DeviceState *vdev = DEVICE(&dev->vdev); + + if (dev->vdev.num_queues == VHOST_USER_BLK_AUTO_NUM_QUEUES) { + dev->vdev.num_queues = virtio_pci_optimal_num_queues(0); + } + + if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) { + vpci_dev->nvectors = dev->vdev.num_queues + 1; + } + + qdev_realize(vdev, BUS(&vpci_dev->bus), errp); +} + +static void vhost_user_blk_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + + set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); + device_class_set_props(dc, vhost_user_blk_pci_properties); + k->realize = vhost_user_blk_pci_realize; + pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; + pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_BLOCK; + pcidev_k->revision = VIRTIO_PCI_ABI_VERSION; + pcidev_k->class_id = PCI_CLASS_STORAGE_SCSI; +} + +static void vhost_user_blk_pci_instance_init(Object *obj) +{ + VHostUserBlkPCI *dev = VHOST_USER_BLK_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VHOST_USER_BLK); + object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev), + "bootindex"); +} + +static const VirtioPCIDeviceTypeInfo vhost_user_blk_pci_info = { + .base_name = TYPE_VHOST_USER_BLK_PCI, + .generic_name = "vhost-user-blk-pci", + .transitional_name = "vhost-user-blk-pci-transitional", + .non_transitional_name = "vhost-user-blk-pci-non-transitional", + .instance_size = sizeof(VHostUserBlkPCI), + .instance_init = vhost_user_blk_pci_instance_init, + .class_init = vhost_user_blk_pci_class_init, +}; + +static void vhost_user_blk_pci_register(void) +{ + virtio_pci_types_register(&vhost_user_blk_pci_info); +} + +type_init(vhost_user_blk_pci_register) diff --git a/hw/virtio/vhost-user-fs-pci.c b/hw/virtio/vhost-user-fs-pci.c new file mode 100644 index 00000000..6829b8b7 --- /dev/null +++ b/hw/virtio/vhost-user-fs-pci.c @@ -0,0 +1,88 @@ +/* + * Vhost-user filesystem virtio device PCI glue + * + * Copyright 2018-2019 Red Hat, Inc. + * + * Authors: + * Dr. David Alan Gilbert <dgilbert@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. See the COPYING file in the + * top-level directory. + */ + +#include "qemu/osdep.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/vhost-user-fs.h" +#include "hw/virtio/virtio-pci.h" +#include "qom/object.h" + +struct VHostUserFSPCI { + VirtIOPCIProxy parent_obj; + VHostUserFS vdev; +}; + +typedef struct VHostUserFSPCI VHostUserFSPCI; + +#define TYPE_VHOST_USER_FS_PCI "vhost-user-fs-pci-base" + +DECLARE_INSTANCE_CHECKER(VHostUserFSPCI, VHOST_USER_FS_PCI, + TYPE_VHOST_USER_FS_PCI) + +static Property vhost_user_fs_pci_properties[] = { + DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, + DEV_NVECTORS_UNSPECIFIED), + DEFINE_PROP_END_OF_LIST(), +}; + +static void vhost_user_fs_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VHostUserFSPCI *dev = VHOST_USER_FS_PCI(vpci_dev); + DeviceState *vdev = DEVICE(&dev->vdev); + + if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) { + /* Also reserve config change and hiprio queue vectors */ + vpci_dev->nvectors = dev->vdev.conf.num_request_queues + 2; + } + + qdev_realize(vdev, BUS(&vpci_dev->bus), errp); +} + +static void vhost_user_fs_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + k->realize = vhost_user_fs_pci_realize; + set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); + device_class_set_props(dc, vhost_user_fs_pci_properties); + pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; + pcidev_k->device_id = 0; /* Set by virtio-pci based on virtio id */ + pcidev_k->revision = 0x00; + pcidev_k->class_id = PCI_CLASS_STORAGE_OTHER; +} + +static void vhost_user_fs_pci_instance_init(Object *obj) +{ + VHostUserFSPCI *dev = VHOST_USER_FS_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VHOST_USER_FS); + object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev), + "bootindex"); +} + +static const VirtioPCIDeviceTypeInfo vhost_user_fs_pci_info = { + .base_name = TYPE_VHOST_USER_FS_PCI, + .non_transitional_name = "vhost-user-fs-pci", + .instance_size = sizeof(VHostUserFSPCI), + .instance_init = vhost_user_fs_pci_instance_init, + .class_init = vhost_user_fs_pci_class_init, +}; + +static void vhost_user_fs_pci_register(void) +{ + virtio_pci_types_register(&vhost_user_fs_pci_info); +} + +type_init(vhost_user_fs_pci_register); diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c new file mode 100644 index 00000000..d97b179e --- /dev/null +++ b/hw/virtio/vhost-user-fs.c @@ -0,0 +1,336 @@ +/* + * Vhost-user filesystem virtio device + * + * Copyright 2018-2019 Red Hat, Inc. + * + * Authors: + * Stefan Hajnoczi <stefanha@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. See the COPYING file in the + * top-level directory. + */ + +#include "qemu/osdep.h" +#include <sys/ioctl.h> +#include "standard-headers/linux/virtio_fs.h" +#include "qapi/error.h" +#include "hw/qdev-properties.h" +#include "hw/qdev-properties-system.h" +#include "hw/virtio/virtio-bus.h" +#include "hw/virtio/virtio-access.h" +#include "qemu/error-report.h" +#include "hw/virtio/vhost.h" +#include "hw/virtio/vhost-user-fs.h" +#include "monitor/monitor.h" +#include "sysemu/sysemu.h" + +static const int user_feature_bits[] = { + VIRTIO_F_VERSION_1, + VIRTIO_RING_F_INDIRECT_DESC, + VIRTIO_RING_F_EVENT_IDX, + VIRTIO_F_NOTIFY_ON_EMPTY, + VIRTIO_F_RING_PACKED, + VIRTIO_F_IOMMU_PLATFORM, + VIRTIO_F_RING_RESET, + + VHOST_INVALID_FEATURE_BIT +}; + +static void vuf_get_config(VirtIODevice *vdev, uint8_t *config) +{ + VHostUserFS *fs = VHOST_USER_FS(vdev); + struct virtio_fs_config fscfg = {}; + + memcpy((char *)fscfg.tag, fs->conf.tag, + MIN(strlen(fs->conf.tag) + 1, sizeof(fscfg.tag))); + + virtio_stl_p(vdev, &fscfg.num_request_queues, fs->conf.num_request_queues); + + memcpy(config, &fscfg, sizeof(fscfg)); +} + +static void vuf_start(VirtIODevice *vdev) +{ + VHostUserFS *fs = VHOST_USER_FS(vdev); + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + int ret; + int i; + + if (!k->set_guest_notifiers) { + error_report("binding does not support guest notifiers"); + return; + } + + ret = vhost_dev_enable_notifiers(&fs->vhost_dev, vdev); + if (ret < 0) { + error_report("Error enabling host notifiers: %d", -ret); + return; + } + + ret = k->set_guest_notifiers(qbus->parent, fs->vhost_dev.nvqs, true); + if (ret < 0) { + error_report("Error binding guest notifier: %d", -ret); + goto err_host_notifiers; + } + + fs->vhost_dev.acked_features = vdev->guest_features; + ret = vhost_dev_start(&fs->vhost_dev, vdev, true); + if (ret < 0) { + error_report("Error starting vhost: %d", -ret); + goto err_guest_notifiers; + } + + /* + * guest_notifier_mask/pending not used yet, so just unmask + * everything here. virtio-pci will do the right thing by + * enabling/disabling irqfd. + */ + for (i = 0; i < fs->vhost_dev.nvqs; i++) { + vhost_virtqueue_mask(&fs->vhost_dev, vdev, i, false); + } + + return; + +err_guest_notifiers: + k->set_guest_notifiers(qbus->parent, fs->vhost_dev.nvqs, false); +err_host_notifiers: + vhost_dev_disable_notifiers(&fs->vhost_dev, vdev); +} + +static void vuf_stop(VirtIODevice *vdev) +{ + VHostUserFS *fs = VHOST_USER_FS(vdev); + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + int ret; + + if (!k->set_guest_notifiers) { + return; + } + + vhost_dev_stop(&fs->vhost_dev, vdev, true); + + ret = k->set_guest_notifiers(qbus->parent, fs->vhost_dev.nvqs, false); + if (ret < 0) { + error_report("vhost guest notifier cleanup failed: %d", ret); + return; + } + + vhost_dev_disable_notifiers(&fs->vhost_dev, vdev); +} + +static void vuf_set_status(VirtIODevice *vdev, uint8_t status) +{ + VHostUserFS *fs = VHOST_USER_FS(vdev); + bool should_start = virtio_device_should_start(vdev, status); + + if (vhost_dev_is_started(&fs->vhost_dev) == should_start) { + return; + } + + if (should_start) { + vuf_start(vdev); + } else { + vuf_stop(vdev); + } +} + +static uint64_t vuf_get_features(VirtIODevice *vdev, + uint64_t features, + Error **errp) +{ + VHostUserFS *fs = VHOST_USER_FS(vdev); + + return vhost_get_features(&fs->vhost_dev, user_feature_bits, features); +} + +static void vuf_handle_output(VirtIODevice *vdev, VirtQueue *vq) +{ + /* + * Not normally called; it's the daemon that handles the queue; + * however virtio's cleanup path can call this. + */ +} + +static void vuf_guest_notifier_mask(VirtIODevice *vdev, int idx, + bool mask) +{ + VHostUserFS *fs = VHOST_USER_FS(vdev); + + vhost_virtqueue_mask(&fs->vhost_dev, vdev, idx, mask); +} + +static bool vuf_guest_notifier_pending(VirtIODevice *vdev, int idx) +{ + VHostUserFS *fs = VHOST_USER_FS(vdev); + + return vhost_virtqueue_pending(&fs->vhost_dev, idx); +} + +static void vuf_device_realize(DeviceState *dev, Error **errp) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserFS *fs = VHOST_USER_FS(dev); + unsigned int i; + size_t len; + int ret; + + if (!fs->conf.chardev.chr) { + error_setg(errp, "missing chardev"); + return; + } + + if (!fs->conf.tag) { + error_setg(errp, "missing tag property"); + return; + } + len = strlen(fs->conf.tag); + if (len == 0) { + error_setg(errp, "tag property cannot be empty"); + return; + } + if (len > sizeof_field(struct virtio_fs_config, tag)) { + error_setg(errp, "tag property must be %zu bytes or less", + sizeof_field(struct virtio_fs_config, tag)); + return; + } + + if (fs->conf.num_request_queues == 0) { + error_setg(errp, "num-request-queues property must be larger than 0"); + return; + } + + if (!is_power_of_2(fs->conf.queue_size)) { + error_setg(errp, "queue-size property must be a power of 2"); + return; + } + + if (fs->conf.queue_size > VIRTQUEUE_MAX_SIZE) { + error_setg(errp, "queue-size property must be %u or smaller", + VIRTQUEUE_MAX_SIZE); + return; + } + + if (!vhost_user_init(&fs->vhost_user, &fs->conf.chardev, errp)) { + return; + } + + virtio_init(vdev, VIRTIO_ID_FS, sizeof(struct virtio_fs_config)); + + /* Hiprio queue */ + fs->hiprio_vq = virtio_add_queue(vdev, fs->conf.queue_size, vuf_handle_output); + + /* Request queues */ + fs->req_vqs = g_new(VirtQueue *, fs->conf.num_request_queues); + for (i = 0; i < fs->conf.num_request_queues; i++) { + fs->req_vqs[i] = virtio_add_queue(vdev, fs->conf.queue_size, vuf_handle_output); + } + + /* 1 high prio queue, plus the number configured */ + fs->vhost_dev.nvqs = 1 + fs->conf.num_request_queues; + fs->vhost_dev.vqs = g_new0(struct vhost_virtqueue, fs->vhost_dev.nvqs); + ret = vhost_dev_init(&fs->vhost_dev, &fs->vhost_user, + VHOST_BACKEND_TYPE_USER, 0, errp); + if (ret < 0) { + goto err_virtio; + } + + return; + +err_virtio: + vhost_user_cleanup(&fs->vhost_user); + virtio_delete_queue(fs->hiprio_vq); + for (i = 0; i < fs->conf.num_request_queues; i++) { + virtio_delete_queue(fs->req_vqs[i]); + } + g_free(fs->req_vqs); + virtio_cleanup(vdev); + g_free(fs->vhost_dev.vqs); + return; +} + +static void vuf_device_unrealize(DeviceState *dev) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserFS *fs = VHOST_USER_FS(dev); + int i; + + /* This will stop vhost backend if appropriate. */ + vuf_set_status(vdev, 0); + + vhost_dev_cleanup(&fs->vhost_dev); + + vhost_user_cleanup(&fs->vhost_user); + + virtio_delete_queue(fs->hiprio_vq); + for (i = 0; i < fs->conf.num_request_queues; i++) { + virtio_delete_queue(fs->req_vqs[i]); + } + g_free(fs->req_vqs); + virtio_cleanup(vdev); + g_free(fs->vhost_dev.vqs); + fs->vhost_dev.vqs = NULL; +} + +static struct vhost_dev *vuf_get_vhost(VirtIODevice *vdev) +{ + VHostUserFS *fs = VHOST_USER_FS(vdev); + return &fs->vhost_dev; +} + +static const VMStateDescription vuf_vmstate = { + .name = "vhost-user-fs", + .unmigratable = 1, +}; + +static Property vuf_properties[] = { + DEFINE_PROP_CHR("chardev", VHostUserFS, conf.chardev), + DEFINE_PROP_STRING("tag", VHostUserFS, conf.tag), + DEFINE_PROP_UINT16("num-request-queues", VHostUserFS, + conf.num_request_queues, 1), + DEFINE_PROP_UINT16("queue-size", VHostUserFS, conf.queue_size, 128), + DEFINE_PROP_END_OF_LIST(), +}; + +static void vuf_instance_init(Object *obj) +{ + VHostUserFS *fs = VHOST_USER_FS(obj); + + device_add_bootindex_property(obj, &fs->bootindex, "bootindex", + "/filesystem@0", DEVICE(obj)); +} + +static void vuf_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + + device_class_set_props(dc, vuf_properties); + dc->vmsd = &vuf_vmstate; + set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); + vdc->realize = vuf_device_realize; + vdc->unrealize = vuf_device_unrealize; + vdc->get_features = vuf_get_features; + vdc->get_config = vuf_get_config; + vdc->set_status = vuf_set_status; + vdc->guest_notifier_mask = vuf_guest_notifier_mask; + vdc->guest_notifier_pending = vuf_guest_notifier_pending; + vdc->get_vhost = vuf_get_vhost; +} + +static const TypeInfo vuf_info = { + .name = TYPE_VHOST_USER_FS, + .parent = TYPE_VIRTIO_DEVICE, + .instance_size = sizeof(VHostUserFS), + .instance_init = vuf_instance_init, + .class_init = vuf_class_init, +}; + +static void vuf_register_types(void) +{ + type_register_static(&vuf_info); +} + +type_init(vuf_register_types) diff --git a/hw/virtio/vhost-user-gpio-pci.c b/hw/virtio/vhost-user-gpio-pci.c new file mode 100644 index 00000000..b3028a24 --- /dev/null +++ b/hw/virtio/vhost-user-gpio-pci.c @@ -0,0 +1,69 @@ +/* + * Vhost-user gpio virtio device PCI glue + * + * Copyright (c) 2022 Viresh Kumar <viresh.kumar@linaro.org> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/vhost-user-gpio.h" +#include "hw/virtio/virtio-pci.h" + +struct VHostUserGPIOPCI { + VirtIOPCIProxy parent_obj; + VHostUserGPIO vdev; +}; + +typedef struct VHostUserGPIOPCI VHostUserGPIOPCI; + +#define TYPE_VHOST_USER_GPIO_PCI "vhost-user-gpio-pci-base" + +DECLARE_INSTANCE_CHECKER(VHostUserGPIOPCI, VHOST_USER_GPIO_PCI, + TYPE_VHOST_USER_GPIO_PCI) + +static void vhost_user_gpio_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VHostUserGPIOPCI *dev = VHOST_USER_GPIO_PCI(vpci_dev); + DeviceState *vdev = DEVICE(&dev->vdev); + + vpci_dev->nvectors = 1; + qdev_realize(vdev, BUS(&vpci_dev->bus), errp); +} + +static void vhost_user_gpio_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + k->realize = vhost_user_gpio_pci_realize; + set_bit(DEVICE_CATEGORY_INPUT, dc->categories); + pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; + pcidev_k->device_id = 0; /* Set by virtio-pci based on virtio id */ + pcidev_k->revision = 0x00; + pcidev_k->class_id = PCI_CLASS_COMMUNICATION_OTHER; +} + +static void vhost_user_gpio_pci_instance_init(Object *obj) +{ + VHostUserGPIOPCI *dev = VHOST_USER_GPIO_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VHOST_USER_GPIO); +} + +static const VirtioPCIDeviceTypeInfo vhost_user_gpio_pci_info = { + .base_name = TYPE_VHOST_USER_GPIO_PCI, + .non_transitional_name = "vhost-user-gpio-pci", + .instance_size = sizeof(VHostUserGPIOPCI), + .instance_init = vhost_user_gpio_pci_instance_init, + .class_init = vhost_user_gpio_pci_class_init, +}; + +static void vhost_user_gpio_pci_register(void) +{ + virtio_pci_types_register(&vhost_user_gpio_pci_info); +} + +type_init(vhost_user_gpio_pci_register); diff --git a/hw/virtio/vhost-user-gpio.c b/hw/virtio/vhost-user-gpio.c new file mode 100644 index 00000000..b7b82a10 --- /dev/null +++ b/hw/virtio/vhost-user-gpio.c @@ -0,0 +1,418 @@ +/* + * Vhost-user GPIO virtio device + * + * Copyright (c) 2022 Viresh Kumar <viresh.kumar@linaro.org> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/virtio-bus.h" +#include "hw/virtio/vhost-user-gpio.h" +#include "qemu/error-report.h" +#include "standard-headers/linux/virtio_ids.h" +#include "trace.h" + +#define REALIZE_CONNECTION_RETRIES 3 + +/* Features required from VirtIO */ +static const int feature_bits[] = { + VIRTIO_F_VERSION_1, + VIRTIO_F_NOTIFY_ON_EMPTY, + VIRTIO_RING_F_INDIRECT_DESC, + VIRTIO_RING_F_EVENT_IDX, + VIRTIO_GPIO_F_IRQ, + VIRTIO_F_RING_RESET, + VHOST_INVALID_FEATURE_BIT +}; + +static void vu_gpio_get_config(VirtIODevice *vdev, uint8_t *config) +{ + VHostUserGPIO *gpio = VHOST_USER_GPIO(vdev); + + memcpy(config, &gpio->config, sizeof(gpio->config)); +} + +static int vu_gpio_config_notifier(struct vhost_dev *dev) +{ + VHostUserGPIO *gpio = VHOST_USER_GPIO(dev->vdev); + + memcpy(dev->vdev->config, &gpio->config, sizeof(gpio->config)); + virtio_notify_config(dev->vdev); + + return 0; +} + +const VhostDevConfigOps gpio_ops = { + .vhost_dev_config_notifier = vu_gpio_config_notifier, +}; + +static int vu_gpio_start(VirtIODevice *vdev) +{ + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + VHostUserGPIO *gpio = VHOST_USER_GPIO(vdev); + struct vhost_dev *vhost_dev = &gpio->vhost_dev; + int ret, i; + + if (!k->set_guest_notifiers) { + error_report("binding does not support guest notifiers"); + return -ENOSYS; + } + + ret = vhost_dev_enable_notifiers(vhost_dev, vdev); + if (ret < 0) { + error_report("Error enabling host notifiers: %d", ret); + return ret; + } + + ret = k->set_guest_notifiers(qbus->parent, vhost_dev->nvqs, true); + if (ret < 0) { + error_report("Error binding guest notifier: %d", ret); + goto err_host_notifiers; + } + + /* + * Before we start up we need to ensure we have the final feature + * set needed for the vhost configuration. The backend may also + * apply backend_features when the feature set is sent. + */ + vhost_ack_features(&gpio->vhost_dev, feature_bits, vdev->guest_features); + + ret = vhost_dev_start(&gpio->vhost_dev, vdev, false); + if (ret < 0) { + error_report("Error starting vhost-user-gpio: %d", ret); + goto err_guest_notifiers; + } + gpio->started_vu = true; + + /* + * guest_notifier_mask/pending not used yet, so just unmask + * everything here. virtio-pci will do the right thing by + * enabling/disabling irqfd. + */ + for (i = 0; i < gpio->vhost_dev.nvqs; i++) { + vhost_virtqueue_mask(&gpio->vhost_dev, vdev, i, false); + } + + /* + * As we must have VHOST_USER_F_PROTOCOL_FEATURES (because + * VHOST_USER_GET_CONFIG requires it) we need to explicitly enable + * the vrings. + */ + g_assert(vhost_dev->vhost_ops && + vhost_dev->vhost_ops->vhost_set_vring_enable); + ret = vhost_dev->vhost_ops->vhost_set_vring_enable(vhost_dev, true); + if (ret == 0) { + return 0; + } + + error_report("Failed to start vrings for vhost-user-gpio: %d", ret); + +err_guest_notifiers: + k->set_guest_notifiers(qbus->parent, gpio->vhost_dev.nvqs, false); +err_host_notifiers: + vhost_dev_disable_notifiers(&gpio->vhost_dev, vdev); + + return ret; +} + +static void vu_gpio_stop(VirtIODevice *vdev) +{ + VHostUserGPIO *gpio = VHOST_USER_GPIO(vdev); + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + struct vhost_dev *vhost_dev = &gpio->vhost_dev; + int ret; + + if (!gpio->started_vu) { + return; + } + gpio->started_vu = false; + + if (!k->set_guest_notifiers) { + return; + } + + vhost_dev_stop(vhost_dev, vdev, false); + + ret = k->set_guest_notifiers(qbus->parent, vhost_dev->nvqs, false); + if (ret < 0) { + error_report("vhost guest notifier cleanup failed: %d", ret); + return; + } + + vhost_dev_disable_notifiers(vhost_dev, vdev); +} + +static void vu_gpio_set_status(VirtIODevice *vdev, uint8_t status) +{ + VHostUserGPIO *gpio = VHOST_USER_GPIO(vdev); + bool should_start = virtio_device_should_start(vdev, status); + + trace_virtio_gpio_set_status(status); + + if (!gpio->connected) { + return; + } + + if (vhost_dev_is_started(&gpio->vhost_dev) == should_start) { + return; + } + + if (should_start) { + if (vu_gpio_start(vdev)) { + qemu_chr_fe_disconnect(&gpio->chardev); + } + } else { + vu_gpio_stop(vdev); + } +} + +static uint64_t vu_gpio_get_features(VirtIODevice *vdev, uint64_t features, + Error **errp) +{ + VHostUserGPIO *gpio = VHOST_USER_GPIO(vdev); + + return vhost_get_features(&gpio->vhost_dev, feature_bits, features); +} + +static void vu_gpio_handle_output(VirtIODevice *vdev, VirtQueue *vq) +{ + /* + * Not normally called; it's the daemon that handles the queue; + * however virtio's cleanup path can call this. + */ +} + +static void vu_gpio_guest_notifier_mask(VirtIODevice *vdev, int idx, bool mask) +{ + VHostUserGPIO *gpio = VHOST_USER_GPIO(vdev); + + vhost_virtqueue_mask(&gpio->vhost_dev, vdev, idx, mask); +} + +static void do_vhost_user_cleanup(VirtIODevice *vdev, VHostUserGPIO *gpio) +{ + virtio_delete_queue(gpio->command_vq); + virtio_delete_queue(gpio->interrupt_vq); + g_free(gpio->vhost_dev.vqs); + gpio->vhost_dev.vqs = NULL; + virtio_cleanup(vdev); + vhost_user_cleanup(&gpio->vhost_user); +} + +static int vu_gpio_connect(DeviceState *dev, Error **errp) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserGPIO *gpio = VHOST_USER_GPIO(vdev); + struct vhost_dev *vhost_dev = &gpio->vhost_dev; + int ret; + + if (gpio->connected) { + return 0; + } + gpio->connected = true; + + vhost_dev_set_config_notifier(vhost_dev, &gpio_ops); + gpio->vhost_user.supports_config = true; + + ret = vhost_dev_init(vhost_dev, &gpio->vhost_user, + VHOST_BACKEND_TYPE_USER, 0, errp); + if (ret < 0) { + return ret; + } + + /* restore vhost state */ + if (virtio_device_started(vdev, vdev->status)) { + vu_gpio_start(vdev); + } + + return 0; +} + +static void vu_gpio_event(void *opaque, QEMUChrEvent event); + +static void vu_gpio_disconnect(DeviceState *dev) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserGPIO *gpio = VHOST_USER_GPIO(vdev); + + if (!gpio->connected) { + return; + } + gpio->connected = false; + + vu_gpio_stop(vdev); + vhost_dev_cleanup(&gpio->vhost_dev); + + /* Re-instate the event handler for new connections */ + qemu_chr_fe_set_handlers(&gpio->chardev, + NULL, NULL, vu_gpio_event, + NULL, dev, NULL, true); +} + +static void vu_gpio_event(void *opaque, QEMUChrEvent event) +{ + DeviceState *dev = opaque; + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserGPIO *gpio = VHOST_USER_GPIO(vdev); + Error *local_err = NULL; + + switch (event) { + case CHR_EVENT_OPENED: + if (vu_gpio_connect(dev, &local_err) < 0) { + qemu_chr_fe_disconnect(&gpio->chardev); + return; + } + break; + case CHR_EVENT_CLOSED: + /* defer close until later to avoid circular close */ + vhost_user_async_close(dev, &gpio->chardev, &gpio->vhost_dev, + vu_gpio_disconnect); + break; + case CHR_EVENT_BREAK: + case CHR_EVENT_MUX_IN: + case CHR_EVENT_MUX_OUT: + /* Ignore */ + break; + } +} + +static int vu_gpio_realize_connect(VHostUserGPIO *gpio, Error **errp) +{ + VirtIODevice *vdev = &gpio->parent_obj; + DeviceState *dev = &vdev->parent_obj; + struct vhost_dev *vhost_dev = &gpio->vhost_dev; + int ret; + + ret = qemu_chr_fe_wait_connected(&gpio->chardev, errp); + if (ret < 0) { + return ret; + } + + /* + * vu_gpio_connect() may have already connected (via the event + * callback) in which case it will just report success. + */ + ret = vu_gpio_connect(dev, errp); + if (ret < 0) { + qemu_chr_fe_disconnect(&gpio->chardev); + return ret; + } + g_assert(gpio->connected); + + ret = vhost_dev_get_config(vhost_dev, (uint8_t *)&gpio->config, + sizeof(gpio->config), errp); + + if (ret < 0) { + error_report("vhost-user-gpio: get config failed"); + + qemu_chr_fe_disconnect(&gpio->chardev); + vhost_dev_cleanup(vhost_dev); + return ret; + } + + return 0; +} + +static void vu_gpio_device_realize(DeviceState *dev, Error **errp) +{ + ERRP_GUARD(); + + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserGPIO *gpio = VHOST_USER_GPIO(dev); + int retries, ret; + + if (!gpio->chardev.chr) { + error_setg(errp, "vhost-user-gpio: chardev is mandatory"); + return; + } + + if (!vhost_user_init(&gpio->vhost_user, &gpio->chardev, errp)) { + return; + } + + virtio_init(vdev, VIRTIO_ID_GPIO, sizeof(gpio->config)); + + gpio->vhost_dev.nvqs = 2; + gpio->command_vq = virtio_add_queue(vdev, 256, vu_gpio_handle_output); + gpio->interrupt_vq = virtio_add_queue(vdev, 256, vu_gpio_handle_output); + gpio->vhost_dev.vqs = g_new0(struct vhost_virtqueue, gpio->vhost_dev.nvqs); + + gpio->connected = false; + + qemu_chr_fe_set_handlers(&gpio->chardev, NULL, NULL, vu_gpio_event, NULL, + dev, NULL, true); + + retries = REALIZE_CONNECTION_RETRIES; + g_assert(!*errp); + do { + if (*errp) { + error_prepend(errp, "Reconnecting after error: "); + error_report_err(*errp); + *errp = NULL; + } + ret = vu_gpio_realize_connect(gpio, errp); + } while (ret < 0 && retries--); + + if (ret < 0) { + do_vhost_user_cleanup(vdev, gpio); + } + + return; +} + +static void vu_gpio_device_unrealize(DeviceState *dev) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserGPIO *gpio = VHOST_USER_GPIO(dev); + + vu_gpio_set_status(vdev, 0); + qemu_chr_fe_set_handlers(&gpio->chardev, NULL, NULL, NULL, NULL, NULL, NULL, + false); + vhost_dev_cleanup(&gpio->vhost_dev); + do_vhost_user_cleanup(vdev, gpio); +} + +static const VMStateDescription vu_gpio_vmstate = { + .name = "vhost-user-gpio", + .unmigratable = 1, +}; + +static Property vu_gpio_properties[] = { + DEFINE_PROP_CHR("chardev", VHostUserGPIO, chardev), + DEFINE_PROP_END_OF_LIST(), +}; + +static void vu_gpio_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + + device_class_set_props(dc, vu_gpio_properties); + dc->vmsd = &vu_gpio_vmstate; + set_bit(DEVICE_CATEGORY_INPUT, dc->categories); + vdc->realize = vu_gpio_device_realize; + vdc->unrealize = vu_gpio_device_unrealize; + vdc->get_features = vu_gpio_get_features; + vdc->get_config = vu_gpio_get_config; + vdc->set_status = vu_gpio_set_status; + vdc->guest_notifier_mask = vu_gpio_guest_notifier_mask; +} + +static const TypeInfo vu_gpio_info = { + .name = TYPE_VHOST_USER_GPIO, + .parent = TYPE_VIRTIO_DEVICE, + .instance_size = sizeof(VHostUserGPIO), + .class_init = vu_gpio_class_init, +}; + +static void vu_gpio_register_types(void) +{ + type_register_static(&vu_gpio_info); +} + +type_init(vu_gpio_register_types) diff --git a/hw/virtio/vhost-user-i2c-pci.c b/hw/virtio/vhost-user-i2c-pci.c new file mode 100644 index 00000000..00ac1094 --- /dev/null +++ b/hw/virtio/vhost-user-i2c-pci.c @@ -0,0 +1,69 @@ +/* + * Vhost-user i2c virtio device PCI glue + * + * Copyright (c) 2021 Viresh Kumar <viresh.kumar@linaro.org> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/vhost-user-i2c.h" +#include "hw/virtio/virtio-pci.h" + +struct VHostUserI2CPCI { + VirtIOPCIProxy parent_obj; + VHostUserI2C vdev; +}; + +typedef struct VHostUserI2CPCI VHostUserI2CPCI; + +#define TYPE_VHOST_USER_I2C_PCI "vhost-user-i2c-pci-base" + +DECLARE_INSTANCE_CHECKER(VHostUserI2CPCI, VHOST_USER_I2C_PCI, + TYPE_VHOST_USER_I2C_PCI) + +static void vhost_user_i2c_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VHostUserI2CPCI *dev = VHOST_USER_I2C_PCI(vpci_dev); + DeviceState *vdev = DEVICE(&dev->vdev); + + vpci_dev->nvectors = 1; + qdev_realize(vdev, BUS(&vpci_dev->bus), errp); +} + +static void vhost_user_i2c_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + k->realize = vhost_user_i2c_pci_realize; + set_bit(DEVICE_CATEGORY_INPUT, dc->categories); + pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; + pcidev_k->device_id = 0; /* Set by virtio-pci based on virtio id */ + pcidev_k->revision = 0x00; + pcidev_k->class_id = PCI_CLASS_COMMUNICATION_OTHER; +} + +static void vhost_user_i2c_pci_instance_init(Object *obj) +{ + VHostUserI2CPCI *dev = VHOST_USER_I2C_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VHOST_USER_I2C); +} + +static const VirtioPCIDeviceTypeInfo vhost_user_i2c_pci_info = { + .base_name = TYPE_VHOST_USER_I2C_PCI, + .non_transitional_name = "vhost-user-i2c-pci", + .instance_size = sizeof(VHostUserI2CPCI), + .instance_init = vhost_user_i2c_pci_instance_init, + .class_init = vhost_user_i2c_pci_class_init, +}; + +static void vhost_user_i2c_pci_register(void) +{ + virtio_pci_types_register(&vhost_user_i2c_pci_info); +} + +type_init(vhost_user_i2c_pci_register); diff --git a/hw/virtio/vhost-user-i2c.c b/hw/virtio/vhost-user-i2c.c new file mode 100644 index 00000000..dc5c828b --- /dev/null +++ b/hw/virtio/vhost-user-i2c.c @@ -0,0 +1,287 @@ +/* + * Vhost-user i2c virtio device + * + * Copyright (c) 2021 Viresh Kumar <viresh.kumar@linaro.org> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/virtio-bus.h" +#include "hw/virtio/vhost-user-i2c.h" +#include "qemu/error-report.h" +#include "standard-headers/linux/virtio_ids.h" + +static const int feature_bits[] = { + VIRTIO_I2C_F_ZERO_LENGTH_REQUEST, + VIRTIO_F_RING_RESET, + VHOST_INVALID_FEATURE_BIT +}; + +static void vu_i2c_start(VirtIODevice *vdev) +{ + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + VHostUserI2C *i2c = VHOST_USER_I2C(vdev); + int ret, i; + + if (!k->set_guest_notifiers) { + error_report("binding does not support guest notifiers"); + return; + } + + ret = vhost_dev_enable_notifiers(&i2c->vhost_dev, vdev); + if (ret < 0) { + error_report("Error enabling host notifiers: %d", -ret); + return; + } + + ret = k->set_guest_notifiers(qbus->parent, i2c->vhost_dev.nvqs, true); + if (ret < 0) { + error_report("Error binding guest notifier: %d", -ret); + goto err_host_notifiers; + } + + i2c->vhost_dev.acked_features = vdev->guest_features; + + ret = vhost_dev_start(&i2c->vhost_dev, vdev, true); + if (ret < 0) { + error_report("Error starting vhost-user-i2c: %d", -ret); + goto err_guest_notifiers; + } + + /* + * guest_notifier_mask/pending not used yet, so just unmask + * everything here. virtio-pci will do the right thing by + * enabling/disabling irqfd. + */ + for (i = 0; i < i2c->vhost_dev.nvqs; i++) { + vhost_virtqueue_mask(&i2c->vhost_dev, vdev, i, false); + } + + return; + +err_guest_notifiers: + k->set_guest_notifiers(qbus->parent, i2c->vhost_dev.nvqs, false); +err_host_notifiers: + vhost_dev_disable_notifiers(&i2c->vhost_dev, vdev); +} + +static void vu_i2c_stop(VirtIODevice *vdev) +{ + VHostUserI2C *i2c = VHOST_USER_I2C(vdev); + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + int ret; + + if (!k->set_guest_notifiers) { + return; + } + + vhost_dev_stop(&i2c->vhost_dev, vdev, true); + + ret = k->set_guest_notifiers(qbus->parent, i2c->vhost_dev.nvqs, false); + if (ret < 0) { + error_report("vhost guest notifier cleanup failed: %d", ret); + return; + } + + vhost_dev_disable_notifiers(&i2c->vhost_dev, vdev); +} + +static void vu_i2c_set_status(VirtIODevice *vdev, uint8_t status) +{ + VHostUserI2C *i2c = VHOST_USER_I2C(vdev); + bool should_start = virtio_device_should_start(vdev, status); + + if (vhost_dev_is_started(&i2c->vhost_dev) == should_start) { + return; + } + + if (should_start) { + vu_i2c_start(vdev); + } else { + vu_i2c_stop(vdev); + } +} + +static uint64_t vu_i2c_get_features(VirtIODevice *vdev, + uint64_t requested_features, Error **errp) +{ + VHostUserI2C *i2c = VHOST_USER_I2C(vdev); + + virtio_add_feature(&requested_features, VIRTIO_I2C_F_ZERO_LENGTH_REQUEST); + return vhost_get_features(&i2c->vhost_dev, feature_bits, requested_features); +} + +static void vu_i2c_handle_output(VirtIODevice *vdev, VirtQueue *vq) +{ + /* + * Not normally called; it's the daemon that handles the queue; + * however virtio's cleanup path can call this. + */ +} + +static void vu_i2c_guest_notifier_mask(VirtIODevice *vdev, int idx, bool mask) +{ + VHostUserI2C *i2c = VHOST_USER_I2C(vdev); + + vhost_virtqueue_mask(&i2c->vhost_dev, vdev, idx, mask); +} + +static bool vu_i2c_guest_notifier_pending(VirtIODevice *vdev, int idx) +{ + VHostUserI2C *i2c = VHOST_USER_I2C(vdev); + + return vhost_virtqueue_pending(&i2c->vhost_dev, idx); +} + +static void do_vhost_user_cleanup(VirtIODevice *vdev, VHostUserI2C *i2c) +{ + vhost_user_cleanup(&i2c->vhost_user); + virtio_delete_queue(i2c->vq); + virtio_cleanup(vdev); + g_free(i2c->vhost_dev.vqs); + i2c->vhost_dev.vqs = NULL; +} + +static int vu_i2c_connect(DeviceState *dev) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserI2C *i2c = VHOST_USER_I2C(vdev); + + if (i2c->connected) { + return 0; + } + i2c->connected = true; + + /* restore vhost state */ + if (virtio_device_started(vdev, vdev->status)) { + vu_i2c_start(vdev); + } + + return 0; +} + +static void vu_i2c_disconnect(DeviceState *dev) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserI2C *i2c = VHOST_USER_I2C(vdev); + + if (!i2c->connected) { + return; + } + i2c->connected = false; + + if (vhost_dev_is_started(&i2c->vhost_dev)) { + vu_i2c_stop(vdev); + } +} + +static void vu_i2c_event(void *opaque, QEMUChrEvent event) +{ + DeviceState *dev = opaque; + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserI2C *i2c = VHOST_USER_I2C(vdev); + + switch (event) { + case CHR_EVENT_OPENED: + if (vu_i2c_connect(dev) < 0) { + qemu_chr_fe_disconnect(&i2c->chardev); + return; + } + break; + case CHR_EVENT_CLOSED: + vu_i2c_disconnect(dev); + break; + case CHR_EVENT_BREAK: + case CHR_EVENT_MUX_IN: + case CHR_EVENT_MUX_OUT: + /* Ignore */ + break; + } +} + +static void vu_i2c_device_realize(DeviceState *dev, Error **errp) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserI2C *i2c = VHOST_USER_I2C(dev); + int ret; + + if (!i2c->chardev.chr) { + error_setg(errp, "vhost-user-i2c: missing chardev"); + return; + } + + if (!vhost_user_init(&i2c->vhost_user, &i2c->chardev, errp)) { + return; + } + + virtio_init(vdev, VIRTIO_ID_I2C_ADAPTER, 0); + + i2c->vhost_dev.nvqs = 1; + i2c->vq = virtio_add_queue(vdev, 4, vu_i2c_handle_output); + i2c->vhost_dev.vqs = g_new0(struct vhost_virtqueue, i2c->vhost_dev.nvqs); + + ret = vhost_dev_init(&i2c->vhost_dev, &i2c->vhost_user, + VHOST_BACKEND_TYPE_USER, 0, errp); + if (ret < 0) { + do_vhost_user_cleanup(vdev, i2c); + } + + qemu_chr_fe_set_handlers(&i2c->chardev, NULL, NULL, vu_i2c_event, NULL, + dev, NULL, true); +} + +static void vu_i2c_device_unrealize(DeviceState *dev) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserI2C *i2c = VHOST_USER_I2C(dev); + + /* This will stop vhost backend if appropriate. */ + vu_i2c_set_status(vdev, 0); + vhost_dev_cleanup(&i2c->vhost_dev); + do_vhost_user_cleanup(vdev, i2c); +} + +static const VMStateDescription vu_i2c_vmstate = { + .name = "vhost-user-i2c", + .unmigratable = 1, +}; + +static Property vu_i2c_properties[] = { + DEFINE_PROP_CHR("chardev", VHostUserI2C, chardev), + DEFINE_PROP_END_OF_LIST(), +}; + +static void vu_i2c_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + + device_class_set_props(dc, vu_i2c_properties); + dc->vmsd = &vu_i2c_vmstate; + set_bit(DEVICE_CATEGORY_INPUT, dc->categories); + vdc->realize = vu_i2c_device_realize; + vdc->unrealize = vu_i2c_device_unrealize; + vdc->get_features = vu_i2c_get_features; + vdc->set_status = vu_i2c_set_status; + vdc->guest_notifier_mask = vu_i2c_guest_notifier_mask; + vdc->guest_notifier_pending = vu_i2c_guest_notifier_pending; +} + +static const TypeInfo vu_i2c_info = { + .name = TYPE_VHOST_USER_I2C, + .parent = TYPE_VIRTIO_DEVICE, + .instance_size = sizeof(VHostUserI2C), + .class_init = vu_i2c_class_init, +}; + +static void vu_i2c_register_types(void) +{ + type_register_static(&vu_i2c_info); +} + +type_init(vu_i2c_register_types) diff --git a/hw/virtio/vhost-user-input-pci.c b/hw/virtio/vhost-user-input-pci.c new file mode 100644 index 00000000..b858898a --- /dev/null +++ b/hw/virtio/vhost-user-input-pci.c @@ -0,0 +1,50 @@ +/* + * This work is licensed under the terms of the GNU LGPL, version 2 or + * later. See the COPYING.LIB file in the top-level directory. + */ + +#include "qemu/osdep.h" + +#include "hw/virtio/virtio.h" +#include "hw/virtio/virtio-input.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "hw/virtio/virtio-pci.h" +#include "qom/object.h" + +typedef struct VHostUserInputPCI VHostUserInputPCI; + +#define TYPE_VHOST_USER_INPUT_PCI "vhost-user-input-pci" + +DECLARE_INSTANCE_CHECKER(VHostUserInputPCI, VHOST_USER_INPUT_PCI, + TYPE_VHOST_USER_INPUT_PCI) + +struct VHostUserInputPCI { + VirtIOPCIProxy parent_obj; + VHostUserInput vhi; +}; + +static void vhost_user_input_pci_instance_init(Object *obj) +{ + VHostUserInputPCI *dev = VHOST_USER_INPUT_PCI(obj); + + virtio_instance_init_common(obj, &dev->vhi, sizeof(dev->vhi), + TYPE_VHOST_USER_INPUT); + + object_property_add_alias(obj, "chardev", + OBJECT(&dev->vhi), "chardev"); +} + +static const VirtioPCIDeviceTypeInfo vhost_user_input_pci_info = { + .generic_name = TYPE_VHOST_USER_INPUT_PCI, + .parent = TYPE_VIRTIO_INPUT_PCI, + .instance_size = sizeof(VHostUserInputPCI), + .instance_init = vhost_user_input_pci_instance_init, +}; + +static void vhost_user_input_pci_register(void) +{ + virtio_pci_types_register(&vhost_user_input_pci_info); +} + +type_init(vhost_user_input_pci_register) diff --git a/hw/virtio/vhost-user-rng-pci.c b/hw/virtio/vhost-user-rng-pci.c new file mode 100644 index 00000000..f6493545 --- /dev/null +++ b/hw/virtio/vhost-user-rng-pci.c @@ -0,0 +1,79 @@ +/* + * Vhost-user RNG virtio device PCI glue + * + * Copyright (c) 2021 Mathieu Poirier <mathieu.poirier@linaro.org> + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/vhost-user-rng.h" +#include "hw/virtio/virtio-pci.h" + +struct VHostUserRNGPCI { + VirtIOPCIProxy parent_obj; + VHostUserRNG vdev; +}; + +typedef struct VHostUserRNGPCI VHostUserRNGPCI; + +#define TYPE_VHOST_USER_RNG_PCI "vhost-user-rng-pci-base" + +DECLARE_INSTANCE_CHECKER(VHostUserRNGPCI, VHOST_USER_RNG_PCI, + TYPE_VHOST_USER_RNG_PCI) + +static Property vhost_user_rng_pci_properties[] = { + DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, + DEV_NVECTORS_UNSPECIFIED), + DEFINE_PROP_END_OF_LIST(), +}; + +static void vhost_user_rng_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VHostUserRNGPCI *dev = VHOST_USER_RNG_PCI(vpci_dev); + DeviceState *vdev = DEVICE(&dev->vdev); + + if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) { + vpci_dev->nvectors = 1; + } + + qdev_realize(vdev, BUS(&vpci_dev->bus), errp); +} + +static void vhost_user_rng_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + k->realize = vhost_user_rng_pci_realize; + set_bit(DEVICE_CATEGORY_INPUT, dc->categories); + device_class_set_props(dc, vhost_user_rng_pci_properties); + pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; + pcidev_k->device_id = 0; /* Set by virtio-pci based on virtio id */ + pcidev_k->revision = 0x00; + pcidev_k->class_id = PCI_CLASS_OTHERS; +} + +static void vhost_user_rng_pci_instance_init(Object *obj) +{ + VHostUserRNGPCI *dev = VHOST_USER_RNG_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VHOST_USER_RNG); +} + +static const VirtioPCIDeviceTypeInfo vhost_user_rng_pci_info = { + .base_name = TYPE_VHOST_USER_RNG_PCI, + .non_transitional_name = "vhost-user-rng-pci", + .instance_size = sizeof(VHostUserRNGPCI), + .instance_init = vhost_user_rng_pci_instance_init, + .class_init = vhost_user_rng_pci_class_init, +}; + +static void vhost_user_rng_pci_register(void) +{ + virtio_pci_types_register(&vhost_user_rng_pci_info); +} + +type_init(vhost_user_rng_pci_register); diff --git a/hw/virtio/vhost-user-rng.c b/hw/virtio/vhost-user-rng.c new file mode 100644 index 00000000..201a39e2 --- /dev/null +++ b/hw/virtio/vhost-user-rng.c @@ -0,0 +1,299 @@ +/* + * Vhost-user RNG virtio device + * + * Copyright (c) 2021 Mathieu Poirier <mathieu.poirier@linaro.org> + * + * Implementation seriously tailored on vhost-user-i2c.c + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/virtio-bus.h" +#include "hw/virtio/vhost-user-rng.h" +#include "qemu/error-report.h" +#include "standard-headers/linux/virtio_ids.h" + +static const int feature_bits[] = { + VIRTIO_F_RING_RESET, + VHOST_INVALID_FEATURE_BIT +}; + +static void vu_rng_start(VirtIODevice *vdev) +{ + VHostUserRNG *rng = VHOST_USER_RNG(vdev); + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + int ret; + int i; + + if (!k->set_guest_notifiers) { + error_report("binding does not support guest notifiers"); + return; + } + + ret = vhost_dev_enable_notifiers(&rng->vhost_dev, vdev); + if (ret < 0) { + error_report("Error enabling host notifiers: %d", -ret); + return; + } + + ret = k->set_guest_notifiers(qbus->parent, rng->vhost_dev.nvqs, true); + if (ret < 0) { + error_report("Error binding guest notifier: %d", -ret); + goto err_host_notifiers; + } + + rng->vhost_dev.acked_features = vdev->guest_features; + ret = vhost_dev_start(&rng->vhost_dev, vdev, true); + if (ret < 0) { + error_report("Error starting vhost-user-rng: %d", -ret); + goto err_guest_notifiers; + } + + /* + * guest_notifier_mask/pending not used yet, so just unmask + * everything here. virtio-pci will do the right thing by + * enabling/disabling irqfd. + */ + for (i = 0; i < rng->vhost_dev.nvqs; i++) { + vhost_virtqueue_mask(&rng->vhost_dev, vdev, i, false); + } + + return; + +err_guest_notifiers: + k->set_guest_notifiers(qbus->parent, rng->vhost_dev.nvqs, false); +err_host_notifiers: + vhost_dev_disable_notifiers(&rng->vhost_dev, vdev); +} + +static void vu_rng_stop(VirtIODevice *vdev) +{ + VHostUserRNG *rng = VHOST_USER_RNG(vdev); + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + int ret; + + if (!k->set_guest_notifiers) { + return; + } + + vhost_dev_stop(&rng->vhost_dev, vdev, true); + + ret = k->set_guest_notifiers(qbus->parent, rng->vhost_dev.nvqs, false); + if (ret < 0) { + error_report("vhost guest notifier cleanup failed: %d", ret); + return; + } + + vhost_dev_disable_notifiers(&rng->vhost_dev, vdev); +} + +static void vu_rng_set_status(VirtIODevice *vdev, uint8_t status) +{ + VHostUserRNG *rng = VHOST_USER_RNG(vdev); + bool should_start = virtio_device_should_start(vdev, status); + + if (vhost_dev_is_started(&rng->vhost_dev) == should_start) { + return; + } + + if (should_start) { + vu_rng_start(vdev); + } else { + vu_rng_stop(vdev); + } +} + +static uint64_t vu_rng_get_features(VirtIODevice *vdev, + uint64_t requested_features, Error **errp) +{ + VHostUserRNG *rng = VHOST_USER_RNG(vdev); + + return vhost_get_features(&rng->vhost_dev, feature_bits, + requested_features); +} + +static void vu_rng_handle_output(VirtIODevice *vdev, VirtQueue *vq) +{ + /* + * Not normally called; it's the daemon that handles the queue; + * however virtio's cleanup path can call this. + */ +} + +static void vu_rng_guest_notifier_mask(VirtIODevice *vdev, int idx, bool mask) +{ + VHostUserRNG *rng = VHOST_USER_RNG(vdev); + + vhost_virtqueue_mask(&rng->vhost_dev, vdev, idx, mask); +} + +static bool vu_rng_guest_notifier_pending(VirtIODevice *vdev, int idx) +{ + VHostUserRNG *rng = VHOST_USER_RNG(vdev); + + return vhost_virtqueue_pending(&rng->vhost_dev, idx); +} + +static void vu_rng_connect(DeviceState *dev) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserRNG *rng = VHOST_USER_RNG(vdev); + + if (rng->connected) { + return; + } + + rng->connected = true; + + /* restore vhost state */ + if (virtio_device_started(vdev, vdev->status)) { + vu_rng_start(vdev); + } +} + +static void vu_rng_disconnect(DeviceState *dev) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserRNG *rng = VHOST_USER_RNG(vdev); + + if (!rng->connected) { + return; + } + + rng->connected = false; + + if (vhost_dev_is_started(&rng->vhost_dev)) { + vu_rng_stop(vdev); + } +} + +static void vu_rng_event(void *opaque, QEMUChrEvent event) +{ + DeviceState *dev = opaque; + + switch (event) { + case CHR_EVENT_OPENED: + vu_rng_connect(dev); + break; + case CHR_EVENT_CLOSED: + vu_rng_disconnect(dev); + break; + case CHR_EVENT_BREAK: + case CHR_EVENT_MUX_IN: + case CHR_EVENT_MUX_OUT: + /* Ignore */ + break; + } +} + +static void vu_rng_device_realize(DeviceState *dev, Error **errp) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserRNG *rng = VHOST_USER_RNG(dev); + int ret; + + if (!rng->chardev.chr) { + error_setg(errp, "missing chardev"); + return; + } + + if (!vhost_user_init(&rng->vhost_user, &rng->chardev, errp)) { + return; + } + + virtio_init(vdev, VIRTIO_ID_RNG, 0); + + rng->req_vq = virtio_add_queue(vdev, 4, vu_rng_handle_output); + if (!rng->req_vq) { + error_setg_errno(errp, -1, "virtio_add_queue() failed"); + goto virtio_add_queue_failed; + } + + rng->vhost_dev.nvqs = 1; + rng->vhost_dev.vqs = g_new0(struct vhost_virtqueue, rng->vhost_dev.nvqs); + ret = vhost_dev_init(&rng->vhost_dev, &rng->vhost_user, + VHOST_BACKEND_TYPE_USER, 0, errp); + if (ret < 0) { + error_setg_errno(errp, -ret, "vhost_dev_init() failed"); + goto vhost_dev_init_failed; + } + + qemu_chr_fe_set_handlers(&rng->chardev, NULL, NULL, vu_rng_event, NULL, + dev, NULL, true); + + return; + +vhost_dev_init_failed: + virtio_delete_queue(rng->req_vq); +virtio_add_queue_failed: + virtio_cleanup(vdev); + vhost_user_cleanup(&rng->vhost_user); +} + +static void vu_rng_device_unrealize(DeviceState *dev) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserRNG *rng = VHOST_USER_RNG(dev); + + vu_rng_set_status(vdev, 0); + + vhost_dev_cleanup(&rng->vhost_dev); + g_free(rng->vhost_dev.vqs); + rng->vhost_dev.vqs = NULL; + virtio_delete_queue(rng->req_vq); + virtio_cleanup(vdev); + vhost_user_cleanup(&rng->vhost_user); +} + +static struct vhost_dev *vu_rng_get_vhost(VirtIODevice *vdev) +{ + VHostUserRNG *rng = VHOST_USER_RNG(vdev); + return &rng->vhost_dev; +} + +static const VMStateDescription vu_rng_vmstate = { + .name = "vhost-user-rng", + .unmigratable = 1, +}; + +static Property vu_rng_properties[] = { + DEFINE_PROP_CHR("chardev", VHostUserRNG, chardev), + DEFINE_PROP_END_OF_LIST(), +}; + +static void vu_rng_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + + device_class_set_props(dc, vu_rng_properties); + dc->vmsd = &vu_rng_vmstate; + set_bit(DEVICE_CATEGORY_INPUT, dc->categories); + + vdc->realize = vu_rng_device_realize; + vdc->unrealize = vu_rng_device_unrealize; + vdc->get_features = vu_rng_get_features; + vdc->set_status = vu_rng_set_status; + vdc->guest_notifier_mask = vu_rng_guest_notifier_mask; + vdc->guest_notifier_pending = vu_rng_guest_notifier_pending; + vdc->get_vhost = vu_rng_get_vhost; +} + +static const TypeInfo vu_rng_info = { + .name = TYPE_VHOST_USER_RNG, + .parent = TYPE_VIRTIO_DEVICE, + .instance_size = sizeof(VHostUserRNG), + .class_init = vu_rng_class_init, +}; + +static void vu_rng_register_types(void) +{ + type_register_static(&vu_rng_info); +} + +type_init(vu_rng_register_types) diff --git a/hw/virtio/vhost-user-scsi-pci.c b/hw/virtio/vhost-user-scsi-pci.c new file mode 100644 index 00000000..75882e3c --- /dev/null +++ b/hw/virtio/vhost-user-scsi-pci.c @@ -0,0 +1,110 @@ +/* + * Vhost user scsi PCI Bindings + * + * Copyright (c) 2016 Nutanix Inc. All rights reserved. + * + * Author: + * Felipe Franciosi <felipe@nutanix.com> + * + * This work is largely based on the "vhost-scsi" implementation by: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * Nicholas Bellinger <nab@risingtidesystems.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qemu/osdep.h" + +#include "standard-headers/linux/virtio_pci.h" +#include "hw/virtio/vhost-user-scsi.h" +#include "hw/virtio/virtio.h" +#include "hw/virtio/virtio-scsi.h" +#include "hw/pci/pci.h" +#include "hw/qdev-properties.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qemu/module.h" +#include "hw/pci/msi.h" +#include "hw/pci/msix.h" +#include "hw/loader.h" +#include "sysemu/kvm.h" +#include "hw/virtio/virtio-pci.h" +#include "qom/object.h" + +typedef struct VHostUserSCSIPCI VHostUserSCSIPCI; + +#define TYPE_VHOST_USER_SCSI_PCI "vhost-user-scsi-pci-base" +DECLARE_INSTANCE_CHECKER(VHostUserSCSIPCI, VHOST_USER_SCSI_PCI, + TYPE_VHOST_USER_SCSI_PCI) + +struct VHostUserSCSIPCI { + VirtIOPCIProxy parent_obj; + VHostUserSCSI vdev; +}; + +static Property vhost_user_scsi_pci_properties[] = { + DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, + DEV_NVECTORS_UNSPECIFIED), + DEFINE_PROP_END_OF_LIST(), +}; + +static void vhost_user_scsi_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VHostUserSCSIPCI *dev = VHOST_USER_SCSI_PCI(vpci_dev); + DeviceState *vdev = DEVICE(&dev->vdev); + VirtIOSCSIConf *conf = &dev->vdev.parent_obj.parent_obj.conf; + + if (conf->num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) { + conf->num_queues = + virtio_pci_optimal_num_queues(VIRTIO_SCSI_VQ_NUM_FIXED); + } + + if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) { + vpci_dev->nvectors = conf->num_queues + VIRTIO_SCSI_VQ_NUM_FIXED + 1; + } + + qdev_realize(vdev, BUS(&vpci_dev->bus), errp); +} + +static void vhost_user_scsi_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + k->realize = vhost_user_scsi_pci_realize; + set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); + device_class_set_props(dc, vhost_user_scsi_pci_properties); + pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; + pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_SCSI; + pcidev_k->revision = 0x00; + pcidev_k->class_id = PCI_CLASS_STORAGE_SCSI; +} + +static void vhost_user_scsi_pci_instance_init(Object *obj) +{ + VHostUserSCSIPCI *dev = VHOST_USER_SCSI_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VHOST_USER_SCSI); + object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev), + "bootindex"); +} + +static const VirtioPCIDeviceTypeInfo vhost_user_scsi_pci_info = { + .base_name = TYPE_VHOST_USER_SCSI_PCI, + .generic_name = "vhost-user-scsi-pci", + .transitional_name = "vhost-user-scsi-pci-transitional", + .non_transitional_name = "vhost-user-scsi-pci-non-transitional", + .instance_size = sizeof(VHostUserSCSIPCI), + .instance_init = vhost_user_scsi_pci_instance_init, + .class_init = vhost_user_scsi_pci_class_init, +}; + +static void vhost_user_scsi_pci_register(void) +{ + virtio_pci_types_register(&vhost_user_scsi_pci_info); +} + +type_init(vhost_user_scsi_pci_register) diff --git a/hw/virtio/vhost-user-vsock-pci.c b/hw/virtio/vhost-user-vsock-pci.c new file mode 100644 index 00000000..e5a86e80 --- /dev/null +++ b/hw/virtio/vhost-user-vsock-pci.c @@ -0,0 +1,86 @@ +/* + * Vhost-user vsock PCI Bindings + * + * Copyright 2020 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. See the COPYING file in the + * top-level directory. + */ + +#include "qemu/osdep.h" + +#include "hw/virtio/virtio-pci.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/vhost-user-vsock.h" +#include "qom/object.h" + +typedef struct VHostUserVSockPCI VHostUserVSockPCI; + +/* + * vhost-user-vsock-pci: This extends VirtioPCIProxy. + */ +#define TYPE_VHOST_USER_VSOCK_PCI "vhost-user-vsock-pci-base" +DECLARE_INSTANCE_CHECKER(VHostUserVSockPCI, VHOST_USER_VSOCK_PCI, + TYPE_VHOST_USER_VSOCK_PCI) + +struct VHostUserVSockPCI { + VirtIOPCIProxy parent_obj; + VHostUserVSock vdev; +}; + +/* vhost-user-vsock-pci */ + +static Property vhost_user_vsock_pci_properties[] = { + DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 3), + DEFINE_PROP_END_OF_LIST(), +}; + +static void vhost_user_vsock_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VHostUserVSockPCI *dev = VHOST_USER_VSOCK_PCI(vpci_dev); + DeviceState *vdev = DEVICE(&dev->vdev); + + /* unlike vhost-vsock, we do not need to care about pre-5.1 compat */ + virtio_pci_force_virtio_1(vpci_dev); + + qdev_realize(vdev, BUS(&vpci_dev->bus), errp); +} + +static void vhost_user_vsock_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + k->realize = vhost_user_vsock_pci_realize; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + device_class_set_props(dc, vhost_user_vsock_pci_properties); + pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; + pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_VSOCK; + pcidev_k->revision = 0x00; + pcidev_k->class_id = PCI_CLASS_COMMUNICATION_OTHER; +} + +static void vhost_user_vsock_pci_instance_init(Object *obj) +{ + VHostUserVSockPCI *dev = VHOST_USER_VSOCK_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VHOST_USER_VSOCK); +} + +static const VirtioPCIDeviceTypeInfo vhost_user_vsock_pci_info = { + .base_name = TYPE_VHOST_USER_VSOCK_PCI, + .generic_name = "vhost-user-vsock-pci", + .non_transitional_name = "vhost-user-vsock-pci-non-transitional", + .instance_size = sizeof(VHostUserVSockPCI), + .instance_init = vhost_user_vsock_pci_instance_init, + .class_init = vhost_user_vsock_pci_class_init, +}; + +static void virtio_pci_vhost_register(void) +{ + virtio_pci_types_register(&vhost_user_vsock_pci_info); +} + +type_init(virtio_pci_vhost_register) diff --git a/hw/virtio/vhost-user-vsock.c b/hw/virtio/vhost-user-vsock.c new file mode 100644 index 00000000..9431b979 --- /dev/null +++ b/hw/virtio/vhost-user-vsock.c @@ -0,0 +1,180 @@ +/* + * Vhost-user vsock virtio device + * + * Copyright 2020 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. See the COPYING file in the + * top-level directory. + */ + +#include "qemu/osdep.h" + +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "hw/qdev-properties.h" +#include "hw/qdev-properties-system.h" +#include "hw/virtio/vhost-user-vsock.h" + +static const int user_feature_bits[] = { + VIRTIO_F_VERSION_1, + VIRTIO_RING_F_INDIRECT_DESC, + VIRTIO_RING_F_EVENT_IDX, + VIRTIO_F_NOTIFY_ON_EMPTY, + VHOST_INVALID_FEATURE_BIT +}; + +static void vuv_get_config(VirtIODevice *vdev, uint8_t *config) +{ + VHostUserVSock *vsock = VHOST_USER_VSOCK(vdev); + + memcpy(config, &vsock->vsockcfg, sizeof(struct virtio_vsock_config)); +} + +static int vuv_handle_config_change(struct vhost_dev *dev) +{ + VHostUserVSock *vsock = VHOST_USER_VSOCK(dev->vdev); + Error *local_err = NULL; + int ret = vhost_dev_get_config(dev, (uint8_t *)&vsock->vsockcfg, + sizeof(struct virtio_vsock_config), + &local_err); + if (ret < 0) { + error_report_err(local_err); + return -1; + } + + virtio_notify_config(dev->vdev); + + return 0; +} + +const VhostDevConfigOps vsock_ops = { + .vhost_dev_config_notifier = vuv_handle_config_change, +}; + +static void vuv_set_status(VirtIODevice *vdev, uint8_t status) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + bool should_start = virtio_device_should_start(vdev, status); + + if (vhost_dev_is_started(&vvc->vhost_dev) == should_start) { + return; + } + + if (should_start) { + int ret = vhost_vsock_common_start(vdev); + if (ret < 0) { + return; + } + } else { + vhost_vsock_common_stop(vdev); + } +} + +static uint64_t vuv_get_features(VirtIODevice *vdev, + uint64_t features, + Error **errp) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + + features = vhost_get_features(&vvc->vhost_dev, user_feature_bits, features); + + return vhost_vsock_common_get_features(vdev, features, errp); +} + +static const VMStateDescription vuv_vmstate = { + .name = "vhost-user-vsock", + .unmigratable = 1, +}; + +static void vuv_device_realize(DeviceState *dev, Error **errp) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(dev); + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserVSock *vsock = VHOST_USER_VSOCK(dev); + int ret; + + if (!vsock->conf.chardev.chr) { + error_setg(errp, "missing chardev"); + return; + } + + if (!vhost_user_init(&vsock->vhost_user, &vsock->conf.chardev, errp)) { + return; + } + + vhost_vsock_common_realize(vdev); + + vhost_dev_set_config_notifier(&vvc->vhost_dev, &vsock_ops); + + ret = vhost_dev_init(&vvc->vhost_dev, &vsock->vhost_user, + VHOST_BACKEND_TYPE_USER, 0, errp); + if (ret < 0) { + goto err_virtio; + } + + ret = vhost_dev_get_config(&vvc->vhost_dev, (uint8_t *)&vsock->vsockcfg, + sizeof(struct virtio_vsock_config), errp); + if (ret < 0) { + goto err_vhost_dev; + } + + return; + +err_vhost_dev: + vhost_dev_cleanup(&vvc->vhost_dev); +err_virtio: + vhost_vsock_common_unrealize(vdev); + vhost_user_cleanup(&vsock->vhost_user); + return; +} + +static void vuv_device_unrealize(DeviceState *dev) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(dev); + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserVSock *vsock = VHOST_USER_VSOCK(dev); + + /* This will stop vhost backend if appropriate. */ + vuv_set_status(vdev, 0); + + vhost_dev_cleanup(&vvc->vhost_dev); + + vhost_vsock_common_unrealize(vdev); + + vhost_user_cleanup(&vsock->vhost_user); + +} + +static Property vuv_properties[] = { + DEFINE_PROP_CHR("chardev", VHostUserVSock, conf.chardev), + DEFINE_PROP_END_OF_LIST(), +}; + +static void vuv_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + + device_class_set_props(dc, vuv_properties); + dc->vmsd = &vuv_vmstate; + vdc->realize = vuv_device_realize; + vdc->unrealize = vuv_device_unrealize; + vdc->get_features = vuv_get_features; + vdc->get_config = vuv_get_config; + vdc->set_status = vuv_set_status; +} + +static const TypeInfo vuv_info = { + .name = TYPE_VHOST_USER_VSOCK, + .parent = TYPE_VHOST_VSOCK_COMMON, + .instance_size = sizeof(VHostUserVSock), + .class_init = vuv_class_init, +}; + +static void vuv_register_types(void) +{ + type_register_static(&vuv_info); +} + +type_init(vuv_register_types) diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c new file mode 100644 index 00000000..8f635844 --- /dev/null +++ b/hw/virtio/vhost-user.c @@ -0,0 +1,2800 @@ +/* + * vhost-user + * + * Copyright (c) 2013 Virtual Open Systems Sarl. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "hw/virtio/vhost.h" +#include "hw/virtio/vhost-user.h" +#include "hw/virtio/vhost-backend.h" +#include "hw/virtio/virtio.h" +#include "hw/virtio/virtio-net.h" +#include "chardev/char-fe.h" +#include "io/channel-socket.h" +#include "sysemu/kvm.h" +#include "qemu/error-report.h" +#include "qemu/main-loop.h" +#include "qemu/sockets.h" +#include "sysemu/runstate.h" +#include "sysemu/cryptodev.h" +#include "migration/migration.h" +#include "migration/postcopy-ram.h" +#include "trace.h" +#include "exec/ramblock.h" + +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/un.h> + +#include "standard-headers/linux/vhost_types.h" + +#ifdef CONFIG_LINUX +#include <linux/userfaultfd.h> +#endif + +#define VHOST_MEMORY_BASELINE_NREGIONS 8 +#define VHOST_USER_F_PROTOCOL_FEATURES 30 +#define VHOST_USER_SLAVE_MAX_FDS 8 + +/* + * Set maximum number of RAM slots supported to + * the maximum number supported by the target + * hardware plaform. + */ +#if defined(TARGET_X86) || defined(TARGET_X86_64) || \ + defined(TARGET_ARM) || defined(TARGET_ARM_64) +#include "hw/acpi/acpi.h" +#define VHOST_USER_MAX_RAM_SLOTS ACPI_MAX_RAM_SLOTS + +#elif defined(TARGET_PPC) || defined(TARGET_PPC64) +#include "hw/ppc/spapr.h" +#define VHOST_USER_MAX_RAM_SLOTS SPAPR_MAX_RAM_SLOTS + +#else +#define VHOST_USER_MAX_RAM_SLOTS 512 +#endif + +/* + * Maximum size of virtio device config space + */ +#define VHOST_USER_MAX_CONFIG_SIZE 256 + +enum VhostUserProtocolFeature { + VHOST_USER_PROTOCOL_F_MQ = 0, + VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1, + VHOST_USER_PROTOCOL_F_RARP = 2, + VHOST_USER_PROTOCOL_F_REPLY_ACK = 3, + VHOST_USER_PROTOCOL_F_NET_MTU = 4, + VHOST_USER_PROTOCOL_F_SLAVE_REQ = 5, + VHOST_USER_PROTOCOL_F_CROSS_ENDIAN = 6, + VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7, + VHOST_USER_PROTOCOL_F_PAGEFAULT = 8, + VHOST_USER_PROTOCOL_F_CONFIG = 9, + VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10, + VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11, + VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12, + VHOST_USER_PROTOCOL_F_RESET_DEVICE = 13, + /* Feature 14 reserved for VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS. */ + VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15, + VHOST_USER_PROTOCOL_F_STATUS = 16, + VHOST_USER_PROTOCOL_F_MAX +}; + +#define VHOST_USER_PROTOCOL_FEATURE_MASK ((1 << VHOST_USER_PROTOCOL_F_MAX) - 1) + +typedef enum VhostUserRequest { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_SEND_RARP = 19, + VHOST_USER_NET_SET_MTU = 20, + VHOST_USER_SET_SLAVE_REQ_FD = 21, + VHOST_USER_IOTLB_MSG = 22, + VHOST_USER_SET_VRING_ENDIAN = 23, + VHOST_USER_GET_CONFIG = 24, + VHOST_USER_SET_CONFIG = 25, + VHOST_USER_CREATE_CRYPTO_SESSION = 26, + VHOST_USER_CLOSE_CRYPTO_SESSION = 27, + VHOST_USER_POSTCOPY_ADVISE = 28, + VHOST_USER_POSTCOPY_LISTEN = 29, + VHOST_USER_POSTCOPY_END = 30, + VHOST_USER_GET_INFLIGHT_FD = 31, + VHOST_USER_SET_INFLIGHT_FD = 32, + VHOST_USER_GPU_SET_SOCKET = 33, + VHOST_USER_RESET_DEVICE = 34, + /* Message number 35 reserved for VHOST_USER_VRING_KICK. */ + VHOST_USER_GET_MAX_MEM_SLOTS = 36, + VHOST_USER_ADD_MEM_REG = 37, + VHOST_USER_REM_MEM_REG = 38, + VHOST_USER_SET_STATUS = 39, + VHOST_USER_GET_STATUS = 40, + VHOST_USER_MAX +} VhostUserRequest; + +typedef enum VhostUserSlaveRequest { + VHOST_USER_SLAVE_NONE = 0, + VHOST_USER_SLAVE_IOTLB_MSG = 1, + VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2, + VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3, + VHOST_USER_SLAVE_MAX +} VhostUserSlaveRequest; + +typedef struct VhostUserMemoryRegion { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; + uint64_t mmap_offset; +} VhostUserMemoryRegion; + +typedef struct VhostUserMemory { + uint32_t nregions; + uint32_t padding; + VhostUserMemoryRegion regions[VHOST_MEMORY_BASELINE_NREGIONS]; +} VhostUserMemory; + +typedef struct VhostUserMemRegMsg { + uint64_t padding; + VhostUserMemoryRegion region; +} VhostUserMemRegMsg; + +typedef struct VhostUserLog { + uint64_t mmap_size; + uint64_t mmap_offset; +} VhostUserLog; + +typedef struct VhostUserConfig { + uint32_t offset; + uint32_t size; + uint32_t flags; + uint8_t region[VHOST_USER_MAX_CONFIG_SIZE]; +} VhostUserConfig; + +#define VHOST_CRYPTO_SYM_HMAC_MAX_KEY_LEN 512 +#define VHOST_CRYPTO_SYM_CIPHER_MAX_KEY_LEN 64 + +typedef struct VhostUserCryptoSession { + /* session id for success, -1 on errors */ + int64_t session_id; + CryptoDevBackendSymSessionInfo session_setup_data; + uint8_t key[VHOST_CRYPTO_SYM_CIPHER_MAX_KEY_LEN]; + uint8_t auth_key[VHOST_CRYPTO_SYM_HMAC_MAX_KEY_LEN]; +} VhostUserCryptoSession; + +static VhostUserConfig c __attribute__ ((unused)); +#define VHOST_USER_CONFIG_HDR_SIZE (sizeof(c.offset) \ + + sizeof(c.size) \ + + sizeof(c.flags)) + +typedef struct VhostUserVringArea { + uint64_t u64; + uint64_t size; + uint64_t offset; +} VhostUserVringArea; + +typedef struct VhostUserInflight { + uint64_t mmap_size; + uint64_t mmap_offset; + uint16_t num_queues; + uint16_t queue_size; +} VhostUserInflight; + +typedef struct { + VhostUserRequest request; + +#define VHOST_USER_VERSION_MASK (0x3) +#define VHOST_USER_REPLY_MASK (0x1 << 2) +#define VHOST_USER_NEED_REPLY_MASK (0x1 << 3) + uint32_t flags; + uint32_t size; /* the following payload size */ +} QEMU_PACKED VhostUserHeader; + +typedef union { +#define VHOST_USER_VRING_IDX_MASK (0xff) +#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + VhostUserMemory memory; + VhostUserMemRegMsg mem_reg; + VhostUserLog log; + struct vhost_iotlb_msg iotlb; + VhostUserConfig config; + VhostUserCryptoSession session; + VhostUserVringArea area; + VhostUserInflight inflight; +} VhostUserPayload; + +typedef struct VhostUserMsg { + VhostUserHeader hdr; + VhostUserPayload payload; +} QEMU_PACKED VhostUserMsg; + +static VhostUserMsg m __attribute__ ((unused)); +#define VHOST_USER_HDR_SIZE (sizeof(VhostUserHeader)) + +#define VHOST_USER_PAYLOAD_SIZE (sizeof(VhostUserPayload)) + +/* The version of the protocol we support */ +#define VHOST_USER_VERSION (0x1) + +struct vhost_user { + struct vhost_dev *dev; + /* Shared between vhost devs of the same virtio device */ + VhostUserState *user; + QIOChannel *slave_ioc; + GSource *slave_src; + NotifierWithReturn postcopy_notifier; + struct PostCopyFD postcopy_fd; + uint64_t postcopy_client_bases[VHOST_USER_MAX_RAM_SLOTS]; + /* Length of the region_rb and region_rb_offset arrays */ + size_t region_rb_len; + /* RAMBlock associated with a given region */ + RAMBlock **region_rb; + /* + * The offset from the start of the RAMBlock to the start of the + * vhost region. + */ + ram_addr_t *region_rb_offset; + + /* True once we've entered postcopy_listen */ + bool postcopy_listen; + + /* Our current regions */ + int num_shadow_regions; + struct vhost_memory_region shadow_regions[VHOST_USER_MAX_RAM_SLOTS]; +}; + +struct scrub_regions { + struct vhost_memory_region *region; + int reg_idx; + int fd_idx; +}; + +static bool ioeventfd_enabled(void) +{ + return !kvm_enabled() || kvm_eventfds_enabled(); +} + +static int vhost_user_read_header(struct vhost_dev *dev, VhostUserMsg *msg) +{ + struct vhost_user *u = dev->opaque; + CharBackend *chr = u->user->chr; + uint8_t *p = (uint8_t *) msg; + int r, size = VHOST_USER_HDR_SIZE; + + r = qemu_chr_fe_read_all(chr, p, size); + if (r != size) { + int saved_errno = errno; + error_report("Failed to read msg header. Read %d instead of %d." + " Original request %d.", r, size, msg->hdr.request); + return r < 0 ? -saved_errno : -EIO; + } + + /* validate received flags */ + if (msg->hdr.flags != (VHOST_USER_REPLY_MASK | VHOST_USER_VERSION)) { + error_report("Failed to read msg header." + " Flags 0x%x instead of 0x%x.", msg->hdr.flags, + VHOST_USER_REPLY_MASK | VHOST_USER_VERSION); + return -EPROTO; + } + + trace_vhost_user_read(msg->hdr.request, msg->hdr.flags); + + return 0; +} + +struct vhost_user_read_cb_data { + struct vhost_dev *dev; + VhostUserMsg *msg; + GMainLoop *loop; + int ret; +}; + +static gboolean vhost_user_read_cb(void *do_not_use, GIOCondition condition, + gpointer opaque) +{ + struct vhost_user_read_cb_data *data = opaque; + struct vhost_dev *dev = data->dev; + VhostUserMsg *msg = data->msg; + struct vhost_user *u = dev->opaque; + CharBackend *chr = u->user->chr; + uint8_t *p = (uint8_t *) msg; + int r, size; + + r = vhost_user_read_header(dev, msg); + if (r < 0) { + data->ret = r; + goto end; + } + + /* validate message size is sane */ + if (msg->hdr.size > VHOST_USER_PAYLOAD_SIZE) { + error_report("Failed to read msg header." + " Size %d exceeds the maximum %zu.", msg->hdr.size, + VHOST_USER_PAYLOAD_SIZE); + data->ret = -EPROTO; + goto end; + } + + if (msg->hdr.size) { + p += VHOST_USER_HDR_SIZE; + size = msg->hdr.size; + r = qemu_chr_fe_read_all(chr, p, size); + if (r != size) { + int saved_errno = errno; + error_report("Failed to read msg payload." + " Read %d instead of %d.", r, msg->hdr.size); + data->ret = r < 0 ? -saved_errno : -EIO; + goto end; + } + } + +end: + g_main_loop_quit(data->loop); + return G_SOURCE_REMOVE; +} + +static gboolean slave_read(QIOChannel *ioc, GIOCondition condition, + gpointer opaque); + +/* + * This updates the read handler to use a new event loop context. + * Event sources are removed from the previous context : this ensures + * that events detected in the previous context are purged. They will + * be re-detected and processed in the new context. + */ +static void slave_update_read_handler(struct vhost_dev *dev, + GMainContext *ctxt) +{ + struct vhost_user *u = dev->opaque; + + if (!u->slave_ioc) { + return; + } + + if (u->slave_src) { + g_source_destroy(u->slave_src); + g_source_unref(u->slave_src); + } + + u->slave_src = qio_channel_add_watch_source(u->slave_ioc, + G_IO_IN | G_IO_HUP, + slave_read, dev, NULL, + ctxt); +} + +static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg) +{ + struct vhost_user *u = dev->opaque; + CharBackend *chr = u->user->chr; + GMainContext *prev_ctxt = chr->chr->gcontext; + GMainContext *ctxt = g_main_context_new(); + GMainLoop *loop = g_main_loop_new(ctxt, FALSE); + struct vhost_user_read_cb_data data = { + .dev = dev, + .loop = loop, + .msg = msg, + .ret = 0 + }; + + /* + * We want to be able to monitor the slave channel fd while waiting + * for chr I/O. This requires an event loop, but we can't nest the + * one to which chr is currently attached : its fd handlers might not + * be prepared for re-entrancy. So we create a new one and switch chr + * to use it. + */ + slave_update_read_handler(dev, ctxt); + qemu_chr_be_update_read_handlers(chr->chr, ctxt); + qemu_chr_fe_add_watch(chr, G_IO_IN | G_IO_HUP, vhost_user_read_cb, &data); + + g_main_loop_run(loop); + + /* + * Restore the previous event loop context. This also destroys/recreates + * event sources : this guarantees that all pending events in the original + * context that have been processed by the nested loop are purged. + */ + qemu_chr_be_update_read_handlers(chr->chr, prev_ctxt); + slave_update_read_handler(dev, NULL); + + g_main_loop_unref(loop); + g_main_context_unref(ctxt); + + return data.ret; +} + +static int process_message_reply(struct vhost_dev *dev, + const VhostUserMsg *msg) +{ + int ret; + VhostUserMsg msg_reply; + + if ((msg->hdr.flags & VHOST_USER_NEED_REPLY_MASK) == 0) { + return 0; + } + + ret = vhost_user_read(dev, &msg_reply); + if (ret < 0) { + return ret; + } + + if (msg_reply.hdr.request != msg->hdr.request) { + error_report("Received unexpected msg type. " + "Expected %d received %d", + msg->hdr.request, msg_reply.hdr.request); + return -EPROTO; + } + + return msg_reply.payload.u64 ? -EIO : 0; +} + +static bool vhost_user_one_time_request(VhostUserRequest request) +{ + switch (request) { + case VHOST_USER_SET_OWNER: + case VHOST_USER_RESET_OWNER: + case VHOST_USER_SET_MEM_TABLE: + case VHOST_USER_GET_QUEUE_NUM: + case VHOST_USER_NET_SET_MTU: + return true; + default: + return false; + } +} + +/* most non-init callers ignore the error */ +static int vhost_user_write(struct vhost_dev *dev, VhostUserMsg *msg, + int *fds, int fd_num) +{ + struct vhost_user *u = dev->opaque; + CharBackend *chr = u->user->chr; + int ret, size = VHOST_USER_HDR_SIZE + msg->hdr.size; + + /* + * For non-vring specific requests, like VHOST_USER_SET_MEM_TABLE, + * we just need send it once in the first time. For later such + * request, we just ignore it. + */ + if (vhost_user_one_time_request(msg->hdr.request) && dev->vq_index != 0) { + msg->hdr.flags &= ~VHOST_USER_NEED_REPLY_MASK; + return 0; + } + + if (qemu_chr_fe_set_msgfds(chr, fds, fd_num) < 0) { + error_report("Failed to set msg fds."); + return -EINVAL; + } + + ret = qemu_chr_fe_write_all(chr, (const uint8_t *) msg, size); + if (ret != size) { + int saved_errno = errno; + error_report("Failed to write msg." + " Wrote %d instead of %d.", ret, size); + return ret < 0 ? -saved_errno : -EIO; + } + + trace_vhost_user_write(msg->hdr.request, msg->hdr.flags); + + return 0; +} + +int vhost_user_gpu_set_socket(struct vhost_dev *dev, int fd) +{ + VhostUserMsg msg = { + .hdr.request = VHOST_USER_GPU_SET_SOCKET, + .hdr.flags = VHOST_USER_VERSION, + }; + + return vhost_user_write(dev, &msg, &fd, 1); +} + +static int vhost_user_set_log_base(struct vhost_dev *dev, uint64_t base, + struct vhost_log *log) +{ + int fds[VHOST_USER_MAX_RAM_SLOTS]; + size_t fd_num = 0; + bool shmfd = virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_LOG_SHMFD); + int ret; + VhostUserMsg msg = { + .hdr.request = VHOST_USER_SET_LOG_BASE, + .hdr.flags = VHOST_USER_VERSION, + .payload.log.mmap_size = log->size * sizeof(*(log->log)), + .payload.log.mmap_offset = 0, + .hdr.size = sizeof(msg.payload.log), + }; + + if (shmfd && log->fd != -1) { + fds[fd_num++] = log->fd; + } + + ret = vhost_user_write(dev, &msg, fds, fd_num); + if (ret < 0) { + return ret; + } + + if (shmfd) { + msg.hdr.size = 0; + ret = vhost_user_read(dev, &msg); + if (ret < 0) { + return ret; + } + + if (msg.hdr.request != VHOST_USER_SET_LOG_BASE) { + error_report("Received unexpected msg type. " + "Expected %d received %d", + VHOST_USER_SET_LOG_BASE, msg.hdr.request); + return -EPROTO; + } + } + + return 0; +} + +static MemoryRegion *vhost_user_get_mr_data(uint64_t addr, ram_addr_t *offset, + int *fd) +{ + MemoryRegion *mr; + + assert((uintptr_t)addr == addr); + mr = memory_region_from_host((void *)(uintptr_t)addr, offset); + *fd = memory_region_get_fd(mr); + + return mr; +} + +static void vhost_user_fill_msg_region(VhostUserMemoryRegion *dst, + struct vhost_memory_region *src, + uint64_t mmap_offset) +{ + assert(src != NULL && dst != NULL); + dst->userspace_addr = src->userspace_addr; + dst->memory_size = src->memory_size; + dst->guest_phys_addr = src->guest_phys_addr; + dst->mmap_offset = mmap_offset; +} + +static int vhost_user_fill_set_mem_table_msg(struct vhost_user *u, + struct vhost_dev *dev, + VhostUserMsg *msg, + int *fds, size_t *fd_num, + bool track_ramblocks) +{ + int i, fd; + ram_addr_t offset; + MemoryRegion *mr; + struct vhost_memory_region *reg; + VhostUserMemoryRegion region_buffer; + + msg->hdr.request = VHOST_USER_SET_MEM_TABLE; + + for (i = 0; i < dev->mem->nregions; ++i) { + reg = dev->mem->regions + i; + + mr = vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd); + if (fd > 0) { + if (track_ramblocks) { + assert(*fd_num < VHOST_MEMORY_BASELINE_NREGIONS); + trace_vhost_user_set_mem_table_withfd(*fd_num, mr->name, + reg->memory_size, + reg->guest_phys_addr, + reg->userspace_addr, + offset); + u->region_rb_offset[i] = offset; + u->region_rb[i] = mr->ram_block; + } else if (*fd_num == VHOST_MEMORY_BASELINE_NREGIONS) { + error_report("Failed preparing vhost-user memory table msg"); + return -ENOBUFS; + } + vhost_user_fill_msg_region(®ion_buffer, reg, offset); + msg->payload.memory.regions[*fd_num] = region_buffer; + fds[(*fd_num)++] = fd; + } else if (track_ramblocks) { + u->region_rb_offset[i] = 0; + u->region_rb[i] = NULL; + } + } + + msg->payload.memory.nregions = *fd_num; + + if (!*fd_num) { + error_report("Failed initializing vhost-user memory map, " + "consider using -object memory-backend-file share=on"); + return -EINVAL; + } + + msg->hdr.size = sizeof(msg->payload.memory.nregions); + msg->hdr.size += sizeof(msg->payload.memory.padding); + msg->hdr.size += *fd_num * sizeof(VhostUserMemoryRegion); + + return 0; +} + +static inline bool reg_equal(struct vhost_memory_region *shadow_reg, + struct vhost_memory_region *vdev_reg) +{ + return shadow_reg->guest_phys_addr == vdev_reg->guest_phys_addr && + shadow_reg->userspace_addr == vdev_reg->userspace_addr && + shadow_reg->memory_size == vdev_reg->memory_size; +} + +static void scrub_shadow_regions(struct vhost_dev *dev, + struct scrub_regions *add_reg, + int *nr_add_reg, + struct scrub_regions *rem_reg, + int *nr_rem_reg, uint64_t *shadow_pcb, + bool track_ramblocks) +{ + struct vhost_user *u = dev->opaque; + bool found[VHOST_USER_MAX_RAM_SLOTS] = {}; + struct vhost_memory_region *reg, *shadow_reg; + int i, j, fd, add_idx = 0, rm_idx = 0, fd_num = 0; + ram_addr_t offset; + MemoryRegion *mr; + bool matching; + + /* + * Find memory regions present in our shadow state which are not in + * the device's current memory state. + * + * Mark regions in both the shadow and device state as "found". + */ + for (i = 0; i < u->num_shadow_regions; i++) { + shadow_reg = &u->shadow_regions[i]; + matching = false; + + for (j = 0; j < dev->mem->nregions; j++) { + reg = &dev->mem->regions[j]; + + mr = vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd); + + if (reg_equal(shadow_reg, reg)) { + matching = true; + found[j] = true; + if (track_ramblocks) { + /* + * Reset postcopy client bases, region_rb, and + * region_rb_offset in case regions are removed. + */ + if (fd > 0) { + u->region_rb_offset[j] = offset; + u->region_rb[j] = mr->ram_block; + shadow_pcb[j] = u->postcopy_client_bases[i]; + } else { + u->region_rb_offset[j] = 0; + u->region_rb[j] = NULL; + } + } + break; + } + } + + /* + * If the region was not found in the current device memory state + * create an entry for it in the removed list. + */ + if (!matching) { + rem_reg[rm_idx].region = shadow_reg; + rem_reg[rm_idx++].reg_idx = i; + } + } + + /* + * For regions not marked "found", create entries in the added list. + * + * Note their indexes in the device memory state and the indexes of their + * file descriptors. + */ + for (i = 0; i < dev->mem->nregions; i++) { + reg = &dev->mem->regions[i]; + vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd); + if (fd > 0) { + ++fd_num; + } + + /* + * If the region was in both the shadow and device state we don't + * need to send a VHOST_USER_ADD_MEM_REG message for it. + */ + if (found[i]) { + continue; + } + + add_reg[add_idx].region = reg; + add_reg[add_idx].reg_idx = i; + add_reg[add_idx++].fd_idx = fd_num; + } + *nr_rem_reg = rm_idx; + *nr_add_reg = add_idx; + + return; +} + +static int send_remove_regions(struct vhost_dev *dev, + struct scrub_regions *remove_reg, + int nr_rem_reg, VhostUserMsg *msg, + bool reply_supported) +{ + struct vhost_user *u = dev->opaque; + struct vhost_memory_region *shadow_reg; + int i, fd, shadow_reg_idx, ret; + ram_addr_t offset; + VhostUserMemoryRegion region_buffer; + + /* + * The regions in remove_reg appear in the same order they do in the + * shadow table. Therefore we can minimize memory copies by iterating + * through remove_reg backwards. + */ + for (i = nr_rem_reg - 1; i >= 0; i--) { + shadow_reg = remove_reg[i].region; + shadow_reg_idx = remove_reg[i].reg_idx; + + vhost_user_get_mr_data(shadow_reg->userspace_addr, &offset, &fd); + + if (fd > 0) { + msg->hdr.request = VHOST_USER_REM_MEM_REG; + vhost_user_fill_msg_region(®ion_buffer, shadow_reg, 0); + msg->payload.mem_reg.region = region_buffer; + + ret = vhost_user_write(dev, msg, NULL, 0); + if (ret < 0) { + return ret; + } + + if (reply_supported) { + ret = process_message_reply(dev, msg); + if (ret) { + return ret; + } + } + } + + /* + * At this point we know the backend has unmapped the region. It is now + * safe to remove it from the shadow table. + */ + memmove(&u->shadow_regions[shadow_reg_idx], + &u->shadow_regions[shadow_reg_idx + 1], + sizeof(struct vhost_memory_region) * + (u->num_shadow_regions - shadow_reg_idx - 1)); + u->num_shadow_regions--; + } + + return 0; +} + +static int send_add_regions(struct vhost_dev *dev, + struct scrub_regions *add_reg, int nr_add_reg, + VhostUserMsg *msg, uint64_t *shadow_pcb, + bool reply_supported, bool track_ramblocks) +{ + struct vhost_user *u = dev->opaque; + int i, fd, ret, reg_idx, reg_fd_idx; + struct vhost_memory_region *reg; + MemoryRegion *mr; + ram_addr_t offset; + VhostUserMsg msg_reply; + VhostUserMemoryRegion region_buffer; + + for (i = 0; i < nr_add_reg; i++) { + reg = add_reg[i].region; + reg_idx = add_reg[i].reg_idx; + reg_fd_idx = add_reg[i].fd_idx; + + mr = vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd); + + if (fd > 0) { + if (track_ramblocks) { + trace_vhost_user_set_mem_table_withfd(reg_fd_idx, mr->name, + reg->memory_size, + reg->guest_phys_addr, + reg->userspace_addr, + offset); + u->region_rb_offset[reg_idx] = offset; + u->region_rb[reg_idx] = mr->ram_block; + } + msg->hdr.request = VHOST_USER_ADD_MEM_REG; + vhost_user_fill_msg_region(®ion_buffer, reg, offset); + msg->payload.mem_reg.region = region_buffer; + + ret = vhost_user_write(dev, msg, &fd, 1); + if (ret < 0) { + return ret; + } + + if (track_ramblocks) { + uint64_t reply_gpa; + + ret = vhost_user_read(dev, &msg_reply); + if (ret < 0) { + return ret; + } + + reply_gpa = msg_reply.payload.mem_reg.region.guest_phys_addr; + + if (msg_reply.hdr.request != VHOST_USER_ADD_MEM_REG) { + error_report("%s: Received unexpected msg type." + "Expected %d received %d", __func__, + VHOST_USER_ADD_MEM_REG, + msg_reply.hdr.request); + return -EPROTO; + } + + /* + * We're using the same structure, just reusing one of the + * fields, so it should be the same size. + */ + if (msg_reply.hdr.size != msg->hdr.size) { + error_report("%s: Unexpected size for postcopy reply " + "%d vs %d", __func__, msg_reply.hdr.size, + msg->hdr.size); + return -EPROTO; + } + + /* Get the postcopy client base from the backend's reply. */ + if (reply_gpa == dev->mem->regions[reg_idx].guest_phys_addr) { + shadow_pcb[reg_idx] = + msg_reply.payload.mem_reg.region.userspace_addr; + trace_vhost_user_set_mem_table_postcopy( + msg_reply.payload.mem_reg.region.userspace_addr, + msg->payload.mem_reg.region.userspace_addr, + reg_fd_idx, reg_idx); + } else { + error_report("%s: invalid postcopy reply for region. " + "Got guest physical address %" PRIX64 ", expected " + "%" PRIX64, __func__, reply_gpa, + dev->mem->regions[reg_idx].guest_phys_addr); + return -EPROTO; + } + } else if (reply_supported) { + ret = process_message_reply(dev, msg); + if (ret) { + return ret; + } + } + } else if (track_ramblocks) { + u->region_rb_offset[reg_idx] = 0; + u->region_rb[reg_idx] = NULL; + } + + /* + * At this point, we know the backend has mapped in the new + * region, if the region has a valid file descriptor. + * + * The region should now be added to the shadow table. + */ + u->shadow_regions[u->num_shadow_regions].guest_phys_addr = + reg->guest_phys_addr; + u->shadow_regions[u->num_shadow_regions].userspace_addr = + reg->userspace_addr; + u->shadow_regions[u->num_shadow_regions].memory_size = + reg->memory_size; + u->num_shadow_regions++; + } + + return 0; +} + +static int vhost_user_add_remove_regions(struct vhost_dev *dev, + VhostUserMsg *msg, + bool reply_supported, + bool track_ramblocks) +{ + struct vhost_user *u = dev->opaque; + struct scrub_regions add_reg[VHOST_USER_MAX_RAM_SLOTS]; + struct scrub_regions rem_reg[VHOST_USER_MAX_RAM_SLOTS]; + uint64_t shadow_pcb[VHOST_USER_MAX_RAM_SLOTS] = {}; + int nr_add_reg, nr_rem_reg; + int ret; + + msg->hdr.size = sizeof(msg->payload.mem_reg); + + /* Find the regions which need to be removed or added. */ + scrub_shadow_regions(dev, add_reg, &nr_add_reg, rem_reg, &nr_rem_reg, + shadow_pcb, track_ramblocks); + + if (nr_rem_reg) { + ret = send_remove_regions(dev, rem_reg, nr_rem_reg, msg, + reply_supported); + if (ret < 0) { + goto err; + } + } + + if (nr_add_reg) { + ret = send_add_regions(dev, add_reg, nr_add_reg, msg, shadow_pcb, + reply_supported, track_ramblocks); + if (ret < 0) { + goto err; + } + } + + if (track_ramblocks) { + memcpy(u->postcopy_client_bases, shadow_pcb, + sizeof(uint64_t) * VHOST_USER_MAX_RAM_SLOTS); + /* + * Now we've registered this with the postcopy code, we ack to the + * client, because now we're in the position to be able to deal with + * any faults it generates. + */ + /* TODO: Use this for failure cases as well with a bad value. */ + msg->hdr.size = sizeof(msg->payload.u64); + msg->payload.u64 = 0; /* OK */ + + ret = vhost_user_write(dev, msg, NULL, 0); + if (ret < 0) { + return ret; + } + } + + return 0; + +err: + if (track_ramblocks) { + memcpy(u->postcopy_client_bases, shadow_pcb, + sizeof(uint64_t) * VHOST_USER_MAX_RAM_SLOTS); + } + + return ret; +} + +static int vhost_user_set_mem_table_postcopy(struct vhost_dev *dev, + struct vhost_memory *mem, + bool reply_supported, + bool config_mem_slots) +{ + struct vhost_user *u = dev->opaque; + int fds[VHOST_MEMORY_BASELINE_NREGIONS]; + size_t fd_num = 0; + VhostUserMsg msg_reply; + int region_i, msg_i; + int ret; + + VhostUserMsg msg = { + .hdr.flags = VHOST_USER_VERSION, + }; + + if (u->region_rb_len < dev->mem->nregions) { + u->region_rb = g_renew(RAMBlock*, u->region_rb, dev->mem->nregions); + u->region_rb_offset = g_renew(ram_addr_t, u->region_rb_offset, + dev->mem->nregions); + memset(&(u->region_rb[u->region_rb_len]), '\0', + sizeof(RAMBlock *) * (dev->mem->nregions - u->region_rb_len)); + memset(&(u->region_rb_offset[u->region_rb_len]), '\0', + sizeof(ram_addr_t) * (dev->mem->nregions - u->region_rb_len)); + u->region_rb_len = dev->mem->nregions; + } + + if (config_mem_slots) { + ret = vhost_user_add_remove_regions(dev, &msg, reply_supported, true); + if (ret < 0) { + return ret; + } + } else { + ret = vhost_user_fill_set_mem_table_msg(u, dev, &msg, fds, &fd_num, + true); + if (ret < 0) { + return ret; + } + + ret = vhost_user_write(dev, &msg, fds, fd_num); + if (ret < 0) { + return ret; + } + + ret = vhost_user_read(dev, &msg_reply); + if (ret < 0) { + return ret; + } + + if (msg_reply.hdr.request != VHOST_USER_SET_MEM_TABLE) { + error_report("%s: Received unexpected msg type." + "Expected %d received %d", __func__, + VHOST_USER_SET_MEM_TABLE, msg_reply.hdr.request); + return -EPROTO; + } + + /* + * We're using the same structure, just reusing one of the + * fields, so it should be the same size. + */ + if (msg_reply.hdr.size != msg.hdr.size) { + error_report("%s: Unexpected size for postcopy reply " + "%d vs %d", __func__, msg_reply.hdr.size, + msg.hdr.size); + return -EPROTO; + } + + memset(u->postcopy_client_bases, 0, + sizeof(uint64_t) * VHOST_USER_MAX_RAM_SLOTS); + + /* + * They're in the same order as the regions that were sent + * but some of the regions were skipped (above) if they + * didn't have fd's + */ + for (msg_i = 0, region_i = 0; + region_i < dev->mem->nregions; + region_i++) { + if (msg_i < fd_num && + msg_reply.payload.memory.regions[msg_i].guest_phys_addr == + dev->mem->regions[region_i].guest_phys_addr) { + u->postcopy_client_bases[region_i] = + msg_reply.payload.memory.regions[msg_i].userspace_addr; + trace_vhost_user_set_mem_table_postcopy( + msg_reply.payload.memory.regions[msg_i].userspace_addr, + msg.payload.memory.regions[msg_i].userspace_addr, + msg_i, region_i); + msg_i++; + } + } + if (msg_i != fd_num) { + error_report("%s: postcopy reply not fully consumed " + "%d vs %zd", + __func__, msg_i, fd_num); + return -EIO; + } + + /* + * Now we've registered this with the postcopy code, we ack to the + * client, because now we're in the position to be able to deal + * with any faults it generates. + */ + /* TODO: Use this for failure cases as well with a bad value. */ + msg.hdr.size = sizeof(msg.payload.u64); + msg.payload.u64 = 0; /* OK */ + ret = vhost_user_write(dev, &msg, NULL, 0); + if (ret < 0) { + return ret; + } + } + + return 0; +} + +static int vhost_user_set_mem_table(struct vhost_dev *dev, + struct vhost_memory *mem) +{ + struct vhost_user *u = dev->opaque; + int fds[VHOST_MEMORY_BASELINE_NREGIONS]; + size_t fd_num = 0; + bool do_postcopy = u->postcopy_listen && u->postcopy_fd.handler; + bool reply_supported = virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_REPLY_ACK); + bool config_mem_slots = + virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS); + int ret; + + if (do_postcopy) { + /* + * Postcopy has enough differences that it's best done in it's own + * version + */ + return vhost_user_set_mem_table_postcopy(dev, mem, reply_supported, + config_mem_slots); + } + + VhostUserMsg msg = { + .hdr.flags = VHOST_USER_VERSION, + }; + + if (reply_supported) { + msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK; + } + + if (config_mem_slots) { + ret = vhost_user_add_remove_regions(dev, &msg, reply_supported, false); + if (ret < 0) { + return ret; + } + } else { + ret = vhost_user_fill_set_mem_table_msg(u, dev, &msg, fds, &fd_num, + false); + if (ret < 0) { + return ret; + } + + ret = vhost_user_write(dev, &msg, fds, fd_num); + if (ret < 0) { + return ret; + } + + if (reply_supported) { + return process_message_reply(dev, &msg); + } + } + + return 0; +} + +static int vhost_user_set_vring_endian(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + bool cross_endian = virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_CROSS_ENDIAN); + VhostUserMsg msg = { + .hdr.request = VHOST_USER_SET_VRING_ENDIAN, + .hdr.flags = VHOST_USER_VERSION, + .payload.state = *ring, + .hdr.size = sizeof(msg.payload.state), + }; + + if (!cross_endian) { + error_report("vhost-user trying to send unhandled ioctl"); + return -ENOTSUP; + } + + return vhost_user_write(dev, &msg, NULL, 0); +} + +static int vhost_set_vring(struct vhost_dev *dev, + unsigned long int request, + struct vhost_vring_state *ring) +{ + VhostUserMsg msg = { + .hdr.request = request, + .hdr.flags = VHOST_USER_VERSION, + .payload.state = *ring, + .hdr.size = sizeof(msg.payload.state), + }; + + return vhost_user_write(dev, &msg, NULL, 0); +} + +static int vhost_user_set_vring_num(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + return vhost_set_vring(dev, VHOST_USER_SET_VRING_NUM, ring); +} + +static void vhost_user_host_notifier_free(VhostUserHostNotifier *n) +{ + assert(n && n->unmap_addr); + munmap(n->unmap_addr, qemu_real_host_page_size()); + n->unmap_addr = NULL; +} + +/* + * clean-up function for notifier, will finally free the structure + * under rcu. + */ +static void vhost_user_host_notifier_remove(VhostUserHostNotifier *n, + VirtIODevice *vdev) +{ + if (n->addr) { + if (vdev) { + virtio_queue_set_host_notifier_mr(vdev, n->idx, &n->mr, false); + } + assert(!n->unmap_addr); + n->unmap_addr = n->addr; + n->addr = NULL; + call_rcu(n, vhost_user_host_notifier_free, rcu); + } +} + +static int vhost_user_set_vring_base(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + return vhost_set_vring(dev, VHOST_USER_SET_VRING_BASE, ring); +} + +static int vhost_user_set_vring_enable(struct vhost_dev *dev, int enable) +{ + int i; + + if (!virtio_has_feature(dev->features, VHOST_USER_F_PROTOCOL_FEATURES)) { + return -EINVAL; + } + + for (i = 0; i < dev->nvqs; ++i) { + int ret; + struct vhost_vring_state state = { + .index = dev->vq_index + i, + .num = enable, + }; + + ret = vhost_set_vring(dev, VHOST_USER_SET_VRING_ENABLE, &state); + if (ret < 0) { + /* + * Restoring the previous state is likely infeasible, as well as + * proceeding regardless the error, so just bail out and hope for + * the device-level recovery. + */ + return ret; + } + } + + return 0; +} + +static VhostUserHostNotifier *fetch_notifier(VhostUserState *u, + int idx) +{ + if (idx >= u->notifiers->len) { + return NULL; + } + return g_ptr_array_index(u->notifiers, idx); +} + +static int vhost_user_get_vring_base(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + int ret; + VhostUserMsg msg = { + .hdr.request = VHOST_USER_GET_VRING_BASE, + .hdr.flags = VHOST_USER_VERSION, + .payload.state = *ring, + .hdr.size = sizeof(msg.payload.state), + }; + struct vhost_user *u = dev->opaque; + + VhostUserHostNotifier *n = fetch_notifier(u->user, ring->index); + if (n) { + vhost_user_host_notifier_remove(n, dev->vdev); + } + + ret = vhost_user_write(dev, &msg, NULL, 0); + if (ret < 0) { + return ret; + } + + ret = vhost_user_read(dev, &msg); + if (ret < 0) { + return ret; + } + + if (msg.hdr.request != VHOST_USER_GET_VRING_BASE) { + error_report("Received unexpected msg type. Expected %d received %d", + VHOST_USER_GET_VRING_BASE, msg.hdr.request); + return -EPROTO; + } + + if (msg.hdr.size != sizeof(msg.payload.state)) { + error_report("Received bad msg size."); + return -EPROTO; + } + + *ring = msg.payload.state; + + return 0; +} + +static int vhost_set_vring_file(struct vhost_dev *dev, + VhostUserRequest request, + struct vhost_vring_file *file) +{ + int fds[VHOST_USER_MAX_RAM_SLOTS]; + size_t fd_num = 0; + VhostUserMsg msg = { + .hdr.request = request, + .hdr.flags = VHOST_USER_VERSION, + .payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK, + .hdr.size = sizeof(msg.payload.u64), + }; + + if (ioeventfd_enabled() && file->fd > 0) { + fds[fd_num++] = file->fd; + } else { + msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK; + } + + return vhost_user_write(dev, &msg, fds, fd_num); +} + +static int vhost_user_set_vring_kick(struct vhost_dev *dev, + struct vhost_vring_file *file) +{ + return vhost_set_vring_file(dev, VHOST_USER_SET_VRING_KICK, file); +} + +static int vhost_user_set_vring_call(struct vhost_dev *dev, + struct vhost_vring_file *file) +{ + return vhost_set_vring_file(dev, VHOST_USER_SET_VRING_CALL, file); +} + +static int vhost_user_set_vring_err(struct vhost_dev *dev, + struct vhost_vring_file *file) +{ + return vhost_set_vring_file(dev, VHOST_USER_SET_VRING_ERR, file); +} + +static int vhost_user_get_u64(struct vhost_dev *dev, int request, uint64_t *u64) +{ + int ret; + VhostUserMsg msg = { + .hdr.request = request, + .hdr.flags = VHOST_USER_VERSION, + }; + + if (vhost_user_one_time_request(request) && dev->vq_index != 0) { + return 0; + } + + ret = vhost_user_write(dev, &msg, NULL, 0); + if (ret < 0) { + return ret; + } + + ret = vhost_user_read(dev, &msg); + if (ret < 0) { + return ret; + } + + if (msg.hdr.request != request) { + error_report("Received unexpected msg type. Expected %d received %d", + request, msg.hdr.request); + return -EPROTO; + } + + if (msg.hdr.size != sizeof(msg.payload.u64)) { + error_report("Received bad msg size."); + return -EPROTO; + } + + *u64 = msg.payload.u64; + + return 0; +} + +static int vhost_user_get_features(struct vhost_dev *dev, uint64_t *features) +{ + if (vhost_user_get_u64(dev, VHOST_USER_GET_FEATURES, features) < 0) { + return -EPROTO; + } + + return 0; +} + +static int enforce_reply(struct vhost_dev *dev, + const VhostUserMsg *msg) +{ + uint64_t dummy; + + if (msg->hdr.flags & VHOST_USER_NEED_REPLY_MASK) { + return process_message_reply(dev, msg); + } + + /* + * We need to wait for a reply but the backend does not + * support replies for the command we just sent. + * Send VHOST_USER_GET_FEATURES which makes all backends + * send a reply. + */ + return vhost_user_get_features(dev, &dummy); +} + +static int vhost_user_set_vring_addr(struct vhost_dev *dev, + struct vhost_vring_addr *addr) +{ + int ret; + VhostUserMsg msg = { + .hdr.request = VHOST_USER_SET_VRING_ADDR, + .hdr.flags = VHOST_USER_VERSION, + .payload.addr = *addr, + .hdr.size = sizeof(msg.payload.addr), + }; + + bool reply_supported = virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_REPLY_ACK); + + /* + * wait for a reply if logging is enabled to make sure + * backend is actually logging changes + */ + bool wait_for_reply = addr->flags & (1 << VHOST_VRING_F_LOG); + + if (reply_supported && wait_for_reply) { + msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK; + } + + ret = vhost_user_write(dev, &msg, NULL, 0); + if (ret < 0) { + return ret; + } + + if (wait_for_reply) { + return enforce_reply(dev, &msg); + } + + return 0; +} + +static int vhost_user_set_u64(struct vhost_dev *dev, int request, uint64_t u64, + bool wait_for_reply) +{ + VhostUserMsg msg = { + .hdr.request = request, + .hdr.flags = VHOST_USER_VERSION, + .payload.u64 = u64, + .hdr.size = sizeof(msg.payload.u64), + }; + int ret; + + if (wait_for_reply) { + bool reply_supported = virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_REPLY_ACK); + if (reply_supported) { + msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK; + } + } + + ret = vhost_user_write(dev, &msg, NULL, 0); + if (ret < 0) { + return ret; + } + + if (wait_for_reply) { + return enforce_reply(dev, &msg); + } + + return 0; +} + +static int vhost_user_set_status(struct vhost_dev *dev, uint8_t status) +{ + return vhost_user_set_u64(dev, VHOST_USER_SET_STATUS, status, false); +} + +static int vhost_user_get_status(struct vhost_dev *dev, uint8_t *status) +{ + uint64_t value; + int ret; + + ret = vhost_user_get_u64(dev, VHOST_USER_GET_STATUS, &value); + if (ret < 0) { + return ret; + } + *status = value; + + return 0; +} + +static int vhost_user_add_status(struct vhost_dev *dev, uint8_t status) +{ + uint8_t s; + int ret; + + ret = vhost_user_get_status(dev, &s); + if (ret < 0) { + return ret; + } + + if ((s & status) == status) { + return 0; + } + s |= status; + + return vhost_user_set_status(dev, s); +} + +static int vhost_user_set_features(struct vhost_dev *dev, + uint64_t features) +{ + /* + * wait for a reply if logging is enabled to make sure + * backend is actually logging changes + */ + bool log_enabled = features & (0x1ULL << VHOST_F_LOG_ALL); + int ret; + + /* + * We need to include any extra backend only feature bits that + * might be needed by our device. Currently this includes the + * VHOST_USER_F_PROTOCOL_FEATURES bit for enabling protocol + * features. + */ + ret = vhost_user_set_u64(dev, VHOST_USER_SET_FEATURES, + features | dev->backend_features, + log_enabled); + + if (virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_STATUS)) { + if (!ret) { + return vhost_user_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK); + } + } + + return ret; +} + +static int vhost_user_set_protocol_features(struct vhost_dev *dev, + uint64_t features) +{ + return vhost_user_set_u64(dev, VHOST_USER_SET_PROTOCOL_FEATURES, features, + false); +} + +static int vhost_user_set_owner(struct vhost_dev *dev) +{ + VhostUserMsg msg = { + .hdr.request = VHOST_USER_SET_OWNER, + .hdr.flags = VHOST_USER_VERSION, + }; + + return vhost_user_write(dev, &msg, NULL, 0); +} + +static int vhost_user_get_max_memslots(struct vhost_dev *dev, + uint64_t *max_memslots) +{ + uint64_t backend_max_memslots; + int err; + + err = vhost_user_get_u64(dev, VHOST_USER_GET_MAX_MEM_SLOTS, + &backend_max_memslots); + if (err < 0) { + return err; + } + + *max_memslots = backend_max_memslots; + + return 0; +} + +static int vhost_user_reset_device(struct vhost_dev *dev) +{ + VhostUserMsg msg = { + .hdr.flags = VHOST_USER_VERSION, + }; + + msg.hdr.request = virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_RESET_DEVICE) + ? VHOST_USER_RESET_DEVICE + : VHOST_USER_RESET_OWNER; + + return vhost_user_write(dev, &msg, NULL, 0); +} + +static int vhost_user_slave_handle_config_change(struct vhost_dev *dev) +{ + if (!dev->config_ops || !dev->config_ops->vhost_dev_config_notifier) { + return -ENOSYS; + } + + return dev->config_ops->vhost_dev_config_notifier(dev); +} + +/* + * Fetch or create the notifier for a given idx. Newly created + * notifiers are added to the pointer array that tracks them. + */ +static VhostUserHostNotifier *fetch_or_create_notifier(VhostUserState *u, + int idx) +{ + VhostUserHostNotifier *n = NULL; + if (idx >= u->notifiers->len) { + g_ptr_array_set_size(u->notifiers, idx + 1); + } + + n = g_ptr_array_index(u->notifiers, idx); + if (!n) { + /* + * In case notification arrive out-of-order, + * make room for current index. + */ + g_ptr_array_remove_index(u->notifiers, idx); + n = g_new0(VhostUserHostNotifier, 1); + n->idx = idx; + g_ptr_array_insert(u->notifiers, idx, n); + trace_vhost_user_create_notifier(idx, n); + } + + return n; +} + +static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev, + VhostUserVringArea *area, + int fd) +{ + int queue_idx = area->u64 & VHOST_USER_VRING_IDX_MASK; + size_t page_size = qemu_real_host_page_size(); + struct vhost_user *u = dev->opaque; + VhostUserState *user = u->user; + VirtIODevice *vdev = dev->vdev; + VhostUserHostNotifier *n; + void *addr; + char *name; + + if (!virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) || + vdev == NULL || queue_idx >= virtio_get_num_queues(vdev)) { + return -EINVAL; + } + + /* + * Fetch notifier and invalidate any old data before setting up + * new mapped address. + */ + n = fetch_or_create_notifier(user, queue_idx); + vhost_user_host_notifier_remove(n, vdev); + + if (area->u64 & VHOST_USER_VRING_NOFD_MASK) { + return 0; + } + + /* Sanity check. */ + if (area->size != page_size) { + return -EINVAL; + } + + addr = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, + fd, area->offset); + if (addr == MAP_FAILED) { + return -EFAULT; + } + + name = g_strdup_printf("vhost-user/host-notifier@%p mmaps[%d]", + user, queue_idx); + if (!n->mr.ram) { /* Don't init again after suspend. */ + memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name, + page_size, addr); + } else { + n->mr.ram_block->host = addr; + } + g_free(name); + + if (virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, true)) { + object_unparent(OBJECT(&n->mr)); + munmap(addr, page_size); + return -ENXIO; + } + + n->addr = addr; + + return 0; +} + +static void close_slave_channel(struct vhost_user *u) +{ + g_source_destroy(u->slave_src); + g_source_unref(u->slave_src); + u->slave_src = NULL; + object_unref(OBJECT(u->slave_ioc)); + u->slave_ioc = NULL; +} + +static gboolean slave_read(QIOChannel *ioc, GIOCondition condition, + gpointer opaque) +{ + struct vhost_dev *dev = opaque; + struct vhost_user *u = dev->opaque; + VhostUserHeader hdr = { 0, }; + VhostUserPayload payload = { 0, }; + Error *local_err = NULL; + gboolean rc = G_SOURCE_CONTINUE; + int ret = 0; + struct iovec iov; + g_autofree int *fd = NULL; + size_t fdsize = 0; + int i; + + /* Read header */ + iov.iov_base = &hdr; + iov.iov_len = VHOST_USER_HDR_SIZE; + + if (qio_channel_readv_full_all(ioc, &iov, 1, &fd, &fdsize, &local_err)) { + error_report_err(local_err); + goto err; + } + + if (hdr.size > VHOST_USER_PAYLOAD_SIZE) { + error_report("Failed to read msg header." + " Size %d exceeds the maximum %zu.", hdr.size, + VHOST_USER_PAYLOAD_SIZE); + goto err; + } + + /* Read payload */ + if (qio_channel_read_all(ioc, (char *) &payload, hdr.size, &local_err)) { + error_report_err(local_err); + goto err; + } + + switch (hdr.request) { + case VHOST_USER_SLAVE_IOTLB_MSG: + ret = vhost_backend_handle_iotlb_msg(dev, &payload.iotlb); + break; + case VHOST_USER_SLAVE_CONFIG_CHANGE_MSG : + ret = vhost_user_slave_handle_config_change(dev); + break; + case VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG: + ret = vhost_user_slave_handle_vring_host_notifier(dev, &payload.area, + fd ? fd[0] : -1); + break; + default: + error_report("Received unexpected msg type: %d.", hdr.request); + ret = -EINVAL; + } + + /* + * REPLY_ACK feature handling. Other reply types has to be managed + * directly in their request handlers. + */ + if (hdr.flags & VHOST_USER_NEED_REPLY_MASK) { + struct iovec iovec[2]; + + + hdr.flags &= ~VHOST_USER_NEED_REPLY_MASK; + hdr.flags |= VHOST_USER_REPLY_MASK; + + payload.u64 = !!ret; + hdr.size = sizeof(payload.u64); + + iovec[0].iov_base = &hdr; + iovec[0].iov_len = VHOST_USER_HDR_SIZE; + iovec[1].iov_base = &payload; + iovec[1].iov_len = hdr.size; + + if (qio_channel_writev_all(ioc, iovec, ARRAY_SIZE(iovec), &local_err)) { + error_report_err(local_err); + goto err; + } + } + + goto fdcleanup; + +err: + close_slave_channel(u); + rc = G_SOURCE_REMOVE; + +fdcleanup: + if (fd) { + for (i = 0; i < fdsize; i++) { + close(fd[i]); + } + } + return rc; +} + +static int vhost_setup_slave_channel(struct vhost_dev *dev) +{ + VhostUserMsg msg = { + .hdr.request = VHOST_USER_SET_SLAVE_REQ_FD, + .hdr.flags = VHOST_USER_VERSION, + }; + struct vhost_user *u = dev->opaque; + int sv[2], ret = 0; + bool reply_supported = virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_REPLY_ACK); + Error *local_err = NULL; + QIOChannel *ioc; + + if (!virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_SLAVE_REQ)) { + return 0; + } + + if (qemu_socketpair(PF_UNIX, SOCK_STREAM, 0, sv) == -1) { + int saved_errno = errno; + error_report("socketpair() failed"); + return -saved_errno; + } + + ioc = QIO_CHANNEL(qio_channel_socket_new_fd(sv[0], &local_err)); + if (!ioc) { + error_report_err(local_err); + return -ECONNREFUSED; + } + u->slave_ioc = ioc; + slave_update_read_handler(dev, NULL); + + if (reply_supported) { + msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK; + } + + ret = vhost_user_write(dev, &msg, &sv[1], 1); + if (ret) { + goto out; + } + + if (reply_supported) { + ret = process_message_reply(dev, &msg); + } + +out: + close(sv[1]); + if (ret) { + close_slave_channel(u); + } + + return ret; +} + +#ifdef CONFIG_LINUX +/* + * Called back from the postcopy fault thread when a fault is received on our + * ufd. + * TODO: This is Linux specific + */ +static int vhost_user_postcopy_fault_handler(struct PostCopyFD *pcfd, + void *ufd) +{ + struct vhost_dev *dev = pcfd->data; + struct vhost_user *u = dev->opaque; + struct uffd_msg *msg = ufd; + uint64_t faultaddr = msg->arg.pagefault.address; + RAMBlock *rb = NULL; + uint64_t rb_offset; + int i; + + trace_vhost_user_postcopy_fault_handler(pcfd->idstr, faultaddr, + dev->mem->nregions); + for (i = 0; i < MIN(dev->mem->nregions, u->region_rb_len); i++) { + trace_vhost_user_postcopy_fault_handler_loop(i, + u->postcopy_client_bases[i], dev->mem->regions[i].memory_size); + if (faultaddr >= u->postcopy_client_bases[i]) { + /* Ofset of the fault address in the vhost region */ + uint64_t region_offset = faultaddr - u->postcopy_client_bases[i]; + if (region_offset < dev->mem->regions[i].memory_size) { + rb_offset = region_offset + u->region_rb_offset[i]; + trace_vhost_user_postcopy_fault_handler_found(i, + region_offset, rb_offset); + rb = u->region_rb[i]; + return postcopy_request_shared_page(pcfd, rb, faultaddr, + rb_offset); + } + } + } + error_report("%s: Failed to find region for fault %" PRIx64, + __func__, faultaddr); + return -1; +} + +static int vhost_user_postcopy_waker(struct PostCopyFD *pcfd, RAMBlock *rb, + uint64_t offset) +{ + struct vhost_dev *dev = pcfd->data; + struct vhost_user *u = dev->opaque; + int i; + + trace_vhost_user_postcopy_waker(qemu_ram_get_idstr(rb), offset); + + if (!u) { + return 0; + } + /* Translate the offset into an address in the clients address space */ + for (i = 0; i < MIN(dev->mem->nregions, u->region_rb_len); i++) { + if (u->region_rb[i] == rb && + offset >= u->region_rb_offset[i] && + offset < (u->region_rb_offset[i] + + dev->mem->regions[i].memory_size)) { + uint64_t client_addr = (offset - u->region_rb_offset[i]) + + u->postcopy_client_bases[i]; + trace_vhost_user_postcopy_waker_found(client_addr); + return postcopy_wake_shared(pcfd, client_addr, rb); + } + } + + trace_vhost_user_postcopy_waker_nomatch(qemu_ram_get_idstr(rb), offset); + return 0; +} +#endif + +/* + * Called at the start of an inbound postcopy on reception of the + * 'advise' command. + */ +static int vhost_user_postcopy_advise(struct vhost_dev *dev, Error **errp) +{ +#ifdef CONFIG_LINUX + struct vhost_user *u = dev->opaque; + CharBackend *chr = u->user->chr; + int ufd; + int ret; + VhostUserMsg msg = { + .hdr.request = VHOST_USER_POSTCOPY_ADVISE, + .hdr.flags = VHOST_USER_VERSION, + }; + + ret = vhost_user_write(dev, &msg, NULL, 0); + if (ret < 0) { + error_setg(errp, "Failed to send postcopy_advise to vhost"); + return ret; + } + + ret = vhost_user_read(dev, &msg); + if (ret < 0) { + error_setg(errp, "Failed to get postcopy_advise reply from vhost"); + return ret; + } + + if (msg.hdr.request != VHOST_USER_POSTCOPY_ADVISE) { + error_setg(errp, "Unexpected msg type. Expected %d received %d", + VHOST_USER_POSTCOPY_ADVISE, msg.hdr.request); + return -EPROTO; + } + + if (msg.hdr.size) { + error_setg(errp, "Received bad msg size."); + return -EPROTO; + } + ufd = qemu_chr_fe_get_msgfd(chr); + if (ufd < 0) { + error_setg(errp, "%s: Failed to get ufd", __func__); + return -EIO; + } + qemu_socket_set_nonblock(ufd); + + /* register ufd with userfault thread */ + u->postcopy_fd.fd = ufd; + u->postcopy_fd.data = dev; + u->postcopy_fd.handler = vhost_user_postcopy_fault_handler; + u->postcopy_fd.waker = vhost_user_postcopy_waker; + u->postcopy_fd.idstr = "vhost-user"; /* Need to find unique name */ + postcopy_register_shared_ufd(&u->postcopy_fd); + return 0; +#else + error_setg(errp, "Postcopy not supported on non-Linux systems"); + return -ENOSYS; +#endif +} + +/* + * Called at the switch to postcopy on reception of the 'listen' command. + */ +static int vhost_user_postcopy_listen(struct vhost_dev *dev, Error **errp) +{ + struct vhost_user *u = dev->opaque; + int ret; + VhostUserMsg msg = { + .hdr.request = VHOST_USER_POSTCOPY_LISTEN, + .hdr.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK, + }; + u->postcopy_listen = true; + + trace_vhost_user_postcopy_listen(); + + ret = vhost_user_write(dev, &msg, NULL, 0); + if (ret < 0) { + error_setg(errp, "Failed to send postcopy_listen to vhost"); + return ret; + } + + ret = process_message_reply(dev, &msg); + if (ret) { + error_setg(errp, "Failed to receive reply to postcopy_listen"); + return ret; + } + + return 0; +} + +/* + * Called at the end of postcopy + */ +static int vhost_user_postcopy_end(struct vhost_dev *dev, Error **errp) +{ + VhostUserMsg msg = { + .hdr.request = VHOST_USER_POSTCOPY_END, + .hdr.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK, + }; + int ret; + struct vhost_user *u = dev->opaque; + + trace_vhost_user_postcopy_end_entry(); + + ret = vhost_user_write(dev, &msg, NULL, 0); + if (ret < 0) { + error_setg(errp, "Failed to send postcopy_end to vhost"); + return ret; + } + + ret = process_message_reply(dev, &msg); + if (ret) { + error_setg(errp, "Failed to receive reply to postcopy_end"); + return ret; + } + postcopy_unregister_shared_ufd(&u->postcopy_fd); + close(u->postcopy_fd.fd); + u->postcopy_fd.handler = NULL; + + trace_vhost_user_postcopy_end_exit(); + + return 0; +} + +static int vhost_user_postcopy_notifier(NotifierWithReturn *notifier, + void *opaque) +{ + struct PostcopyNotifyData *pnd = opaque; + struct vhost_user *u = container_of(notifier, struct vhost_user, + postcopy_notifier); + struct vhost_dev *dev = u->dev; + + switch (pnd->reason) { + case POSTCOPY_NOTIFY_PROBE: + if (!virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_PAGEFAULT)) { + /* TODO: Get the device name into this error somehow */ + error_setg(pnd->errp, + "vhost-user backend not capable of postcopy"); + return -ENOENT; + } + break; + + case POSTCOPY_NOTIFY_INBOUND_ADVISE: + return vhost_user_postcopy_advise(dev, pnd->errp); + + case POSTCOPY_NOTIFY_INBOUND_LISTEN: + return vhost_user_postcopy_listen(dev, pnd->errp); + + case POSTCOPY_NOTIFY_INBOUND_END: + return vhost_user_postcopy_end(dev, pnd->errp); + + default: + /* We ignore notifications we don't know */ + break; + } + + return 0; +} + +static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque, + Error **errp) +{ + uint64_t features, ram_slots; + struct vhost_user *u; + VhostUserState *vus = (VhostUserState *) opaque; + int err; + + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); + + u = g_new0(struct vhost_user, 1); + u->user = vus; + u->dev = dev; + dev->opaque = u; + + err = vhost_user_get_features(dev, &features); + if (err < 0) { + error_setg_errno(errp, -err, "vhost_backend_init failed"); + return err; + } + + if (virtio_has_feature(features, VHOST_USER_F_PROTOCOL_FEATURES)) { + bool supports_f_config = vus->supports_config || + (dev->config_ops && dev->config_ops->vhost_dev_config_notifier); + uint64_t protocol_features; + + dev->backend_features |= 1ULL << VHOST_USER_F_PROTOCOL_FEATURES; + + err = vhost_user_get_u64(dev, VHOST_USER_GET_PROTOCOL_FEATURES, + &protocol_features); + if (err < 0) { + error_setg_errno(errp, EPROTO, "vhost_backend_init failed"); + return -EPROTO; + } + + /* + * We will use all the protocol features we support - although + * we suppress F_CONFIG if we know QEMUs internal code can not support + * it. + */ + protocol_features &= VHOST_USER_PROTOCOL_FEATURE_MASK; + + if (supports_f_config) { + if (!virtio_has_feature(protocol_features, + VHOST_USER_PROTOCOL_F_CONFIG)) { + error_setg(errp, "vhost-user device expecting " + "VHOST_USER_PROTOCOL_F_CONFIG but the vhost-user backend does " + "not support it."); + return -EPROTO; + } + } else { + if (virtio_has_feature(protocol_features, + VHOST_USER_PROTOCOL_F_CONFIG)) { + warn_reportf_err(*errp, "vhost-user backend supports " + "VHOST_USER_PROTOCOL_F_CONFIG but QEMU does not."); + protocol_features &= ~(1ULL << VHOST_USER_PROTOCOL_F_CONFIG); + } + } + + /* final set of protocol features */ + dev->protocol_features = protocol_features; + err = vhost_user_set_protocol_features(dev, dev->protocol_features); + if (err < 0) { + error_setg_errno(errp, EPROTO, "vhost_backend_init failed"); + return -EPROTO; + } + + /* query the max queues we support if backend supports Multiple Queue */ + if (dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_MQ)) { + err = vhost_user_get_u64(dev, VHOST_USER_GET_QUEUE_NUM, + &dev->max_queues); + if (err < 0) { + error_setg_errno(errp, EPROTO, "vhost_backend_init failed"); + return -EPROTO; + } + } else { + dev->max_queues = 1; + } + + if (dev->num_queues && dev->max_queues < dev->num_queues) { + error_setg(errp, "The maximum number of queues supported by the " + "backend is %" PRIu64, dev->max_queues); + return -EINVAL; + } + + if (virtio_has_feature(features, VIRTIO_F_IOMMU_PLATFORM) && + !(virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_SLAVE_REQ) && + virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_REPLY_ACK))) { + error_setg(errp, "IOMMU support requires reply-ack and " + "slave-req protocol features."); + return -EINVAL; + } + + /* get max memory regions if backend supports configurable RAM slots */ + if (!virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS)) { + u->user->memory_slots = VHOST_MEMORY_BASELINE_NREGIONS; + } else { + err = vhost_user_get_max_memslots(dev, &ram_slots); + if (err < 0) { + error_setg_errno(errp, EPROTO, "vhost_backend_init failed"); + return -EPROTO; + } + + if (ram_slots < u->user->memory_slots) { + error_setg(errp, "The backend specified a max ram slots limit " + "of %" PRIu64", when the prior validated limit was " + "%d. This limit should never decrease.", ram_slots, + u->user->memory_slots); + return -EINVAL; + } + + u->user->memory_slots = MIN(ram_slots, VHOST_USER_MAX_RAM_SLOTS); + } + } + + if (dev->migration_blocker == NULL && + !virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_LOG_SHMFD)) { + error_setg(&dev->migration_blocker, + "Migration disabled: vhost-user backend lacks " + "VHOST_USER_PROTOCOL_F_LOG_SHMFD feature."); + } + + if (dev->vq_index == 0) { + err = vhost_setup_slave_channel(dev); + if (err < 0) { + error_setg_errno(errp, EPROTO, "vhost_backend_init failed"); + return -EPROTO; + } + } + + u->postcopy_notifier.notify = vhost_user_postcopy_notifier; + postcopy_add_notifier(&u->postcopy_notifier); + + return 0; +} + +static int vhost_user_backend_cleanup(struct vhost_dev *dev) +{ + struct vhost_user *u; + + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); + + u = dev->opaque; + if (u->postcopy_notifier.notify) { + postcopy_remove_notifier(&u->postcopy_notifier); + u->postcopy_notifier.notify = NULL; + } + u->postcopy_listen = false; + if (u->postcopy_fd.handler) { + postcopy_unregister_shared_ufd(&u->postcopy_fd); + close(u->postcopy_fd.fd); + u->postcopy_fd.handler = NULL; + } + if (u->slave_ioc) { + close_slave_channel(u); + } + g_free(u->region_rb); + u->region_rb = NULL; + g_free(u->region_rb_offset); + u->region_rb_offset = NULL; + u->region_rb_len = 0; + g_free(u); + dev->opaque = 0; + + return 0; +} + +static int vhost_user_get_vq_index(struct vhost_dev *dev, int idx) +{ + assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs); + + return idx; +} + +static int vhost_user_memslots_limit(struct vhost_dev *dev) +{ + struct vhost_user *u = dev->opaque; + + return u->user->memory_slots; +} + +static bool vhost_user_requires_shm_log(struct vhost_dev *dev) +{ + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); + + return virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_LOG_SHMFD); +} + +static int vhost_user_migration_done(struct vhost_dev *dev, char* mac_addr) +{ + VhostUserMsg msg = { }; + + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); + + /* If guest supports GUEST_ANNOUNCE do nothing */ + if (virtio_has_feature(dev->acked_features, VIRTIO_NET_F_GUEST_ANNOUNCE)) { + return 0; + } + + /* if backend supports VHOST_USER_PROTOCOL_F_RARP ask it to send the RARP */ + if (virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_RARP)) { + msg.hdr.request = VHOST_USER_SEND_RARP; + msg.hdr.flags = VHOST_USER_VERSION; + memcpy((char *)&msg.payload.u64, mac_addr, 6); + msg.hdr.size = sizeof(msg.payload.u64); + + return vhost_user_write(dev, &msg, NULL, 0); + } + return -ENOTSUP; +} + +static bool vhost_user_can_merge(struct vhost_dev *dev, + uint64_t start1, uint64_t size1, + uint64_t start2, uint64_t size2) +{ + ram_addr_t offset; + int mfd, rfd; + + (void)vhost_user_get_mr_data(start1, &offset, &mfd); + (void)vhost_user_get_mr_data(start2, &offset, &rfd); + + return mfd == rfd; +} + +static int vhost_user_net_set_mtu(struct vhost_dev *dev, uint16_t mtu) +{ + VhostUserMsg msg; + bool reply_supported = virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_REPLY_ACK); + int ret; + + if (!(dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU))) { + return 0; + } + + msg.hdr.request = VHOST_USER_NET_SET_MTU; + msg.payload.u64 = mtu; + msg.hdr.size = sizeof(msg.payload.u64); + msg.hdr.flags = VHOST_USER_VERSION; + if (reply_supported) { + msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK; + } + + ret = vhost_user_write(dev, &msg, NULL, 0); + if (ret < 0) { + return ret; + } + + /* If reply_ack supported, slave has to ack specified MTU is valid */ + if (reply_supported) { + return process_message_reply(dev, &msg); + } + + return 0; +} + +static int vhost_user_send_device_iotlb_msg(struct vhost_dev *dev, + struct vhost_iotlb_msg *imsg) +{ + int ret; + VhostUserMsg msg = { + .hdr.request = VHOST_USER_IOTLB_MSG, + .hdr.size = sizeof(msg.payload.iotlb), + .hdr.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK, + .payload.iotlb = *imsg, + }; + + ret = vhost_user_write(dev, &msg, NULL, 0); + if (ret < 0) { + return ret; + } + + return process_message_reply(dev, &msg); +} + + +static void vhost_user_set_iotlb_callback(struct vhost_dev *dev, int enabled) +{ + /* No-op as the receive channel is not dedicated to IOTLB messages. */ +} + +static int vhost_user_get_config(struct vhost_dev *dev, uint8_t *config, + uint32_t config_len, Error **errp) +{ + int ret; + VhostUserMsg msg = { + .hdr.request = VHOST_USER_GET_CONFIG, + .hdr.flags = VHOST_USER_VERSION, + .hdr.size = VHOST_USER_CONFIG_HDR_SIZE + config_len, + }; + + if (!virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_CONFIG)) { + error_setg(errp, "VHOST_USER_PROTOCOL_F_CONFIG not supported"); + return -EINVAL; + } + + assert(config_len <= VHOST_USER_MAX_CONFIG_SIZE); + + msg.payload.config.offset = 0; + msg.payload.config.size = config_len; + ret = vhost_user_write(dev, &msg, NULL, 0); + if (ret < 0) { + error_setg_errno(errp, -ret, "vhost_get_config failed"); + return ret; + } + + ret = vhost_user_read(dev, &msg); + if (ret < 0) { + error_setg_errno(errp, -ret, "vhost_get_config failed"); + return ret; + } + + if (msg.hdr.request != VHOST_USER_GET_CONFIG) { + error_setg(errp, + "Received unexpected msg type. Expected %d received %d", + VHOST_USER_GET_CONFIG, msg.hdr.request); + return -EPROTO; + } + + if (msg.hdr.size != VHOST_USER_CONFIG_HDR_SIZE + config_len) { + error_setg(errp, "Received bad msg size."); + return -EPROTO; + } + + memcpy(config, msg.payload.config.region, config_len); + + return 0; +} + +static int vhost_user_set_config(struct vhost_dev *dev, const uint8_t *data, + uint32_t offset, uint32_t size, uint32_t flags) +{ + int ret; + uint8_t *p; + bool reply_supported = virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_REPLY_ACK); + + VhostUserMsg msg = { + .hdr.request = VHOST_USER_SET_CONFIG, + .hdr.flags = VHOST_USER_VERSION, + .hdr.size = VHOST_USER_CONFIG_HDR_SIZE + size, + }; + + if (!virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_CONFIG)) { + return -ENOTSUP; + } + + if (reply_supported) { + msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK; + } + + if (size > VHOST_USER_MAX_CONFIG_SIZE) { + return -EINVAL; + } + + msg.payload.config.offset = offset, + msg.payload.config.size = size, + msg.payload.config.flags = flags, + p = msg.payload.config.region; + memcpy(p, data, size); + + ret = vhost_user_write(dev, &msg, NULL, 0); + if (ret < 0) { + return ret; + } + + if (reply_supported) { + return process_message_reply(dev, &msg); + } + + return 0; +} + +static int vhost_user_crypto_create_session(struct vhost_dev *dev, + void *session_info, + uint64_t *session_id) +{ + int ret; + bool crypto_session = virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_CRYPTO_SESSION); + CryptoDevBackendSymSessionInfo *sess_info = session_info; + VhostUserMsg msg = { + .hdr.request = VHOST_USER_CREATE_CRYPTO_SESSION, + .hdr.flags = VHOST_USER_VERSION, + .hdr.size = sizeof(msg.payload.session), + }; + + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); + + if (!crypto_session) { + error_report("vhost-user trying to send unhandled ioctl"); + return -ENOTSUP; + } + + memcpy(&msg.payload.session.session_setup_data, sess_info, + sizeof(CryptoDevBackendSymSessionInfo)); + if (sess_info->key_len) { + memcpy(&msg.payload.session.key, sess_info->cipher_key, + sess_info->key_len); + } + if (sess_info->auth_key_len > 0) { + memcpy(&msg.payload.session.auth_key, sess_info->auth_key, + sess_info->auth_key_len); + } + ret = vhost_user_write(dev, &msg, NULL, 0); + if (ret < 0) { + error_report("vhost_user_write() return %d, create session failed", + ret); + return ret; + } + + ret = vhost_user_read(dev, &msg); + if (ret < 0) { + error_report("vhost_user_read() return %d, create session failed", + ret); + return ret; + } + + if (msg.hdr.request != VHOST_USER_CREATE_CRYPTO_SESSION) { + error_report("Received unexpected msg type. Expected %d received %d", + VHOST_USER_CREATE_CRYPTO_SESSION, msg.hdr.request); + return -EPROTO; + } + + if (msg.hdr.size != sizeof(msg.payload.session)) { + error_report("Received bad msg size."); + return -EPROTO; + } + + if (msg.payload.session.session_id < 0) { + error_report("Bad session id: %" PRId64 "", + msg.payload.session.session_id); + return -EINVAL; + } + *session_id = msg.payload.session.session_id; + + return 0; +} + +static int +vhost_user_crypto_close_session(struct vhost_dev *dev, uint64_t session_id) +{ + int ret; + bool crypto_session = virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_CRYPTO_SESSION); + VhostUserMsg msg = { + .hdr.request = VHOST_USER_CLOSE_CRYPTO_SESSION, + .hdr.flags = VHOST_USER_VERSION, + .hdr.size = sizeof(msg.payload.u64), + }; + msg.payload.u64 = session_id; + + if (!crypto_session) { + error_report("vhost-user trying to send unhandled ioctl"); + return -ENOTSUP; + } + + ret = vhost_user_write(dev, &msg, NULL, 0); + if (ret < 0) { + error_report("vhost_user_write() return %d, close session failed", + ret); + return ret; + } + + return 0; +} + +static bool vhost_user_mem_section_filter(struct vhost_dev *dev, + MemoryRegionSection *section) +{ + bool result; + + result = memory_region_get_fd(section->mr) >= 0; + + return result; +} + +static int vhost_user_get_inflight_fd(struct vhost_dev *dev, + uint16_t queue_size, + struct vhost_inflight *inflight) +{ + void *addr; + int fd; + int ret; + struct vhost_user *u = dev->opaque; + CharBackend *chr = u->user->chr; + VhostUserMsg msg = { + .hdr.request = VHOST_USER_GET_INFLIGHT_FD, + .hdr.flags = VHOST_USER_VERSION, + .payload.inflight.num_queues = dev->nvqs, + .payload.inflight.queue_size = queue_size, + .hdr.size = sizeof(msg.payload.inflight), + }; + + if (!virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) { + return 0; + } + + ret = vhost_user_write(dev, &msg, NULL, 0); + if (ret < 0) { + return ret; + } + + ret = vhost_user_read(dev, &msg); + if (ret < 0) { + return ret; + } + + if (msg.hdr.request != VHOST_USER_GET_INFLIGHT_FD) { + error_report("Received unexpected msg type. " + "Expected %d received %d", + VHOST_USER_GET_INFLIGHT_FD, msg.hdr.request); + return -EPROTO; + } + + if (msg.hdr.size != sizeof(msg.payload.inflight)) { + error_report("Received bad msg size."); + return -EPROTO; + } + + if (!msg.payload.inflight.mmap_size) { + return 0; + } + + fd = qemu_chr_fe_get_msgfd(chr); + if (fd < 0) { + error_report("Failed to get mem fd"); + return -EIO; + } + + addr = mmap(0, msg.payload.inflight.mmap_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, msg.payload.inflight.mmap_offset); + + if (addr == MAP_FAILED) { + error_report("Failed to mmap mem fd"); + close(fd); + return -EFAULT; + } + + inflight->addr = addr; + inflight->fd = fd; + inflight->size = msg.payload.inflight.mmap_size; + inflight->offset = msg.payload.inflight.mmap_offset; + inflight->queue_size = queue_size; + + return 0; +} + +static int vhost_user_set_inflight_fd(struct vhost_dev *dev, + struct vhost_inflight *inflight) +{ + VhostUserMsg msg = { + .hdr.request = VHOST_USER_SET_INFLIGHT_FD, + .hdr.flags = VHOST_USER_VERSION, + .payload.inflight.mmap_size = inflight->size, + .payload.inflight.mmap_offset = inflight->offset, + .payload.inflight.num_queues = dev->nvqs, + .payload.inflight.queue_size = inflight->queue_size, + .hdr.size = sizeof(msg.payload.inflight), + }; + + if (!virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) { + return 0; + } + + return vhost_user_write(dev, &msg, &inflight->fd, 1); +} + +static void vhost_user_state_destroy(gpointer data) +{ + VhostUserHostNotifier *n = (VhostUserHostNotifier *) data; + if (n) { + vhost_user_host_notifier_remove(n, NULL); + object_unparent(OBJECT(&n->mr)); + /* + * We can't free until vhost_user_host_notifier_remove has + * done it's thing so schedule the free with RCU. + */ + g_free_rcu(n, rcu); + } +} + +bool vhost_user_init(VhostUserState *user, CharBackend *chr, Error **errp) +{ + if (user->chr) { + error_setg(errp, "Cannot initialize vhost-user state"); + return false; + } + user->chr = chr; + user->memory_slots = 0; + user->notifiers = g_ptr_array_new_full(VIRTIO_QUEUE_MAX / 4, + &vhost_user_state_destroy); + return true; +} + +void vhost_user_cleanup(VhostUserState *user) +{ + if (!user->chr) { + return; + } + memory_region_transaction_begin(); + user->notifiers = (GPtrArray *) g_ptr_array_free(user->notifiers, true); + memory_region_transaction_commit(); + user->chr = NULL; +} + + +typedef struct { + vu_async_close_fn cb; + DeviceState *dev; + CharBackend *cd; + struct vhost_dev *vhost; +} VhostAsyncCallback; + +static void vhost_user_async_close_bh(void *opaque) +{ + VhostAsyncCallback *data = opaque; + struct vhost_dev *vhost = data->vhost; + + /* + * If the vhost_dev has been cleared in the meantime there is + * nothing left to do as some other path has completed the + * cleanup. + */ + if (vhost->vdev) { + data->cb(data->dev); + } + + g_free(data); +} + +/* + * We only schedule the work if the machine is running. If suspended + * we want to keep all the in-flight data as is for migration + * purposes. + */ +void vhost_user_async_close(DeviceState *d, + CharBackend *chardev, struct vhost_dev *vhost, + vu_async_close_fn cb) +{ + if (!runstate_check(RUN_STATE_SHUTDOWN)) { + /* + * A close event may happen during a read/write, but vhost + * code assumes the vhost_dev remains setup, so delay the + * stop & clear. + */ + AioContext *ctx = qemu_get_current_aio_context(); + VhostAsyncCallback *data = g_new0(VhostAsyncCallback, 1); + + /* Save data for the callback */ + data->cb = cb; + data->dev = d; + data->cd = chardev; + data->vhost = vhost; + + /* Disable any further notifications on the chardev */ + qemu_chr_fe_set_handlers(chardev, + NULL, NULL, NULL, NULL, NULL, NULL, + false); + + aio_bh_schedule_oneshot(ctx, vhost_user_async_close_bh, data); + + /* + * Move vhost device to the stopped state. The vhost-user device + * will be clean up and disconnected in BH. This can be useful in + * the vhost migration code. If disconnect was caught there is an + * option for the general vhost code to get the dev state without + * knowing its type (in this case vhost-user). + * + * Note if the vhost device is fully cleared by the time we + * execute the bottom half we won't continue with the cleanup. + */ + vhost->started = false; + } +} + +static int vhost_user_dev_start(struct vhost_dev *dev, bool started) +{ + if (!virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_STATUS)) { + return 0; + } + + /* Set device status only for last queue pair */ + if (dev->vq_index + dev->nvqs != dev->vq_index_end) { + return 0; + } + + if (started) { + return vhost_user_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE | + VIRTIO_CONFIG_S_DRIVER | + VIRTIO_CONFIG_S_DRIVER_OK); + } else { + return vhost_user_set_status(dev, 0); + } +} + +const VhostOps user_ops = { + .backend_type = VHOST_BACKEND_TYPE_USER, + .vhost_backend_init = vhost_user_backend_init, + .vhost_backend_cleanup = vhost_user_backend_cleanup, + .vhost_backend_memslots_limit = vhost_user_memslots_limit, + .vhost_set_log_base = vhost_user_set_log_base, + .vhost_set_mem_table = vhost_user_set_mem_table, + .vhost_set_vring_addr = vhost_user_set_vring_addr, + .vhost_set_vring_endian = vhost_user_set_vring_endian, + .vhost_set_vring_num = vhost_user_set_vring_num, + .vhost_set_vring_base = vhost_user_set_vring_base, + .vhost_get_vring_base = vhost_user_get_vring_base, + .vhost_set_vring_kick = vhost_user_set_vring_kick, + .vhost_set_vring_call = vhost_user_set_vring_call, + .vhost_set_vring_err = vhost_user_set_vring_err, + .vhost_set_features = vhost_user_set_features, + .vhost_get_features = vhost_user_get_features, + .vhost_set_owner = vhost_user_set_owner, + .vhost_reset_device = vhost_user_reset_device, + .vhost_get_vq_index = vhost_user_get_vq_index, + .vhost_set_vring_enable = vhost_user_set_vring_enable, + .vhost_requires_shm_log = vhost_user_requires_shm_log, + .vhost_migration_done = vhost_user_migration_done, + .vhost_backend_can_merge = vhost_user_can_merge, + .vhost_net_set_mtu = vhost_user_net_set_mtu, + .vhost_set_iotlb_callback = vhost_user_set_iotlb_callback, + .vhost_send_device_iotlb_msg = vhost_user_send_device_iotlb_msg, + .vhost_get_config = vhost_user_get_config, + .vhost_set_config = vhost_user_set_config, + .vhost_crypto_create_session = vhost_user_crypto_create_session, + .vhost_crypto_close_session = vhost_user_crypto_close_session, + .vhost_backend_mem_section_filter = vhost_user_mem_section_filter, + .vhost_get_inflight_fd = vhost_user_get_inflight_fd, + .vhost_set_inflight_fd = vhost_user_set_inflight_fd, + .vhost_dev_start = vhost_user_dev_start, +}; diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c new file mode 100644 index 00000000..7468e44b --- /dev/null +++ b/hw/virtio/vhost-vdpa.c @@ -0,0 +1,1313 @@ +/* + * vhost-vdpa + * + * Copyright(c) 2017-2018 Intel Corporation. + * Copyright(c) 2020 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include <linux/vhost.h> +#include <linux/vfio.h> +#include <sys/eventfd.h> +#include <sys/ioctl.h> +#include "hw/virtio/vhost.h" +#include "hw/virtio/vhost-backend.h" +#include "hw/virtio/virtio-net.h" +#include "hw/virtio/vhost-shadow-virtqueue.h" +#include "hw/virtio/vhost-vdpa.h" +#include "exec/address-spaces.h" +#include "migration/blocker.h" +#include "qemu/cutils.h" +#include "qemu/main-loop.h" +#include "cpu.h" +#include "trace.h" +#include "qapi/error.h" + +/* + * Return one past the end of the end of section. Be careful with uint64_t + * conversions! + */ +static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section) +{ + Int128 llend = int128_make64(section->offset_within_address_space); + llend = int128_add(llend, section->size); + llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK)); + + return llend; +} + +static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section, + uint64_t iova_min, + uint64_t iova_max) +{ + Int128 llend; + + if ((!memory_region_is_ram(section->mr) && + !memory_region_is_iommu(section->mr)) || + memory_region_is_protected(section->mr) || + /* vhost-vDPA doesn't allow MMIO to be mapped */ + memory_region_is_ram_device(section->mr)) { + return true; + } + + if (section->offset_within_address_space < iova_min) { + error_report("RAM section out of device range (min=0x%" PRIx64 + ", addr=0x%" HWADDR_PRIx ")", + iova_min, section->offset_within_address_space); + return true; + } + + llend = vhost_vdpa_section_end(section); + if (int128_gt(llend, int128_make64(iova_max))) { + error_report("RAM section out of device range (max=0x%" PRIx64 + ", end addr=0x%" PRIx64 ")", + iova_max, int128_get64(llend)); + return true; + } + + return false; +} + +int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size, + void *vaddr, bool readonly) +{ + struct vhost_msg_v2 msg = {}; + int fd = v->device_fd; + int ret = 0; + + msg.type = v->msg_type; + msg.iotlb.iova = iova; + msg.iotlb.size = size; + msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr; + msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW; + msg.iotlb.type = VHOST_IOTLB_UPDATE; + + trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.iotlb.iova, msg.iotlb.size, + msg.iotlb.uaddr, msg.iotlb.perm, msg.iotlb.type); + + if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) { + error_report("failed to write, fd=%d, errno=%d (%s)", + fd, errno, strerror(errno)); + return -EIO ; + } + + return ret; +} + +int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, hwaddr size) +{ + struct vhost_msg_v2 msg = {}; + int fd = v->device_fd; + int ret = 0; + + msg.type = v->msg_type; + msg.iotlb.iova = iova; + msg.iotlb.size = size; + msg.iotlb.type = VHOST_IOTLB_INVALIDATE; + + trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.iotlb.iova, + msg.iotlb.size, msg.iotlb.type); + + if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) { + error_report("failed to write, fd=%d, errno=%d (%s)", + fd, errno, strerror(errno)); + return -EIO ; + } + + return ret; +} + +static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa *v) +{ + int fd = v->device_fd; + struct vhost_msg_v2 msg = { + .type = v->msg_type, + .iotlb.type = VHOST_IOTLB_BATCH_BEGIN, + }; + + trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type); + if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) { + error_report("failed to write, fd=%d, errno=%d (%s)", + fd, errno, strerror(errno)); + } +} + +static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa *v) +{ + if (v->dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) && + !v->iotlb_batch_begin_sent) { + vhost_vdpa_listener_begin_batch(v); + } + + v->iotlb_batch_begin_sent = true; +} + +static void vhost_vdpa_listener_commit(MemoryListener *listener) +{ + struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener); + struct vhost_dev *dev = v->dev; + struct vhost_msg_v2 msg = {}; + int fd = v->device_fd; + + if (!(dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) { + return; + } + + if (!v->iotlb_batch_begin_sent) { + return; + } + + msg.type = v->msg_type; + msg.iotlb.type = VHOST_IOTLB_BATCH_END; + + trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type); + if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) { + error_report("failed to write, fd=%d, errno=%d (%s)", + fd, errno, strerror(errno)); + } + + v->iotlb_batch_begin_sent = false; +} + +static void vhost_vdpa_listener_region_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + DMAMap mem_region = {}; + struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener); + hwaddr iova; + Int128 llend, llsize; + void *vaddr; + int ret; + + if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first, + v->iova_range.last)) { + return; + } + + if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) != + (section->offset_within_region & ~TARGET_PAGE_MASK))) { + error_report("%s received unaligned region", __func__); + return; + } + + iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); + llend = vhost_vdpa_section_end(section); + if (int128_ge(int128_make64(iova), llend)) { + return; + } + + memory_region_ref(section->mr); + + /* Here we assume that memory_region_is_ram(section->mr)==true */ + + vaddr = memory_region_get_ram_ptr(section->mr) + + section->offset_within_region + + (iova - section->offset_within_address_space); + + trace_vhost_vdpa_listener_region_add(v, iova, int128_get64(llend), + vaddr, section->readonly); + + llsize = int128_sub(llend, int128_make64(iova)); + if (v->shadow_vqs_enabled) { + int r; + + mem_region.translated_addr = (hwaddr)(uintptr_t)vaddr, + mem_region.size = int128_get64(llsize) - 1, + mem_region.perm = IOMMU_ACCESS_FLAG(true, section->readonly), + + r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region); + if (unlikely(r != IOVA_OK)) { + error_report("Can't allocate a mapping (%d)", r); + goto fail; + } + + iova = mem_region.iova; + } + + vhost_vdpa_iotlb_batch_begin_once(v); + ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize), + vaddr, section->readonly); + if (ret) { + error_report("vhost vdpa map fail!"); + goto fail_map; + } + + return; + +fail_map: + if (v->shadow_vqs_enabled) { + vhost_iova_tree_remove(v->iova_tree, mem_region); + } + +fail: + /* + * On the initfn path, store the first error in the container so we + * can gracefully fail. Runtime, there's not much we can do other + * than throw a hardware error. + */ + error_report("vhost-vdpa: DMA mapping failed, unable to continue"); + return; + +} + +static void vhost_vdpa_listener_region_del(MemoryListener *listener, + MemoryRegionSection *section) +{ + struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener); + hwaddr iova; + Int128 llend, llsize; + int ret; + + if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first, + v->iova_range.last)) { + return; + } + + if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) != + (section->offset_within_region & ~TARGET_PAGE_MASK))) { + error_report("%s received unaligned region", __func__); + return; + } + + iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); + llend = vhost_vdpa_section_end(section); + + trace_vhost_vdpa_listener_region_del(v, iova, int128_get64(llend)); + + if (int128_ge(int128_make64(iova), llend)) { + return; + } + + llsize = int128_sub(llend, int128_make64(iova)); + + if (v->shadow_vqs_enabled) { + const DMAMap *result; + const void *vaddr = memory_region_get_ram_ptr(section->mr) + + section->offset_within_region + + (iova - section->offset_within_address_space); + DMAMap mem_region = { + .translated_addr = (hwaddr)(uintptr_t)vaddr, + .size = int128_get64(llsize) - 1, + }; + + result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region); + if (!result) { + /* The memory listener map wasn't mapped */ + return; + } + iova = result->iova; + vhost_iova_tree_remove(v->iova_tree, *result); + } + vhost_vdpa_iotlb_batch_begin_once(v); + ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize)); + if (ret) { + error_report("vhost_vdpa dma unmap error!"); + } + + memory_region_unref(section->mr); +} +/* + * IOTLB API is used by vhost-vdpa which requires incremental updating + * of the mapping. So we can not use generic vhost memory listener which + * depends on the addnop(). + */ +static const MemoryListener vhost_vdpa_memory_listener = { + .name = "vhost-vdpa", + .commit = vhost_vdpa_listener_commit, + .region_add = vhost_vdpa_listener_region_add, + .region_del = vhost_vdpa_listener_region_del, +}; + +static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request, + void *arg) +{ + struct vhost_vdpa *v = dev->opaque; + int fd = v->device_fd; + int ret; + + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA); + + ret = ioctl(fd, request, arg); + return ret < 0 ? -errno : ret; +} + +static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status) +{ + uint8_t s; + int ret; + + trace_vhost_vdpa_add_status(dev, status); + ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s); + if (ret < 0) { + return ret; + } + + s |= status; + + ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s); + if (ret < 0) { + return ret; + } + + ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s); + if (ret < 0) { + return ret; + } + + if (!(s & status)) { + return -EIO; + } + + return 0; +} + +static void vhost_vdpa_get_iova_range(struct vhost_vdpa *v) +{ + int ret = vhost_vdpa_call(v->dev, VHOST_VDPA_GET_IOVA_RANGE, + &v->iova_range); + if (ret != 0) { + v->iova_range.first = 0; + v->iova_range.last = UINT64_MAX; + } + + trace_vhost_vdpa_get_iova_range(v->dev, v->iova_range.first, + v->iova_range.last); +} + +/* + * The use of this function is for requests that only need to be + * applied once. Typically such request occurs at the beginning + * of operation, and before setting up queues. It should not be + * used for request that performs operation until all queues are + * set, which would need to check dev->vq_index_end instead. + */ +static bool vhost_vdpa_first_dev(struct vhost_dev *dev) +{ + struct vhost_vdpa *v = dev->opaque; + + return v->index == 0; +} + +static int vhost_vdpa_get_dev_features(struct vhost_dev *dev, + uint64_t *features) +{ + int ret; + + ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features); + trace_vhost_vdpa_get_features(dev, *features); + return ret; +} + +static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, + Error **errp) +{ + g_autoptr(GPtrArray) shadow_vqs = NULL; + uint64_t dev_features, svq_features; + int r; + bool ok; + + if (!v->shadow_vqs_enabled) { + return 0; + } + + r = vhost_vdpa_get_dev_features(hdev, &dev_features); + if (r != 0) { + error_setg_errno(errp, -r, "Can't get vdpa device features"); + return r; + } + + svq_features = dev_features; + ok = vhost_svq_valid_features(svq_features, errp); + if (unlikely(!ok)) { + return -1; + } + + shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free); + for (unsigned n = 0; n < hdev->nvqs; ++n) { + g_autoptr(VhostShadowVirtqueue) svq; + + svq = vhost_svq_new(v->iova_tree, v->shadow_vq_ops, + v->shadow_vq_ops_opaque); + if (unlikely(!svq)) { + error_setg(errp, "Cannot create svq %u", n); + return -1; + } + g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq)); + } + + v->shadow_vqs = g_steal_pointer(&shadow_vqs); + return 0; +} + +static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) +{ + struct vhost_vdpa *v; + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA); + trace_vhost_vdpa_init(dev, opaque); + int ret; + + /* + * Similar to VFIO, we end up pinning all guest memory and have to + * disable discarding of RAM. + */ + ret = ram_block_discard_disable(true); + if (ret) { + error_report("Cannot set discarding of RAM broken"); + return ret; + } + + v = opaque; + v->dev = dev; + dev->opaque = opaque ; + v->listener = vhost_vdpa_memory_listener; + v->msg_type = VHOST_IOTLB_MSG_V2; + ret = vhost_vdpa_init_svq(dev, v, errp); + if (ret) { + goto err; + } + + vhost_vdpa_get_iova_range(v); + + if (!vhost_vdpa_first_dev(dev)) { + return 0; + } + + vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE | + VIRTIO_CONFIG_S_DRIVER); + + return 0; + +err: + ram_block_discard_disable(false); + return ret; +} + +static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev, + int queue_index) +{ + size_t page_size = qemu_real_host_page_size(); + struct vhost_vdpa *v = dev->opaque; + VirtIODevice *vdev = dev->vdev; + VhostVDPAHostNotifier *n; + + n = &v->notifier[queue_index]; + + if (n->addr) { + virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false); + object_unparent(OBJECT(&n->mr)); + munmap(n->addr, page_size); + n->addr = NULL; + } +} + +static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index) +{ + size_t page_size = qemu_real_host_page_size(); + struct vhost_vdpa *v = dev->opaque; + VirtIODevice *vdev = dev->vdev; + VhostVDPAHostNotifier *n; + int fd = v->device_fd; + void *addr; + char *name; + + vhost_vdpa_host_notifier_uninit(dev, queue_index); + + n = &v->notifier[queue_index]; + + addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd, + queue_index * page_size); + if (addr == MAP_FAILED) { + goto err; + } + + name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]", + v, queue_index); + memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name, + page_size, addr); + g_free(name); + + if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) { + object_unparent(OBJECT(&n->mr)); + munmap(addr, page_size); + goto err; + } + n->addr = addr; + + return 0; + +err: + return -1; +} + +static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n) +{ + int i; + + for (i = dev->vq_index; i < dev->vq_index + n; i++) { + vhost_vdpa_host_notifier_uninit(dev, i); + } +} + +static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev) +{ + struct vhost_vdpa *v = dev->opaque; + int i; + + if (v->shadow_vqs_enabled) { + /* FIXME SVQ is not compatible with host notifiers mr */ + return; + } + + for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) { + if (vhost_vdpa_host_notifier_init(dev, i)) { + goto err; + } + } + + return; + +err: + vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index); + return; +} + +static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev) +{ + struct vhost_vdpa *v = dev->opaque; + size_t idx; + + if (!v->shadow_vqs) { + return; + } + + for (idx = 0; idx < v->shadow_vqs->len; ++idx) { + vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx)); + } + g_ptr_array_free(v->shadow_vqs, true); +} + +static int vhost_vdpa_cleanup(struct vhost_dev *dev) +{ + struct vhost_vdpa *v; + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA); + v = dev->opaque; + trace_vhost_vdpa_cleanup(dev, v); + vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); + memory_listener_unregister(&v->listener); + vhost_vdpa_svq_cleanup(dev); + + dev->opaque = NULL; + ram_block_discard_disable(false); + + return 0; +} + +static int vhost_vdpa_memslots_limit(struct vhost_dev *dev) +{ + trace_vhost_vdpa_memslots_limit(dev, INT_MAX); + return INT_MAX; +} + +static int vhost_vdpa_set_mem_table(struct vhost_dev *dev, + struct vhost_memory *mem) +{ + if (!vhost_vdpa_first_dev(dev)) { + return 0; + } + + trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding); + if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) && + trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) { + int i; + for (i = 0; i < mem->nregions; i++) { + trace_vhost_vdpa_dump_regions(dev, i, + mem->regions[i].guest_phys_addr, + mem->regions[i].memory_size, + mem->regions[i].userspace_addr, + mem->regions[i].flags_padding); + } + } + if (mem->padding) { + return -EINVAL; + } + + return 0; +} + +static int vhost_vdpa_set_features(struct vhost_dev *dev, + uint64_t features) +{ + struct vhost_vdpa *v = dev->opaque; + int ret; + + if (!vhost_vdpa_first_dev(dev)) { + return 0; + } + + if (v->shadow_vqs_enabled) { + if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) { + /* + * QEMU is just trying to enable or disable logging. SVQ handles + * this sepparately, so no need to forward this. + */ + v->acked_features = features; + return 0; + } + + v->acked_features = features; + + /* We must not ack _F_LOG if SVQ is enabled */ + features &= ~BIT_ULL(VHOST_F_LOG_ALL); + } + + trace_vhost_vdpa_set_features(dev, features); + ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features); + if (ret) { + return ret; + } + + return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK); +} + +static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev) +{ + uint64_t features; + uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 | + 0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH; + int r; + + if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) { + return -EFAULT; + } + + features &= f; + + if (vhost_vdpa_first_dev(dev)) { + r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features); + if (r) { + return -EFAULT; + } + } + + dev->backend_cap = features; + + return 0; +} + +static int vhost_vdpa_get_device_id(struct vhost_dev *dev, + uint32_t *device_id) +{ + int ret; + ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id); + trace_vhost_vdpa_get_device_id(dev, *device_id); + return ret; +} + +static void vhost_vdpa_reset_svq(struct vhost_vdpa *v) +{ + if (!v->shadow_vqs_enabled) { + return; + } + + for (unsigned i = 0; i < v->shadow_vqs->len; ++i) { + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); + vhost_svq_stop(svq); + } +} + +static int vhost_vdpa_reset_device(struct vhost_dev *dev) +{ + struct vhost_vdpa *v = dev->opaque; + int ret; + uint8_t status = 0; + + vhost_vdpa_reset_svq(v); + + ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status); + trace_vhost_vdpa_reset_device(dev, status); + return ret; +} + +static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx) +{ + assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs); + + trace_vhost_vdpa_get_vq_index(dev, idx, idx); + return idx; +} + +static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev) +{ + int i; + trace_vhost_vdpa_set_vring_ready(dev); + for (i = 0; i < dev->nvqs; ++i) { + struct vhost_vring_state state = { + .index = dev->vq_index + i, + .num = 1, + }; + vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state); + } + return 0; +} + +static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config, + uint32_t config_len) +{ + int b, len; + char line[QEMU_HEXDUMP_LINE_LEN]; + + for (b = 0; b < config_len; b += 16) { + len = config_len - b; + qemu_hexdump_line(line, b, config, len, false); + trace_vhost_vdpa_dump_config(dev, line); + } +} + +static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data, + uint32_t offset, uint32_t size, + uint32_t flags) +{ + struct vhost_vdpa_config *config; + int ret; + unsigned long config_size = offsetof(struct vhost_vdpa_config, buf); + + trace_vhost_vdpa_set_config(dev, offset, size, flags); + config = g_malloc(size + config_size); + config->off = offset; + config->len = size; + memcpy(config->buf, data, size); + if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) && + trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) { + vhost_vdpa_dump_config(dev, data, size); + } + ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config); + g_free(config); + return ret; +} + +static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config, + uint32_t config_len, Error **errp) +{ + struct vhost_vdpa_config *v_config; + unsigned long config_size = offsetof(struct vhost_vdpa_config, buf); + int ret; + + trace_vhost_vdpa_get_config(dev, config, config_len); + v_config = g_malloc(config_len + config_size); + v_config->len = config_len; + v_config->off = 0; + ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config); + memcpy(config, v_config->buf, config_len); + g_free(v_config); + if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) && + trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) { + vhost_vdpa_dump_config(dev, config, config_len); + } + return ret; + } + +static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num); + return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring); +} + +static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev, + struct vhost_vring_file *file) +{ + trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd); + return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file); +} + +static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev, + struct vhost_vring_file *file) +{ + trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd); + return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file); +} + +static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev, + struct vhost_vring_addr *addr) +{ + trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags, + addr->desc_user_addr, addr->used_user_addr, + addr->avail_user_addr, + addr->log_guest_addr); + + return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr); + +} + +/** + * Set the shadow virtqueue descriptors to the device + * + * @dev: The vhost device model + * @svq: The shadow virtqueue + * @idx: The index of the virtqueue in the vhost device + * @errp: Error + * + * Note that this function does not rewind kick file descriptor if cannot set + * call one. + */ +static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev, + VhostShadowVirtqueue *svq, unsigned idx, + Error **errp) +{ + struct vhost_vring_file file = { + .index = dev->vq_index + idx, + }; + const EventNotifier *event_notifier = &svq->hdev_kick; + int r; + + file.fd = event_notifier_get_fd(event_notifier); + r = vhost_vdpa_set_vring_dev_kick(dev, &file); + if (unlikely(r != 0)) { + error_setg_errno(errp, -r, "Can't set device kick fd"); + return r; + } + + event_notifier = &svq->hdev_call; + file.fd = event_notifier_get_fd(event_notifier); + r = vhost_vdpa_set_vring_dev_call(dev, &file); + if (unlikely(r != 0)) { + error_setg_errno(errp, -r, "Can't set device call fd"); + } + + return r; +} + +/** + * Unmap a SVQ area in the device + */ +static void vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr addr) +{ + const DMAMap needle = { + .translated_addr = addr, + }; + const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, &needle); + hwaddr size; + int r; + + if (unlikely(!result)) { + error_report("Unable to find SVQ address to unmap"); + return; + } + + size = ROUND_UP(result->size, qemu_real_host_page_size()); + r = vhost_vdpa_dma_unmap(v, result->iova, size); + if (unlikely(r < 0)) { + error_report("Unable to unmap SVQ vring: %s (%d)", g_strerror(-r), -r); + return; + } + + vhost_iova_tree_remove(v->iova_tree, *result); +} + +static void vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev, + const VhostShadowVirtqueue *svq) +{ + struct vhost_vdpa *v = dev->opaque; + struct vhost_vring_addr svq_addr; + + vhost_svq_get_vring_addr(svq, &svq_addr); + + vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr); + + vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr); +} + +/** + * Map the SVQ area in the device + * + * @v: Vhost-vdpa device + * @needle: The area to search iova + * @errorp: Error pointer + */ +static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle, + Error **errp) +{ + int r; + + r = vhost_iova_tree_map_alloc(v->iova_tree, needle); + if (unlikely(r != IOVA_OK)) { + error_setg(errp, "Cannot allocate iova (%d)", r); + return false; + } + + r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1, + (void *)(uintptr_t)needle->translated_addr, + needle->perm == IOMMU_RO); + if (unlikely(r != 0)) { + error_setg_errno(errp, -r, "Cannot map region to device"); + vhost_iova_tree_remove(v->iova_tree, *needle); + } + + return r == 0; +} + +/** + * Map the shadow virtqueue rings in the device + * + * @dev: The vhost device + * @svq: The shadow virtqueue + * @addr: Assigned IOVA addresses + * @errp: Error pointer + */ +static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev, + const VhostShadowVirtqueue *svq, + struct vhost_vring_addr *addr, + Error **errp) +{ + DMAMap device_region, driver_region; + struct vhost_vring_addr svq_addr; + struct vhost_vdpa *v = dev->opaque; + size_t device_size = vhost_svq_device_area_size(svq); + size_t driver_size = vhost_svq_driver_area_size(svq); + size_t avail_offset; + bool ok; + + ERRP_GUARD(); + vhost_svq_get_vring_addr(svq, &svq_addr); + + driver_region = (DMAMap) { + .translated_addr = svq_addr.desc_user_addr, + .size = driver_size - 1, + .perm = IOMMU_RO, + }; + ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp); + if (unlikely(!ok)) { + error_prepend(errp, "Cannot create vq driver region: "); + return false; + } + addr->desc_user_addr = driver_region.iova; + avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr; + addr->avail_user_addr = driver_region.iova + avail_offset; + + device_region = (DMAMap) { + .translated_addr = svq_addr.used_user_addr, + .size = device_size - 1, + .perm = IOMMU_RW, + }; + ok = vhost_vdpa_svq_map_ring(v, &device_region, errp); + if (unlikely(!ok)) { + error_prepend(errp, "Cannot create vq device region: "); + vhost_vdpa_svq_unmap_ring(v, driver_region.translated_addr); + } + addr->used_user_addr = device_region.iova; + + return ok; +} + +static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, + VhostShadowVirtqueue *svq, unsigned idx, + Error **errp) +{ + uint16_t vq_index = dev->vq_index + idx; + struct vhost_vring_state s = { + .index = vq_index, + }; + int r; + + r = vhost_vdpa_set_dev_vring_base(dev, &s); + if (unlikely(r)) { + error_setg_errno(errp, -r, "Cannot set vring base"); + return false; + } + + r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp); + return r == 0; +} + +static bool vhost_vdpa_svqs_start(struct vhost_dev *dev) +{ + struct vhost_vdpa *v = dev->opaque; + Error *err = NULL; + unsigned i; + + if (!v->shadow_vqs) { + return true; + } + + for (i = 0; i < v->shadow_vqs->len; ++i) { + VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i); + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); + struct vhost_vring_addr addr = { + .index = dev->vq_index + i, + }; + int r; + bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err); + if (unlikely(!ok)) { + goto err; + } + + vhost_svq_start(svq, dev->vdev, vq); + ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err); + if (unlikely(!ok)) { + goto err_map; + } + + /* Override vring GPA set by vhost subsystem */ + r = vhost_vdpa_set_vring_dev_addr(dev, &addr); + if (unlikely(r != 0)) { + error_setg_errno(&err, -r, "Cannot set device address"); + goto err_set_addr; + } + } + + return true; + +err_set_addr: + vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i)); + +err_map: + vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i)); + +err: + error_reportf_err(err, "Cannot setup SVQ %u: ", i); + for (unsigned j = 0; j < i; ++j) { + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j); + vhost_vdpa_svq_unmap_rings(dev, svq); + vhost_svq_stop(svq); + } + + return false; +} + +static void vhost_vdpa_svqs_stop(struct vhost_dev *dev) +{ + struct vhost_vdpa *v = dev->opaque; + + if (!v->shadow_vqs) { + return; + } + + for (unsigned i = 0; i < v->shadow_vqs->len; ++i) { + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); + vhost_vdpa_svq_unmap_rings(dev, svq); + } +} + +static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) +{ + struct vhost_vdpa *v = dev->opaque; + bool ok; + trace_vhost_vdpa_dev_start(dev, started); + + if (started) { + vhost_vdpa_host_notifiers_init(dev); + ok = vhost_vdpa_svqs_start(dev); + if (unlikely(!ok)) { + return -1; + } + vhost_vdpa_set_vring_ready(dev); + } else { + vhost_vdpa_svqs_stop(dev); + vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); + } + + if (dev->vq_index + dev->nvqs != dev->vq_index_end) { + return 0; + } + + if (started) { + memory_listener_register(&v->listener, &address_space_memory); + return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK); + } else { + vhost_vdpa_reset_device(dev); + vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE | + VIRTIO_CONFIG_S_DRIVER); + memory_listener_unregister(&v->listener); + + return 0; + } +} + +static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, + struct vhost_log *log) +{ + struct vhost_vdpa *v = dev->opaque; + if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) { + return 0; + } + + trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd, + log->log); + return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base); +} + +static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev, + struct vhost_vring_addr *addr) +{ + struct vhost_vdpa *v = dev->opaque; + + if (v->shadow_vqs_enabled) { + /* + * Device vring addr was set at device start. SVQ base is handled by + * VirtQueue code. + */ + return 0; + } + + return vhost_vdpa_set_vring_dev_addr(dev, addr); +} + +static int vhost_vdpa_set_vring_num(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num); + return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring); +} + +static int vhost_vdpa_set_vring_base(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + struct vhost_vdpa *v = dev->opaque; + VirtQueue *vq = virtio_get_queue(dev->vdev, ring->index); + + /* + * vhost-vdpa devices does not support in-flight requests. Set all of them + * as available. + * + * TODO: This is ok for networking, but other kinds of devices might + * have problems with these retransmissions. + */ + while (virtqueue_rewind(vq, 1)) { + continue; + } + if (v->shadow_vqs_enabled) { + /* + * Device vring base was set at device start. SVQ base is handled by + * VirtQueue code. + */ + return 0; + } + + return vhost_vdpa_set_dev_vring_base(dev, ring); +} + +static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + struct vhost_vdpa *v = dev->opaque; + int ret; + + if (v->shadow_vqs_enabled) { + ring->num = virtio_queue_get_last_avail_idx(dev->vdev, ring->index); + return 0; + } + + ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring); + trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num); + return ret; +} + +static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev, + struct vhost_vring_file *file) +{ + struct vhost_vdpa *v = dev->opaque; + int vdpa_idx = file->index - dev->vq_index; + + if (v->shadow_vqs_enabled) { + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx); + vhost_svq_set_svq_kick_fd(svq, file->fd); + return 0; + } else { + return vhost_vdpa_set_vring_dev_kick(dev, file); + } +} + +static int vhost_vdpa_set_vring_call(struct vhost_dev *dev, + struct vhost_vring_file *file) +{ + struct vhost_vdpa *v = dev->opaque; + + if (v->shadow_vqs_enabled) { + int vdpa_idx = file->index - dev->vq_index; + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx); + + vhost_svq_set_svq_call_fd(svq, file->fd); + return 0; + } else { + return vhost_vdpa_set_vring_dev_call(dev, file); + } +} + +static int vhost_vdpa_get_features(struct vhost_dev *dev, + uint64_t *features) +{ + struct vhost_vdpa *v = dev->opaque; + int ret = vhost_vdpa_get_dev_features(dev, features); + + if (ret == 0 && v->shadow_vqs_enabled) { + /* Add SVQ logging capabilities */ + *features |= BIT_ULL(VHOST_F_LOG_ALL); + } + + return ret; +} + +static int vhost_vdpa_set_owner(struct vhost_dev *dev) +{ + if (!vhost_vdpa_first_dev(dev)) { + return 0; + } + + trace_vhost_vdpa_set_owner(dev); + return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL); +} + +static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev, + struct vhost_vring_addr *addr, struct vhost_virtqueue *vq) +{ + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA); + addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys; + addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys; + addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys; + trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr, + addr->avail_user_addr, addr->used_user_addr); + return 0; +} + +static bool vhost_vdpa_force_iommu(struct vhost_dev *dev) +{ + return true; +} + +const VhostOps vdpa_ops = { + .backend_type = VHOST_BACKEND_TYPE_VDPA, + .vhost_backend_init = vhost_vdpa_init, + .vhost_backend_cleanup = vhost_vdpa_cleanup, + .vhost_set_log_base = vhost_vdpa_set_log_base, + .vhost_set_vring_addr = vhost_vdpa_set_vring_addr, + .vhost_set_vring_num = vhost_vdpa_set_vring_num, + .vhost_set_vring_base = vhost_vdpa_set_vring_base, + .vhost_get_vring_base = vhost_vdpa_get_vring_base, + .vhost_set_vring_kick = vhost_vdpa_set_vring_kick, + .vhost_set_vring_call = vhost_vdpa_set_vring_call, + .vhost_get_features = vhost_vdpa_get_features, + .vhost_set_backend_cap = vhost_vdpa_set_backend_cap, + .vhost_set_owner = vhost_vdpa_set_owner, + .vhost_set_vring_endian = NULL, + .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit, + .vhost_set_mem_table = vhost_vdpa_set_mem_table, + .vhost_set_features = vhost_vdpa_set_features, + .vhost_reset_device = vhost_vdpa_reset_device, + .vhost_get_vq_index = vhost_vdpa_get_vq_index, + .vhost_get_config = vhost_vdpa_get_config, + .vhost_set_config = vhost_vdpa_set_config, + .vhost_requires_shm_log = NULL, + .vhost_migration_done = NULL, + .vhost_backend_can_merge = NULL, + .vhost_net_set_mtu = NULL, + .vhost_set_iotlb_callback = NULL, + .vhost_send_device_iotlb_msg = NULL, + .vhost_dev_start = vhost_vdpa_dev_start, + .vhost_get_device_id = vhost_vdpa_get_device_id, + .vhost_vq_get_addr = vhost_vdpa_vq_get_addr, + .vhost_force_iommu = vhost_vdpa_force_iommu, +}; diff --git a/hw/virtio/vhost-vsock-common.c b/hw/virtio/vhost-vsock-common.c new file mode 100644 index 00000000..d21c72b4 --- /dev/null +++ b/hw/virtio/vhost-vsock-common.c @@ -0,0 +1,300 @@ +/* + * Parent class for vhost-vsock devices + * + * Copyright 2015-2020 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. See the COPYING file in the + * top-level directory. + */ + +#include "qemu/osdep.h" +#include "standard-headers/linux/virtio_vsock.h" +#include "qapi/error.h" +#include "hw/virtio/virtio-access.h" +#include "qemu/error-report.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/vhost.h" +#include "hw/virtio/vhost-vsock.h" +#include "qemu/iov.h" +#include "monitor/monitor.h" + +const int feature_bits[] = { + VIRTIO_VSOCK_F_SEQPACKET, + VIRTIO_F_RING_RESET, + VHOST_INVALID_FEATURE_BIT +}; + +uint64_t vhost_vsock_common_get_features(VirtIODevice *vdev, uint64_t features, + Error **errp) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + + if (vvc->seqpacket != ON_OFF_AUTO_OFF) { + virtio_add_feature(&features, VIRTIO_VSOCK_F_SEQPACKET); + } + + features = vhost_get_features(&vvc->vhost_dev, feature_bits, features); + + if (vvc->seqpacket == ON_OFF_AUTO_ON && + !virtio_has_feature(features, VIRTIO_VSOCK_F_SEQPACKET)) { + error_setg(errp, "vhost-vsock backend doesn't support seqpacket"); + } + + return features; +} + +int vhost_vsock_common_start(VirtIODevice *vdev) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + int ret; + int i; + + if (!k->set_guest_notifiers) { + error_report("binding does not support guest notifiers"); + return -ENOSYS; + } + + ret = vhost_dev_enable_notifiers(&vvc->vhost_dev, vdev); + if (ret < 0) { + error_report("Error enabling host notifiers: %d", -ret); + return ret; + } + + ret = k->set_guest_notifiers(qbus->parent, vvc->vhost_dev.nvqs, true); + if (ret < 0) { + error_report("Error binding guest notifier: %d", -ret); + goto err_host_notifiers; + } + + vvc->vhost_dev.acked_features = vdev->guest_features; + ret = vhost_dev_start(&vvc->vhost_dev, vdev, true); + if (ret < 0) { + error_report("Error starting vhost: %d", -ret); + goto err_guest_notifiers; + } + + /* + * guest_notifier_mask/pending not used yet, so just unmask + * everything here. virtio-pci will do the right thing by + * enabling/disabling irqfd. + */ + for (i = 0; i < vvc->vhost_dev.nvqs; i++) { + vhost_virtqueue_mask(&vvc->vhost_dev, vdev, i, false); + } + + return 0; + +err_guest_notifiers: + k->set_guest_notifiers(qbus->parent, vvc->vhost_dev.nvqs, false); +err_host_notifiers: + vhost_dev_disable_notifiers(&vvc->vhost_dev, vdev); + return ret; +} + +void vhost_vsock_common_stop(VirtIODevice *vdev) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + int ret; + + if (!k->set_guest_notifiers) { + return; + } + + vhost_dev_stop(&vvc->vhost_dev, vdev, true); + + ret = k->set_guest_notifiers(qbus->parent, vvc->vhost_dev.nvqs, false); + if (ret < 0) { + error_report("vhost guest notifier cleanup failed: %d", ret); + return; + } + + vhost_dev_disable_notifiers(&vvc->vhost_dev, vdev); +} + + +static void vhost_vsock_common_handle_output(VirtIODevice *vdev, VirtQueue *vq) +{ + /* Do nothing */ +} + +static void vhost_vsock_common_guest_notifier_mask(VirtIODevice *vdev, int idx, + bool mask) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + + vhost_virtqueue_mask(&vvc->vhost_dev, vdev, idx, mask); +} + +static bool vhost_vsock_common_guest_notifier_pending(VirtIODevice *vdev, + int idx) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + + return vhost_virtqueue_pending(&vvc->vhost_dev, idx); +} + +static void vhost_vsock_common_send_transport_reset(VHostVSockCommon *vvc) +{ + VirtQueueElement *elem; + VirtQueue *vq = vvc->event_vq; + struct virtio_vsock_event event = { + .id = cpu_to_le32(VIRTIO_VSOCK_EVENT_TRANSPORT_RESET), + }; + + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); + if (!elem) { + error_report("vhost-vsock missed transport reset event"); + return; + } + + if (elem->out_num) { + error_report("invalid vhost-vsock event virtqueue element with " + "out buffers"); + goto err; + } + + if (iov_from_buf(elem->in_sg, elem->in_num, 0, + &event, sizeof(event)) != sizeof(event)) { + error_report("vhost-vsock event virtqueue element is too short"); + goto err; + } + + virtqueue_push(vq, elem, sizeof(event)); + virtio_notify(VIRTIO_DEVICE(vvc), vq); + + g_free(elem); + return; + +err: + virtqueue_detach_element(vq, elem, 0); + g_free(elem); +} + +static void vhost_vsock_common_post_load_timer_cleanup(VHostVSockCommon *vvc) +{ + if (!vvc->post_load_timer) { + return; + } + + timer_free(vvc->post_load_timer); + vvc->post_load_timer = NULL; +} + +static void vhost_vsock_common_post_load_timer_cb(void *opaque) +{ + VHostVSockCommon *vvc = opaque; + + vhost_vsock_common_post_load_timer_cleanup(vvc); + vhost_vsock_common_send_transport_reset(vvc); +} + +int vhost_vsock_common_pre_save(void *opaque) +{ + VHostVSockCommon *vvc = opaque; + + /* + * At this point, backend must be stopped, otherwise + * it might keep writing to memory. + */ + assert(!vhost_dev_is_started(&vvc->vhost_dev)); + + return 0; +} + +int vhost_vsock_common_post_load(void *opaque, int version_id) +{ + VHostVSockCommon *vvc = opaque; + VirtIODevice *vdev = VIRTIO_DEVICE(vvc); + + if (virtio_queue_get_addr(vdev, 2)) { + /* + * Defer transport reset event to a vm clock timer so that virtqueue + * changes happen after migration has completed. + */ + assert(!vvc->post_load_timer); + vvc->post_load_timer = + timer_new_ns(QEMU_CLOCK_VIRTUAL, + vhost_vsock_common_post_load_timer_cb, + vvc); + timer_mod(vvc->post_load_timer, 1); + } + return 0; +} + +void vhost_vsock_common_realize(VirtIODevice *vdev) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + + virtio_init(vdev, VIRTIO_ID_VSOCK, sizeof(struct virtio_vsock_config)); + + /* Receive and transmit queues belong to vhost */ + vvc->recv_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE, + vhost_vsock_common_handle_output); + vvc->trans_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE, + vhost_vsock_common_handle_output); + + /* The event queue belongs to QEMU */ + vvc->event_vq = virtio_add_queue(vdev, VHOST_VSOCK_QUEUE_SIZE, + vhost_vsock_common_handle_output); + + vvc->vhost_dev.nvqs = ARRAY_SIZE(vvc->vhost_vqs); + vvc->vhost_dev.vqs = vvc->vhost_vqs; + + vvc->post_load_timer = NULL; +} + +void vhost_vsock_common_unrealize(VirtIODevice *vdev) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + + vhost_vsock_common_post_load_timer_cleanup(vvc); + + virtio_delete_queue(vvc->recv_vq); + virtio_delete_queue(vvc->trans_vq); + virtio_delete_queue(vvc->event_vq); + virtio_cleanup(vdev); +} + +static struct vhost_dev *vhost_vsock_common_get_vhost(VirtIODevice *vdev) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + return &vvc->vhost_dev; +} + +static Property vhost_vsock_common_properties[] = { + DEFINE_PROP_ON_OFF_AUTO("seqpacket", VHostVSockCommon, seqpacket, + ON_OFF_AUTO_AUTO), + DEFINE_PROP_END_OF_LIST(), +}; + +static void vhost_vsock_common_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + + device_class_set_props(dc, vhost_vsock_common_properties); + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + vdc->guest_notifier_mask = vhost_vsock_common_guest_notifier_mask; + vdc->guest_notifier_pending = vhost_vsock_common_guest_notifier_pending; + vdc->get_vhost = vhost_vsock_common_get_vhost; +} + +static const TypeInfo vhost_vsock_common_info = { + .name = TYPE_VHOST_VSOCK_COMMON, + .parent = TYPE_VIRTIO_DEVICE, + .instance_size = sizeof(VHostVSockCommon), + .class_init = vhost_vsock_common_class_init, + .abstract = true, +}; + +static void vhost_vsock_common_register_types(void) +{ + type_register_static(&vhost_vsock_common_info); +} + +type_init(vhost_vsock_common_register_types) diff --git a/hw/virtio/vhost-vsock-pci.c b/hw/virtio/vhost-vsock-pci.c new file mode 100644 index 00000000..9f34414d --- /dev/null +++ b/hw/virtio/vhost-vsock-pci.c @@ -0,0 +1,96 @@ +/* + * Vhost vsock PCI Bindings + * + * Copyright 2015 Red Hat, Inc. + * + * Authors: + * Stefan Hajnoczi <stefanha@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. See the COPYING file in the + * top-level directory. + */ + +#include "qemu/osdep.h" + +#include "hw/virtio/virtio-pci.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/vhost-vsock.h" +#include "qemu/module.h" +#include "qom/object.h" + +typedef struct VHostVSockPCI VHostVSockPCI; + +/* + * vhost-vsock-pci: This extends VirtioPCIProxy. + */ +#define TYPE_VHOST_VSOCK_PCI "vhost-vsock-pci-base" +DECLARE_INSTANCE_CHECKER(VHostVSockPCI, VHOST_VSOCK_PCI, + TYPE_VHOST_VSOCK_PCI) + +struct VHostVSockPCI { + VirtIOPCIProxy parent_obj; + VHostVSock vdev; +}; + +/* vhost-vsock-pci */ + +static Property vhost_vsock_pci_properties[] = { + DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 3), + DEFINE_PROP_END_OF_LIST(), +}; + +static void vhost_vsock_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VHostVSockPCI *dev = VHOST_VSOCK_PCI(vpci_dev); + DeviceState *vdev = DEVICE(&dev->vdev); + VirtIODevice *virtio_dev = VIRTIO_DEVICE(vdev); + + /* + * To avoid migration issues, we force virtio version 1 only when + * legacy check is enabled in the new machine types (>= 5.1). + */ + if (!virtio_legacy_check_disabled(virtio_dev)) { + virtio_pci_force_virtio_1(vpci_dev); + } + + qdev_realize(vdev, BUS(&vpci_dev->bus), errp); +} + +static void vhost_vsock_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + k->realize = vhost_vsock_pci_realize; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + device_class_set_props(dc, vhost_vsock_pci_properties); + pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; + pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_VSOCK; + pcidev_k->revision = 0x00; + pcidev_k->class_id = PCI_CLASS_COMMUNICATION_OTHER; +} + +static void vhost_vsock_pci_instance_init(Object *obj) +{ + VHostVSockPCI *dev = VHOST_VSOCK_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VHOST_VSOCK); +} + +static const VirtioPCIDeviceTypeInfo vhost_vsock_pci_info = { + .base_name = TYPE_VHOST_VSOCK_PCI, + .generic_name = "vhost-vsock-pci", + .non_transitional_name = "vhost-vsock-pci-non-transitional", + .instance_size = sizeof(VHostVSockPCI), + .instance_init = vhost_vsock_pci_instance_init, + .class_init = vhost_vsock_pci_class_init, +}; + +static void virtio_pci_vhost_register(void) +{ + virtio_pci_types_register(&vhost_vsock_pci_info); +} + +type_init(virtio_pci_vhost_register) diff --git a/hw/virtio/vhost-vsock.c b/hw/virtio/vhost-vsock.c new file mode 100644 index 00000000..aa16d584 --- /dev/null +++ b/hw/virtio/vhost-vsock.c @@ -0,0 +1,239 @@ +/* + * Virtio vsock device + * + * Copyright 2015 Red Hat, Inc. + * + * Authors: + * Stefan Hajnoczi <stefanha@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. See the COPYING file in the + * top-level directory. + */ + +#include "qemu/osdep.h" +#include "standard-headers/linux/virtio_vsock.h" +#include "qapi/error.h" +#include "hw/virtio/virtio-access.h" +#include "qemu/error-report.h" +#include "qemu/sockets.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/vhost-vsock.h" +#include "monitor/monitor.h" + +static void vhost_vsock_get_config(VirtIODevice *vdev, uint8_t *config) +{ + VHostVSock *vsock = VHOST_VSOCK(vdev); + struct virtio_vsock_config vsockcfg = {}; + + virtio_stq_p(vdev, &vsockcfg.guest_cid, vsock->conf.guest_cid); + memcpy(config, &vsockcfg, sizeof(vsockcfg)); +} + +static int vhost_vsock_set_guest_cid(VirtIODevice *vdev) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + VHostVSock *vsock = VHOST_VSOCK(vdev); + const VhostOps *vhost_ops = vvc->vhost_dev.vhost_ops; + int ret; + + if (!vhost_ops->vhost_vsock_set_guest_cid) { + return -ENOSYS; + } + + ret = vhost_ops->vhost_vsock_set_guest_cid(&vvc->vhost_dev, + vsock->conf.guest_cid); + if (ret < 0) { + return -errno; + } + return 0; +} + +static int vhost_vsock_set_running(VirtIODevice *vdev, int start) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + const VhostOps *vhost_ops = vvc->vhost_dev.vhost_ops; + int ret; + + if (!vhost_ops->vhost_vsock_set_running) { + return -ENOSYS; + } + + ret = vhost_ops->vhost_vsock_set_running(&vvc->vhost_dev, start); + if (ret < 0) { + return -errno; + } + return 0; +} + + +static void vhost_vsock_set_status(VirtIODevice *vdev, uint8_t status) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(vdev); + bool should_start = virtio_device_should_start(vdev, status); + int ret; + + if (vhost_dev_is_started(&vvc->vhost_dev) == should_start) { + return; + } + + if (should_start) { + ret = vhost_vsock_common_start(vdev); + if (ret < 0) { + return; + } + + ret = vhost_vsock_set_running(vdev, 1); + if (ret < 0) { + vhost_vsock_common_stop(vdev); + error_report("Error starting vhost vsock: %d", -ret); + return; + } + } else { + ret = vhost_vsock_set_running(vdev, 0); + if (ret < 0) { + error_report("vhost vsock set running failed: %d", ret); + return; + } + + vhost_vsock_common_stop(vdev); + } +} + +static uint64_t vhost_vsock_get_features(VirtIODevice *vdev, + uint64_t requested_features, + Error **errp) +{ + return vhost_vsock_common_get_features(vdev, requested_features, errp); +} + +static const VMStateDescription vmstate_virtio_vhost_vsock = { + .name = "virtio-vhost_vsock", + .minimum_version_id = VHOST_VSOCK_SAVEVM_VERSION, + .version_id = VHOST_VSOCK_SAVEVM_VERSION, + .fields = (VMStateField[]) { + VMSTATE_VIRTIO_DEVICE, + VMSTATE_END_OF_LIST() + }, + .pre_save = vhost_vsock_common_pre_save, + .post_load = vhost_vsock_common_post_load, +}; + +static void vhost_vsock_device_realize(DeviceState *dev, Error **errp) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(dev); + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostVSock *vsock = VHOST_VSOCK(dev); + int vhostfd; + int ret; + + /* Refuse to use reserved CID numbers */ + if (vsock->conf.guest_cid <= 2) { + error_setg(errp, "guest-cid property must be greater than 2"); + return; + } + + if (vsock->conf.guest_cid > UINT32_MAX) { + error_setg(errp, "guest-cid property must be a 32-bit number"); + return; + } + + if (vsock->conf.vhostfd) { + vhostfd = monitor_fd_param(monitor_cur(), vsock->conf.vhostfd, errp); + if (vhostfd == -1) { + error_prepend(errp, "vhost-vsock: unable to parse vhostfd: "); + return; + } + + if (!g_unix_set_fd_nonblocking(vhostfd, true, NULL)) { + error_setg_errno(errp, errno, + "vhost-vsock: unable to set non-blocking mode"); + return; + } + } else { + vhostfd = open("/dev/vhost-vsock", O_RDWR); + if (vhostfd < 0) { + error_setg_errno(errp, errno, + "vhost-vsock: failed to open vhost device"); + return; + } + + if (!g_unix_set_fd_nonblocking(vhostfd, true, NULL)) { + error_setg_errno(errp, errno, + "Failed to set FD nonblocking"); + return; + } + } + + vhost_vsock_common_realize(vdev); + + ret = vhost_dev_init(&vvc->vhost_dev, (void *)(uintptr_t)vhostfd, + VHOST_BACKEND_TYPE_KERNEL, 0, errp); + if (ret < 0) { + /* + * vhostfd is closed by vhost_dev_cleanup, which is called + * by vhost_dev_init on initialization error. + */ + goto err_virtio; + } + + ret = vhost_vsock_set_guest_cid(vdev); + if (ret < 0) { + error_setg_errno(errp, -ret, "vhost-vsock: unable to set guest cid"); + goto err_vhost_dev; + } + + return; + +err_vhost_dev: + /* vhost_dev_cleanup() closes the vhostfd passed to vhost_dev_init() */ + vhost_dev_cleanup(&vvc->vhost_dev); +err_virtio: + vhost_vsock_common_unrealize(vdev); +} + +static void vhost_vsock_device_unrealize(DeviceState *dev) +{ + VHostVSockCommon *vvc = VHOST_VSOCK_COMMON(dev); + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + + /* This will stop vhost backend if appropriate. */ + vhost_vsock_set_status(vdev, 0); + + vhost_dev_cleanup(&vvc->vhost_dev); + vhost_vsock_common_unrealize(vdev); +} + +static Property vhost_vsock_properties[] = { + DEFINE_PROP_UINT64("guest-cid", VHostVSock, conf.guest_cid, 0), + DEFINE_PROP_STRING("vhostfd", VHostVSock, conf.vhostfd), + DEFINE_PROP_END_OF_LIST(), +}; + +static void vhost_vsock_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + + device_class_set_props(dc, vhost_vsock_properties); + dc->vmsd = &vmstate_virtio_vhost_vsock; + vdc->realize = vhost_vsock_device_realize; + vdc->unrealize = vhost_vsock_device_unrealize; + vdc->get_features = vhost_vsock_get_features; + vdc->get_config = vhost_vsock_get_config; + vdc->set_status = vhost_vsock_set_status; +} + +static const TypeInfo vhost_vsock_info = { + .name = TYPE_VHOST_VSOCK, + .parent = TYPE_VHOST_VSOCK_COMMON, + .instance_size = sizeof(VHostVSock), + .class_init = vhost_vsock_class_init, +}; + +static void vhost_vsock_register_types(void) +{ + type_register_static(&vhost_vsock_info); +} + +type_init(vhost_vsock_register_types) diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c new file mode 100644 index 00000000..7fb008bc --- /dev/null +++ b/hw/virtio/vhost.c @@ -0,0 +1,1942 @@ +/* + * vhost support + * + * Copyright Red Hat, Inc. 2010 + * + * Authors: + * Michael S. Tsirkin <mst@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "hw/virtio/vhost.h" +#include "qemu/atomic.h" +#include "qemu/range.h" +#include "qemu/error-report.h" +#include "qemu/memfd.h" +#include "standard-headers/linux/vhost_types.h" +#include "hw/virtio/virtio-bus.h" +#include "hw/virtio/virtio-access.h" +#include "migration/blocker.h" +#include "migration/qemu-file-types.h" +#include "sysemu/dma.h" +#include "trace.h" + +/* enabled until disconnected backend stabilizes */ +#define _VHOST_DEBUG 1 + +#ifdef _VHOST_DEBUG +#define VHOST_OPS_DEBUG(retval, fmt, ...) \ + do { \ + error_report(fmt ": %s (%d)", ## __VA_ARGS__, \ + strerror(-retval), -retval); \ + } while (0) +#else +#define VHOST_OPS_DEBUG(retval, fmt, ...) \ + do { } while (0) +#endif + +static struct vhost_log *vhost_log; +static struct vhost_log *vhost_log_shm; + +static unsigned int used_memslots; +static QLIST_HEAD(, vhost_dev) vhost_devices = + QLIST_HEAD_INITIALIZER(vhost_devices); + +bool vhost_has_free_slot(void) +{ + unsigned int slots_limit = ~0U; + struct vhost_dev *hdev; + + QLIST_FOREACH(hdev, &vhost_devices, entry) { + unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev); + slots_limit = MIN(slots_limit, r); + } + return slots_limit > used_memslots; +} + +static void vhost_dev_sync_region(struct vhost_dev *dev, + MemoryRegionSection *section, + uint64_t mfirst, uint64_t mlast, + uint64_t rfirst, uint64_t rlast) +{ + vhost_log_chunk_t *log = dev->log->log; + + uint64_t start = MAX(mfirst, rfirst); + uint64_t end = MIN(mlast, rlast); + vhost_log_chunk_t *from = log + start / VHOST_LOG_CHUNK; + vhost_log_chunk_t *to = log + end / VHOST_LOG_CHUNK + 1; + uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK); + + if (end < start) { + return; + } + assert(end / VHOST_LOG_CHUNK < dev->log_size); + assert(start / VHOST_LOG_CHUNK < dev->log_size); + + for (;from < to; ++from) { + vhost_log_chunk_t log; + /* We first check with non-atomic: much cheaper, + * and we expect non-dirty to be the common case. */ + if (!*from) { + addr += VHOST_LOG_CHUNK; + continue; + } + /* Data must be read atomically. We don't really need barrier semantics + * but it's easier to use atomic_* than roll our own. */ + log = qatomic_xchg(from, 0); + while (log) { + int bit = ctzl(log); + hwaddr page_addr; + hwaddr section_offset; + hwaddr mr_offset; + page_addr = addr + bit * VHOST_LOG_PAGE; + section_offset = page_addr - section->offset_within_address_space; + mr_offset = section_offset + section->offset_within_region; + memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE); + log &= ~(0x1ull << bit); + } + addr += VHOST_LOG_CHUNK; + } +} + +static int vhost_sync_dirty_bitmap(struct vhost_dev *dev, + MemoryRegionSection *section, + hwaddr first, + hwaddr last) +{ + int i; + hwaddr start_addr; + hwaddr end_addr; + + if (!dev->log_enabled || !dev->started) { + return 0; + } + start_addr = section->offset_within_address_space; + end_addr = range_get_last(start_addr, int128_get64(section->size)); + start_addr = MAX(first, start_addr); + end_addr = MIN(last, end_addr); + + for (i = 0; i < dev->mem->nregions; ++i) { + struct vhost_memory_region *reg = dev->mem->regions + i; + vhost_dev_sync_region(dev, section, start_addr, end_addr, + reg->guest_phys_addr, + range_get_last(reg->guest_phys_addr, + reg->memory_size)); + } + for (i = 0; i < dev->nvqs; ++i) { + struct vhost_virtqueue *vq = dev->vqs + i; + + if (!vq->used_phys && !vq->used_size) { + continue; + } + + vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys, + range_get_last(vq->used_phys, vq->used_size)); + } + return 0; +} + +static void vhost_log_sync(MemoryListener *listener, + MemoryRegionSection *section) +{ + struct vhost_dev *dev = container_of(listener, struct vhost_dev, + memory_listener); + vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL); +} + +static void vhost_log_sync_range(struct vhost_dev *dev, + hwaddr first, hwaddr last) +{ + int i; + /* FIXME: this is N^2 in number of sections */ + for (i = 0; i < dev->n_mem_sections; ++i) { + MemoryRegionSection *section = &dev->mem_sections[i]; + vhost_sync_dirty_bitmap(dev, section, first, last); + } +} + +static uint64_t vhost_get_log_size(struct vhost_dev *dev) +{ + uint64_t log_size = 0; + int i; + for (i = 0; i < dev->mem->nregions; ++i) { + struct vhost_memory_region *reg = dev->mem->regions + i; + uint64_t last = range_get_last(reg->guest_phys_addr, + reg->memory_size); + log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1); + } + return log_size; +} + +static int vhost_set_backend_type(struct vhost_dev *dev, + VhostBackendType backend_type) +{ + int r = 0; + + switch (backend_type) { +#ifdef CONFIG_VHOST_KERNEL + case VHOST_BACKEND_TYPE_KERNEL: + dev->vhost_ops = &kernel_ops; + break; +#endif +#ifdef CONFIG_VHOST_USER + case VHOST_BACKEND_TYPE_USER: + dev->vhost_ops = &user_ops; + break; +#endif +#ifdef CONFIG_VHOST_VDPA + case VHOST_BACKEND_TYPE_VDPA: + dev->vhost_ops = &vdpa_ops; + break; +#endif + default: + error_report("Unknown vhost backend type"); + r = -1; + } + + return r; +} + +static struct vhost_log *vhost_log_alloc(uint64_t size, bool share) +{ + Error *err = NULL; + struct vhost_log *log; + uint64_t logsize = size * sizeof(*(log->log)); + int fd = -1; + + log = g_new0(struct vhost_log, 1); + if (share) { + log->log = qemu_memfd_alloc("vhost-log", logsize, + F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL, + &fd, &err); + if (err) { + error_report_err(err); + g_free(log); + return NULL; + } + memset(log->log, 0, logsize); + } else { + log->log = g_malloc0(logsize); + } + + log->size = size; + log->refcnt = 1; + log->fd = fd; + + return log; +} + +static struct vhost_log *vhost_log_get(uint64_t size, bool share) +{ + struct vhost_log *log = share ? vhost_log_shm : vhost_log; + + if (!log || log->size != size) { + log = vhost_log_alloc(size, share); + if (share) { + vhost_log_shm = log; + } else { + vhost_log = log; + } + } else { + ++log->refcnt; + } + + return log; +} + +static void vhost_log_put(struct vhost_dev *dev, bool sync) +{ + struct vhost_log *log = dev->log; + + if (!log) { + return; + } + + --log->refcnt; + if (log->refcnt == 0) { + /* Sync only the range covered by the old log */ + if (dev->log_size && sync) { + vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1); + } + + if (vhost_log == log) { + g_free(log->log); + vhost_log = NULL; + } else if (vhost_log_shm == log) { + qemu_memfd_free(log->log, log->size * sizeof(*(log->log)), + log->fd); + vhost_log_shm = NULL; + } + + g_free(log); + } + + dev->log = NULL; + dev->log_size = 0; +} + +static bool vhost_dev_log_is_shared(struct vhost_dev *dev) +{ + return dev->vhost_ops->vhost_requires_shm_log && + dev->vhost_ops->vhost_requires_shm_log(dev); +} + +static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size) +{ + struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev)); + uint64_t log_base = (uintptr_t)log->log; + int r; + + /* inform backend of log switching, this must be done before + releasing the current log, to ensure no logging is lost */ + r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log); + if (r < 0) { + VHOST_OPS_DEBUG(r, "vhost_set_log_base failed"); + } + + vhost_log_put(dev, true); + dev->log = log; + dev->log_size = size; +} + +static bool vhost_dev_has_iommu(struct vhost_dev *dev) +{ + VirtIODevice *vdev = dev->vdev; + + /* + * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support + * incremental memory mapping API via IOTLB API. For platform that + * does not have IOMMU, there's no need to enable this feature + * which may cause unnecessary IOTLB miss/update transactions. + */ + if (vdev) { + return virtio_bus_device_iommu_enabled(vdev) && + virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); + } else { + return false; + } +} + +static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr, + hwaddr *plen, bool is_write) +{ + if (!vhost_dev_has_iommu(dev)) { + return cpu_physical_memory_map(addr, plen, is_write); + } else { + return (void *)(uintptr_t)addr; + } +} + +static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer, + hwaddr len, int is_write, + hwaddr access_len) +{ + if (!vhost_dev_has_iommu(dev)) { + cpu_physical_memory_unmap(buffer, len, is_write, access_len); + } +} + +static int vhost_verify_ring_part_mapping(void *ring_hva, + uint64_t ring_gpa, + uint64_t ring_size, + void *reg_hva, + uint64_t reg_gpa, + uint64_t reg_size) +{ + uint64_t hva_ring_offset; + uint64_t ring_last = range_get_last(ring_gpa, ring_size); + uint64_t reg_last = range_get_last(reg_gpa, reg_size); + + if (ring_last < reg_gpa || ring_gpa > reg_last) { + return 0; + } + /* check that whole ring's is mapped */ + if (ring_last > reg_last) { + return -ENOMEM; + } + /* check that ring's MemoryRegion wasn't replaced */ + hva_ring_offset = ring_gpa - reg_gpa; + if (ring_hva != reg_hva + hva_ring_offset) { + return -EBUSY; + } + + return 0; +} + +static int vhost_verify_ring_mappings(struct vhost_dev *dev, + void *reg_hva, + uint64_t reg_gpa, + uint64_t reg_size) +{ + int i, j; + int r = 0; + const char *part_name[] = { + "descriptor table", + "available ring", + "used ring" + }; + + if (vhost_dev_has_iommu(dev)) { + return 0; + } + + for (i = 0; i < dev->nvqs; ++i) { + struct vhost_virtqueue *vq = dev->vqs + i; + + if (vq->desc_phys == 0) { + continue; + } + + j = 0; + r = vhost_verify_ring_part_mapping( + vq->desc, vq->desc_phys, vq->desc_size, + reg_hva, reg_gpa, reg_size); + if (r) { + break; + } + + j++; + r = vhost_verify_ring_part_mapping( + vq->avail, vq->avail_phys, vq->avail_size, + reg_hva, reg_gpa, reg_size); + if (r) { + break; + } + + j++; + r = vhost_verify_ring_part_mapping( + vq->used, vq->used_phys, vq->used_size, + reg_hva, reg_gpa, reg_size); + if (r) { + break; + } + } + + if (r == -ENOMEM) { + error_report("Unable to map %s for ring %d", part_name[j], i); + } else if (r == -EBUSY) { + error_report("%s relocated for ring %d", part_name[j], i); + } + return r; +} + +/* + * vhost_section: identify sections needed for vhost access + * + * We only care about RAM sections here (where virtqueue and guest + * internals accessed by virtio might live). If we find one we still + * allow the backend to potentially filter it out of our list. + */ +static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section) +{ + MemoryRegion *mr = section->mr; + + if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) { + uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr); + uint8_t handled_dirty; + + /* + * Kernel based vhost doesn't handle any block which is doing + * dirty-tracking other than migration for which it has + * specific logging support. However for TCG the kernel never + * gets involved anyway so we can also ignore it's + * self-modiying code detection flags. However a vhost-user + * client could still confuse a TCG guest if it re-writes + * executable memory that has already been translated. + */ + handled_dirty = (1 << DIRTY_MEMORY_MIGRATION) | + (1 << DIRTY_MEMORY_CODE); + + if (dirty_mask & ~handled_dirty) { + trace_vhost_reject_section(mr->name, 1); + return false; + } + + if (dev->vhost_ops->vhost_backend_mem_section_filter && + !dev->vhost_ops->vhost_backend_mem_section_filter(dev, section)) { + trace_vhost_reject_section(mr->name, 2); + return false; + } + + trace_vhost_section(mr->name); + return true; + } else { + trace_vhost_reject_section(mr->name, 3); + return false; + } +} + +static void vhost_begin(MemoryListener *listener) +{ + struct vhost_dev *dev = container_of(listener, struct vhost_dev, + memory_listener); + dev->tmp_sections = NULL; + dev->n_tmp_sections = 0; +} + +static void vhost_commit(MemoryListener *listener) +{ + struct vhost_dev *dev = container_of(listener, struct vhost_dev, + memory_listener); + MemoryRegionSection *old_sections; + int n_old_sections; + uint64_t log_size; + size_t regions_size; + int r; + int i; + bool changed = false; + + /* Note we can be called before the device is started, but then + * starting the device calls set_mem_table, so we need to have + * built the data structures. + */ + old_sections = dev->mem_sections; + n_old_sections = dev->n_mem_sections; + dev->mem_sections = dev->tmp_sections; + dev->n_mem_sections = dev->n_tmp_sections; + + if (dev->n_mem_sections != n_old_sections) { + changed = true; + } else { + /* Same size, lets check the contents */ + for (int i = 0; i < n_old_sections; i++) { + if (!MemoryRegionSection_eq(&old_sections[i], + &dev->mem_sections[i])) { + changed = true; + break; + } + } + } + + trace_vhost_commit(dev->started, changed); + if (!changed) { + goto out; + } + + /* Rebuild the regions list from the new sections list */ + regions_size = offsetof(struct vhost_memory, regions) + + dev->n_mem_sections * sizeof dev->mem->regions[0]; + dev->mem = g_realloc(dev->mem, regions_size); + dev->mem->nregions = dev->n_mem_sections; + used_memslots = dev->mem->nregions; + for (i = 0; i < dev->n_mem_sections; i++) { + struct vhost_memory_region *cur_vmr = dev->mem->regions + i; + struct MemoryRegionSection *mrs = dev->mem_sections + i; + + cur_vmr->guest_phys_addr = mrs->offset_within_address_space; + cur_vmr->memory_size = int128_get64(mrs->size); + cur_vmr->userspace_addr = + (uintptr_t)memory_region_get_ram_ptr(mrs->mr) + + mrs->offset_within_region; + cur_vmr->flags_padding = 0; + } + + if (!dev->started) { + goto out; + } + + for (i = 0; i < dev->mem->nregions; i++) { + if (vhost_verify_ring_mappings(dev, + (void *)(uintptr_t)dev->mem->regions[i].userspace_addr, + dev->mem->regions[i].guest_phys_addr, + dev->mem->regions[i].memory_size)) { + error_report("Verify ring failure on region %d", i); + abort(); + } + } + + if (!dev->log_enabled) { + r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem); + if (r < 0) { + VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed"); + } + goto out; + } + log_size = vhost_get_log_size(dev); + /* We allocate an extra 4K bytes to log, + * to reduce the * number of reallocations. */ +#define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log) + /* To log more, must increase log size before table update. */ + if (dev->log_size < log_size) { + vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER); + } + r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem); + if (r < 0) { + VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed"); + } + /* To log less, can only decrease log size after table update. */ + if (dev->log_size > log_size + VHOST_LOG_BUFFER) { + vhost_dev_log_resize(dev, log_size); + } + +out: + /* Deref the old list of sections, this must happen _after_ the + * vhost_set_mem_table to ensure the client isn't still using the + * section we're about to unref. + */ + while (n_old_sections--) { + memory_region_unref(old_sections[n_old_sections].mr); + } + g_free(old_sections); + return; +} + +/* Adds the section data to the tmp_section structure. + * It relies on the listener calling us in memory address order + * and for each region (via the _add and _nop methods) to + * join neighbours. + */ +static void vhost_region_add_section(struct vhost_dev *dev, + MemoryRegionSection *section) +{ + bool need_add = true; + uint64_t mrs_size = int128_get64(section->size); + uint64_t mrs_gpa = section->offset_within_address_space; + uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) + + section->offset_within_region; + RAMBlock *mrs_rb = section->mr->ram_block; + + trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size, + mrs_host); + + if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) { + /* Round the section to it's page size */ + /* First align the start down to a page boundary */ + size_t mrs_page = qemu_ram_pagesize(mrs_rb); + uint64_t alignage = mrs_host & (mrs_page - 1); + if (alignage) { + mrs_host -= alignage; + mrs_size += alignage; + mrs_gpa -= alignage; + } + /* Now align the size up to a page boundary */ + alignage = mrs_size & (mrs_page - 1); + if (alignage) { + mrs_size += mrs_page - alignage; + } + trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa, + mrs_size, mrs_host); + } + + if (dev->n_tmp_sections) { + /* Since we already have at least one section, lets see if + * this extends it; since we're scanning in order, we only + * have to look at the last one, and the FlatView that calls + * us shouldn't have overlaps. + */ + MemoryRegionSection *prev_sec = dev->tmp_sections + + (dev->n_tmp_sections - 1); + uint64_t prev_gpa_start = prev_sec->offset_within_address_space; + uint64_t prev_size = int128_get64(prev_sec->size); + uint64_t prev_gpa_end = range_get_last(prev_gpa_start, prev_size); + uint64_t prev_host_start = + (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) + + prev_sec->offset_within_region; + uint64_t prev_host_end = range_get_last(prev_host_start, prev_size); + + if (mrs_gpa <= (prev_gpa_end + 1)) { + /* OK, looks like overlapping/intersecting - it's possible that + * the rounding to page sizes has made them overlap, but they should + * match up in the same RAMBlock if they do. + */ + if (mrs_gpa < prev_gpa_start) { + error_report("%s:Section '%s' rounded to %"PRIx64 + " prior to previous '%s' %"PRIx64, + __func__, section->mr->name, mrs_gpa, + prev_sec->mr->name, prev_gpa_start); + /* A way to cleanly fail here would be better */ + return; + } + /* Offset from the start of the previous GPA to this GPA */ + size_t offset = mrs_gpa - prev_gpa_start; + + if (prev_host_start + offset == mrs_host && + section->mr == prev_sec->mr && + (!dev->vhost_ops->vhost_backend_can_merge || + dev->vhost_ops->vhost_backend_can_merge(dev, + mrs_host, mrs_size, + prev_host_start, prev_size))) { + uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size); + need_add = false; + prev_sec->offset_within_address_space = + MIN(prev_gpa_start, mrs_gpa); + prev_sec->offset_within_region = + MIN(prev_host_start, mrs_host) - + (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr); + prev_sec->size = int128_make64(max_end - MIN(prev_host_start, + mrs_host)); + trace_vhost_region_add_section_merge(section->mr->name, + int128_get64(prev_sec->size), + prev_sec->offset_within_address_space, + prev_sec->offset_within_region); + } else { + /* adjoining regions are fine, but overlapping ones with + * different blocks/offsets shouldn't happen + */ + if (mrs_gpa != prev_gpa_end + 1) { + error_report("%s: Overlapping but not coherent sections " + "at %"PRIx64, + __func__, mrs_gpa); + return; + } + } + } + } + + if (need_add) { + ++dev->n_tmp_sections; + dev->tmp_sections = g_renew(MemoryRegionSection, dev->tmp_sections, + dev->n_tmp_sections); + dev->tmp_sections[dev->n_tmp_sections - 1] = *section; + /* The flatview isn't stable and we don't use it, making it NULL + * means we can memcmp the list. + */ + dev->tmp_sections[dev->n_tmp_sections - 1].fv = NULL; + memory_region_ref(section->mr); + } +} + +/* Used for both add and nop callbacks */ +static void vhost_region_addnop(MemoryListener *listener, + MemoryRegionSection *section) +{ + struct vhost_dev *dev = container_of(listener, struct vhost_dev, + memory_listener); + + if (!vhost_section(dev, section)) { + return; + } + vhost_region_add_section(dev, section); +} + +static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) +{ + struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n); + struct vhost_dev *hdev = iommu->hdev; + hwaddr iova = iotlb->iova + iommu->iommu_offset; + + if (vhost_backend_invalidate_device_iotlb(hdev, iova, + iotlb->addr_mask + 1)) { + error_report("Fail to invalidate device iotlb"); + } +} + +static void vhost_iommu_region_add(MemoryListener *listener, + MemoryRegionSection *section) +{ + struct vhost_dev *dev = container_of(listener, struct vhost_dev, + iommu_listener); + struct vhost_iommu *iommu; + Int128 end; + int iommu_idx; + IOMMUMemoryRegion *iommu_mr; + int ret; + + if (!memory_region_is_iommu(section->mr)) { + return; + } + + iommu_mr = IOMMU_MEMORY_REGION(section->mr); + + iommu = g_malloc0(sizeof(*iommu)); + end = int128_add(int128_make64(section->offset_within_region), + section->size); + end = int128_sub(end, int128_one()); + iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr, + MEMTXATTRS_UNSPECIFIED); + iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify, + IOMMU_NOTIFIER_DEVIOTLB_UNMAP, + section->offset_within_region, + int128_get64(end), + iommu_idx); + iommu->mr = section->mr; + iommu->iommu_offset = section->offset_within_address_space - + section->offset_within_region; + iommu->hdev = dev; + ret = memory_region_register_iommu_notifier(section->mr, &iommu->n, NULL); + if (ret) { + /* + * Some vIOMMUs do not support dev-iotlb yet. If so, try to use the + * UNMAP legacy message + */ + iommu->n.notifier_flags = IOMMU_NOTIFIER_UNMAP; + memory_region_register_iommu_notifier(section->mr, &iommu->n, + &error_fatal); + } + QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next); + /* TODO: can replay help performance here? */ +} + +static void vhost_iommu_region_del(MemoryListener *listener, + MemoryRegionSection *section) +{ + struct vhost_dev *dev = container_of(listener, struct vhost_dev, + iommu_listener); + struct vhost_iommu *iommu; + + if (!memory_region_is_iommu(section->mr)) { + return; + } + + QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) { + if (iommu->mr == section->mr && + iommu->n.start == section->offset_within_region) { + memory_region_unregister_iommu_notifier(iommu->mr, + &iommu->n); + QLIST_REMOVE(iommu, iommu_next); + g_free(iommu); + break; + } + } +} + +static int vhost_virtqueue_set_addr(struct vhost_dev *dev, + struct vhost_virtqueue *vq, + unsigned idx, bool enable_log) +{ + struct vhost_vring_addr addr; + int r; + memset(&addr, 0, sizeof(struct vhost_vring_addr)); + + if (dev->vhost_ops->vhost_vq_get_addr) { + r = dev->vhost_ops->vhost_vq_get_addr(dev, &addr, vq); + if (r < 0) { + VHOST_OPS_DEBUG(r, "vhost_vq_get_addr failed"); + return r; + } + } else { + addr.desc_user_addr = (uint64_t)(unsigned long)vq->desc; + addr.avail_user_addr = (uint64_t)(unsigned long)vq->avail; + addr.used_user_addr = (uint64_t)(unsigned long)vq->used; + } + addr.index = idx; + addr.log_guest_addr = vq->used_phys; + addr.flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0; + r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr); + if (r < 0) { + VHOST_OPS_DEBUG(r, "vhost_set_vring_addr failed"); + } + return r; +} + +static int vhost_dev_set_features(struct vhost_dev *dev, + bool enable_log) +{ + uint64_t features = dev->acked_features; + int r; + if (enable_log) { + features |= 0x1ULL << VHOST_F_LOG_ALL; + } + if (!vhost_dev_has_iommu(dev)) { + features &= ~(0x1ULL << VIRTIO_F_IOMMU_PLATFORM); + } + if (dev->vhost_ops->vhost_force_iommu) { + if (dev->vhost_ops->vhost_force_iommu(dev) == true) { + features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM; + } + } + r = dev->vhost_ops->vhost_set_features(dev, features); + if (r < 0) { + VHOST_OPS_DEBUG(r, "vhost_set_features failed"); + goto out; + } + if (dev->vhost_ops->vhost_set_backend_cap) { + r = dev->vhost_ops->vhost_set_backend_cap(dev); + if (r < 0) { + VHOST_OPS_DEBUG(r, "vhost_set_backend_cap failed"); + goto out; + } + } + +out: + return r; +} + +static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log) +{ + int r, i, idx; + hwaddr addr; + + r = vhost_dev_set_features(dev, enable_log); + if (r < 0) { + goto err_features; + } + for (i = 0; i < dev->nvqs; ++i) { + idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i); + addr = virtio_queue_get_desc_addr(dev->vdev, idx); + if (!addr) { + /* + * The queue might not be ready for start. If this + * is the case there is no reason to continue the process. + * The similar logic is used by the vhost_virtqueue_start() + * routine. + */ + continue; + } + r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx, + enable_log); + if (r < 0) { + goto err_vq; + } + } + return 0; +err_vq: + for (; i >= 0; --i) { + idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i); + addr = virtio_queue_get_desc_addr(dev->vdev, idx); + if (!addr) { + continue; + } + vhost_virtqueue_set_addr(dev, dev->vqs + i, idx, + dev->log_enabled); + } + vhost_dev_set_features(dev, dev->log_enabled); +err_features: + return r; +} + +static int vhost_migration_log(MemoryListener *listener, bool enable) +{ + struct vhost_dev *dev = container_of(listener, struct vhost_dev, + memory_listener); + int r; + if (enable == dev->log_enabled) { + return 0; + } + if (!dev->started) { + dev->log_enabled = enable; + return 0; + } + + r = 0; + if (!enable) { + r = vhost_dev_set_log(dev, false); + if (r < 0) { + goto check_dev_state; + } + vhost_log_put(dev, false); + } else { + vhost_dev_log_resize(dev, vhost_get_log_size(dev)); + r = vhost_dev_set_log(dev, true); + if (r < 0) { + goto check_dev_state; + } + } + +check_dev_state: + dev->log_enabled = enable; + /* + * vhost-user-* devices could change their state during log + * initialization due to disconnect. So check dev state after + * vhost communication. + */ + if (!dev->started) { + /* + * Since device is in the stopped state, it is okay for + * migration. Return success. + */ + r = 0; + } + if (r) { + /* An error occurred. */ + dev->log_enabled = false; + } + + return r; +} + +static void vhost_log_global_start(MemoryListener *listener) +{ + int r; + + r = vhost_migration_log(listener, true); + if (r < 0) { + abort(); + } +} + +static void vhost_log_global_stop(MemoryListener *listener) +{ + int r; + + r = vhost_migration_log(listener, false); + if (r < 0) { + abort(); + } +} + +static void vhost_log_start(MemoryListener *listener, + MemoryRegionSection *section, + int old, int new) +{ + /* FIXME: implement */ +} + +static void vhost_log_stop(MemoryListener *listener, + MemoryRegionSection *section, + int old, int new) +{ + /* FIXME: implement */ +} + +/* The vhost driver natively knows how to handle the vrings of non + * cross-endian legacy devices and modern devices. Only legacy devices + * exposed to a bi-endian guest may require the vhost driver to use a + * specific endianness. + */ +static inline bool vhost_needs_vring_endian(VirtIODevice *vdev) +{ + if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { + return false; + } +#if HOST_BIG_ENDIAN + return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE; +#else + return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG; +#endif +} + +static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev, + bool is_big_endian, + int vhost_vq_index) +{ + int r; + struct vhost_vring_state s = { + .index = vhost_vq_index, + .num = is_big_endian + }; + + r = dev->vhost_ops->vhost_set_vring_endian(dev, &s); + if (r < 0) { + VHOST_OPS_DEBUG(r, "vhost_set_vring_endian failed"); + } + return r; +} + +static int vhost_memory_region_lookup(struct vhost_dev *hdev, + uint64_t gpa, uint64_t *uaddr, + uint64_t *len) +{ + int i; + + for (i = 0; i < hdev->mem->nregions; i++) { + struct vhost_memory_region *reg = hdev->mem->regions + i; + + if (gpa >= reg->guest_phys_addr && + reg->guest_phys_addr + reg->memory_size > gpa) { + *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr; + *len = reg->guest_phys_addr + reg->memory_size - gpa; + return 0; + } + } + + return -EFAULT; +} + +int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write) +{ + IOMMUTLBEntry iotlb; + uint64_t uaddr, len; + int ret = -EFAULT; + + RCU_READ_LOCK_GUARD(); + + trace_vhost_iotlb_miss(dev, 1); + + iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as, + iova, write, + MEMTXATTRS_UNSPECIFIED); + if (iotlb.target_as != NULL) { + ret = vhost_memory_region_lookup(dev, iotlb.translated_addr, + &uaddr, &len); + if (ret) { + trace_vhost_iotlb_miss(dev, 3); + error_report("Fail to lookup the translated address " + "%"PRIx64, iotlb.translated_addr); + goto out; + } + + len = MIN(iotlb.addr_mask + 1, len); + iova = iova & ~iotlb.addr_mask; + + ret = vhost_backend_update_device_iotlb(dev, iova, uaddr, + len, iotlb.perm); + if (ret) { + trace_vhost_iotlb_miss(dev, 4); + error_report("Fail to update device iotlb"); + goto out; + } + } + + trace_vhost_iotlb_miss(dev, 2); + +out: + return ret; +} + +int vhost_virtqueue_start(struct vhost_dev *dev, + struct VirtIODevice *vdev, + struct vhost_virtqueue *vq, + unsigned idx) +{ + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); + VirtioBusState *vbus = VIRTIO_BUS(qbus); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus); + hwaddr s, l, a; + int r; + int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx); + struct vhost_vring_file file = { + .index = vhost_vq_index + }; + struct vhost_vring_state state = { + .index = vhost_vq_index + }; + struct VirtQueue *vvq = virtio_get_queue(vdev, idx); + + a = virtio_queue_get_desc_addr(vdev, idx); + if (a == 0) { + /* Queue might not be ready for start */ + return 0; + } + + vq->num = state.num = virtio_queue_get_num(vdev, idx); + r = dev->vhost_ops->vhost_set_vring_num(dev, &state); + if (r) { + VHOST_OPS_DEBUG(r, "vhost_set_vring_num failed"); + return r; + } + + state.num = virtio_queue_get_last_avail_idx(vdev, idx); + r = dev->vhost_ops->vhost_set_vring_base(dev, &state); + if (r) { + VHOST_OPS_DEBUG(r, "vhost_set_vring_base failed"); + return r; + } + + if (vhost_needs_vring_endian(vdev)) { + r = vhost_virtqueue_set_vring_endian_legacy(dev, + virtio_is_big_endian(vdev), + vhost_vq_index); + if (r) { + return r; + } + } + + vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx); + vq->desc_phys = a; + vq->desc = vhost_memory_map(dev, a, &l, false); + if (!vq->desc || l != s) { + r = -ENOMEM; + goto fail_alloc_desc; + } + vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx); + vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx); + vq->avail = vhost_memory_map(dev, a, &l, false); + if (!vq->avail || l != s) { + r = -ENOMEM; + goto fail_alloc_avail; + } + vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx); + vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx); + vq->used = vhost_memory_map(dev, a, &l, true); + if (!vq->used || l != s) { + r = -ENOMEM; + goto fail_alloc_used; + } + + r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled); + if (r < 0) { + goto fail_alloc; + } + + file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq)); + r = dev->vhost_ops->vhost_set_vring_kick(dev, &file); + if (r) { + VHOST_OPS_DEBUG(r, "vhost_set_vring_kick failed"); + goto fail_kick; + } + + /* Clear and discard previous events if any. */ + event_notifier_test_and_clear(&vq->masked_notifier); + + /* Init vring in unmasked state, unless guest_notifier_mask + * will do it later. + */ + if (!vdev->use_guest_notifier_mask) { + /* TODO: check and handle errors. */ + vhost_virtqueue_mask(dev, vdev, idx, false); + } + + if (k->query_guest_notifiers && + k->query_guest_notifiers(qbus->parent) && + virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) { + file.fd = -1; + r = dev->vhost_ops->vhost_set_vring_call(dev, &file); + if (r) { + goto fail_vector; + } + } + + return 0; + +fail_vector: +fail_kick: +fail_alloc: + vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx), + 0, 0); +fail_alloc_used: + vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx), + 0, 0); +fail_alloc_avail: + vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx), + 0, 0); +fail_alloc_desc: + return r; +} + +void vhost_virtqueue_stop(struct vhost_dev *dev, + struct VirtIODevice *vdev, + struct vhost_virtqueue *vq, + unsigned idx) +{ + int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx); + struct vhost_vring_state state = { + .index = vhost_vq_index, + }; + int r; + + if (virtio_queue_get_desc_addr(vdev, idx) == 0) { + /* Don't stop the virtqueue which might have not been started */ + return; + } + + r = dev->vhost_ops->vhost_get_vring_base(dev, &state); + if (r < 0) { + VHOST_OPS_DEBUG(r, "vhost VQ %u ring restore failed: %d", idx, r); + /* Connection to the backend is broken, so let's sync internal + * last avail idx to the device used idx. + */ + virtio_queue_restore_last_avail_idx(vdev, idx); + } else { + virtio_queue_set_last_avail_idx(vdev, idx, state.num); + } + virtio_queue_invalidate_signalled_used(vdev, idx); + virtio_queue_update_used_idx(vdev, idx); + + /* In the cross-endian case, we need to reset the vring endianness to + * native as legacy devices expect so by default. + */ + if (vhost_needs_vring_endian(vdev)) { + vhost_virtqueue_set_vring_endian_legacy(dev, + !virtio_is_big_endian(vdev), + vhost_vq_index); + } + + vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx), + 1, virtio_queue_get_used_size(vdev, idx)); + vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx), + 0, virtio_queue_get_avail_size(vdev, idx)); + vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx), + 0, virtio_queue_get_desc_size(vdev, idx)); +} + +static void vhost_eventfd_add(MemoryListener *listener, + MemoryRegionSection *section, + bool match_data, uint64_t data, EventNotifier *e) +{ +} + +static void vhost_eventfd_del(MemoryListener *listener, + MemoryRegionSection *section, + bool match_data, uint64_t data, EventNotifier *e) +{ +} + +static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev, + int n, uint32_t timeout) +{ + int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n); + struct vhost_vring_state state = { + .index = vhost_vq_index, + .num = timeout, + }; + int r; + + if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) { + return -EINVAL; + } + + r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state); + if (r) { + VHOST_OPS_DEBUG(r, "vhost_set_vring_busyloop_timeout failed"); + return r; + } + + return 0; +} + +static void vhost_virtqueue_error_notifier(EventNotifier *n) +{ + struct vhost_virtqueue *vq = container_of(n, struct vhost_virtqueue, + error_notifier); + struct vhost_dev *dev = vq->dev; + int index = vq - dev->vqs; + + if (event_notifier_test_and_clear(n) && dev->vdev) { + VHOST_OPS_DEBUG(-EINVAL, "vhost vring error in virtqueue %d", + dev->vq_index + index); + } +} + +static int vhost_virtqueue_init(struct vhost_dev *dev, + struct vhost_virtqueue *vq, int n) +{ + int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n); + struct vhost_vring_file file = { + .index = vhost_vq_index, + }; + int r = event_notifier_init(&vq->masked_notifier, 0); + if (r < 0) { + return r; + } + + file.fd = event_notifier_get_wfd(&vq->masked_notifier); + r = dev->vhost_ops->vhost_set_vring_call(dev, &file); + if (r) { + VHOST_OPS_DEBUG(r, "vhost_set_vring_call failed"); + goto fail_call; + } + + vq->dev = dev; + + if (dev->vhost_ops->vhost_set_vring_err) { + r = event_notifier_init(&vq->error_notifier, 0); + if (r < 0) { + goto fail_call; + } + + file.fd = event_notifier_get_fd(&vq->error_notifier); + r = dev->vhost_ops->vhost_set_vring_err(dev, &file); + if (r) { + VHOST_OPS_DEBUG(r, "vhost_set_vring_err failed"); + goto fail_err; + } + + event_notifier_set_handler(&vq->error_notifier, + vhost_virtqueue_error_notifier); + } + + return 0; + +fail_err: + event_notifier_cleanup(&vq->error_notifier); +fail_call: + event_notifier_cleanup(&vq->masked_notifier); + return r; +} + +static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq) +{ + event_notifier_cleanup(&vq->masked_notifier); + if (vq->dev->vhost_ops->vhost_set_vring_err) { + event_notifier_set_handler(&vq->error_notifier, NULL); + event_notifier_cleanup(&vq->error_notifier); + } +} + +int vhost_dev_init(struct vhost_dev *hdev, void *opaque, + VhostBackendType backend_type, uint32_t busyloop_timeout, + Error **errp) +{ + uint64_t features; + int i, r, n_initialized_vqs = 0; + + hdev->vdev = NULL; + hdev->migration_blocker = NULL; + + r = vhost_set_backend_type(hdev, backend_type); + assert(r >= 0); + + r = hdev->vhost_ops->vhost_backend_init(hdev, opaque, errp); + if (r < 0) { + goto fail; + } + + r = hdev->vhost_ops->vhost_set_owner(hdev); + if (r < 0) { + error_setg_errno(errp, -r, "vhost_set_owner failed"); + goto fail; + } + + r = hdev->vhost_ops->vhost_get_features(hdev, &features); + if (r < 0) { + error_setg_errno(errp, -r, "vhost_get_features failed"); + goto fail; + } + + for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) { + r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i); + if (r < 0) { + error_setg_errno(errp, -r, "Failed to initialize virtqueue %d", i); + goto fail; + } + } + + if (busyloop_timeout) { + for (i = 0; i < hdev->nvqs; ++i) { + r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, + busyloop_timeout); + if (r < 0) { + error_setg_errno(errp, -r, "Failed to set busyloop timeout"); + goto fail_busyloop; + } + } + } + + hdev->features = features; + + hdev->memory_listener = (MemoryListener) { + .name = "vhost", + .begin = vhost_begin, + .commit = vhost_commit, + .region_add = vhost_region_addnop, + .region_nop = vhost_region_addnop, + .log_start = vhost_log_start, + .log_stop = vhost_log_stop, + .log_sync = vhost_log_sync, + .log_global_start = vhost_log_global_start, + .log_global_stop = vhost_log_global_stop, + .eventfd_add = vhost_eventfd_add, + .eventfd_del = vhost_eventfd_del, + .priority = 10 + }; + + hdev->iommu_listener = (MemoryListener) { + .name = "vhost-iommu", + .region_add = vhost_iommu_region_add, + .region_del = vhost_iommu_region_del, + }; + + if (hdev->migration_blocker == NULL) { + if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) { + error_setg(&hdev->migration_blocker, + "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature."); + } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_alloc_check()) { + error_setg(&hdev->migration_blocker, + "Migration disabled: failed to allocate shared memory"); + } + } + + if (hdev->migration_blocker != NULL) { + r = migrate_add_blocker(hdev->migration_blocker, errp); + if (r < 0) { + error_free(hdev->migration_blocker); + goto fail_busyloop; + } + } + + hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions)); + hdev->n_mem_sections = 0; + hdev->mem_sections = NULL; + hdev->log = NULL; + hdev->log_size = 0; + hdev->log_enabled = false; + hdev->started = false; + memory_listener_register(&hdev->memory_listener, &address_space_memory); + QLIST_INSERT_HEAD(&vhost_devices, hdev, entry); + + if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) { + error_setg(errp, "vhost backend memory slots limit is less" + " than current number of present memory slots"); + r = -EINVAL; + goto fail_busyloop; + } + + return 0; + +fail_busyloop: + if (busyloop_timeout) { + while (--i >= 0) { + vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0); + } + } +fail: + hdev->nvqs = n_initialized_vqs; + vhost_dev_cleanup(hdev); + return r; +} + +void vhost_dev_cleanup(struct vhost_dev *hdev) +{ + int i; + + trace_vhost_dev_cleanup(hdev); + + for (i = 0; i < hdev->nvqs; ++i) { + vhost_virtqueue_cleanup(hdev->vqs + i); + } + if (hdev->mem) { + /* those are only safe after successful init */ + memory_listener_unregister(&hdev->memory_listener); + QLIST_REMOVE(hdev, entry); + } + if (hdev->migration_blocker) { + migrate_del_blocker(hdev->migration_blocker); + error_free(hdev->migration_blocker); + } + g_free(hdev->mem); + g_free(hdev->mem_sections); + if (hdev->vhost_ops) { + hdev->vhost_ops->vhost_backend_cleanup(hdev); + } + assert(!hdev->log); + + memset(hdev, 0, sizeof(struct vhost_dev)); +} + +/* Stop processing guest IO notifications in qemu. + * Start processing them in vhost in kernel. + */ +int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev) +{ + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); + int i, r, e; + + /* We will pass the notifiers to the kernel, make sure that QEMU + * doesn't interfere. + */ + r = virtio_device_grab_ioeventfd(vdev); + if (r < 0) { + error_report("binding does not support host notifiers"); + goto fail; + } + + for (i = 0; i < hdev->nvqs; ++i) { + r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i, + true); + if (r < 0) { + error_report("vhost VQ %d notifier binding failed: %d", i, -r); + goto fail_vq; + } + } + + return 0; +fail_vq: + while (--i >= 0) { + e = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i, + false); + if (e < 0) { + error_report("vhost VQ %d notifier cleanup error: %d", i, -r); + } + assert (e >= 0); + virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i); + } + virtio_device_release_ioeventfd(vdev); +fail: + return r; +} + +/* Stop processing guest IO notifications in vhost. + * Start processing them in qemu. + * This might actually run the qemu handlers right away, + * so virtio in qemu must be completely setup when this is called. + */ +void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev) +{ + BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); + int i, r; + + for (i = 0; i < hdev->nvqs; ++i) { + r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i, + false); + if (r < 0) { + error_report("vhost VQ %d notifier cleanup failed: %d", i, -r); + } + assert (r >= 0); + virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i); + } + virtio_device_release_ioeventfd(vdev); +} + +/* Test and clear event pending status. + * Should be called after unmask to avoid losing events. + */ +bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n) +{ + struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index; + assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs); + return event_notifier_test_and_clear(&vq->masked_notifier); +} + +/* Mask/unmask events from this vq. */ +void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n, + bool mask) +{ + struct VirtQueue *vvq = virtio_get_queue(vdev, n); + int r, index = n - hdev->vq_index; + struct vhost_vring_file file; + + /* should only be called after backend is connected */ + assert(hdev->vhost_ops); + + if (mask) { + assert(vdev->use_guest_notifier_mask); + file.fd = event_notifier_get_wfd(&hdev->vqs[index].masked_notifier); + } else { + file.fd = event_notifier_get_wfd(virtio_queue_get_guest_notifier(vvq)); + } + + file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n); + r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file); + if (r < 0) { + VHOST_OPS_DEBUG(r, "vhost_set_vring_call failed"); + } +} + +uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits, + uint64_t features) +{ + const int *bit = feature_bits; + while (*bit != VHOST_INVALID_FEATURE_BIT) { + uint64_t bit_mask = (1ULL << *bit); + if (!(hdev->features & bit_mask)) { + features &= ~bit_mask; + } + bit++; + } + return features; +} + +void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits, + uint64_t features) +{ + const int *bit = feature_bits; + while (*bit != VHOST_INVALID_FEATURE_BIT) { + uint64_t bit_mask = (1ULL << *bit); + if (features & bit_mask) { + hdev->acked_features |= bit_mask; + } + bit++; + } +} + +int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config, + uint32_t config_len, Error **errp) +{ + assert(hdev->vhost_ops); + + if (hdev->vhost_ops->vhost_get_config) { + return hdev->vhost_ops->vhost_get_config(hdev, config, config_len, + errp); + } + + error_setg(errp, "vhost_get_config not implemented"); + return -ENOSYS; +} + +int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data, + uint32_t offset, uint32_t size, uint32_t flags) +{ + assert(hdev->vhost_ops); + + if (hdev->vhost_ops->vhost_set_config) { + return hdev->vhost_ops->vhost_set_config(hdev, data, offset, + size, flags); + } + + return -ENOSYS; +} + +void vhost_dev_set_config_notifier(struct vhost_dev *hdev, + const VhostDevConfigOps *ops) +{ + hdev->config_ops = ops; +} + +void vhost_dev_free_inflight(struct vhost_inflight *inflight) +{ + if (inflight && inflight->addr) { + qemu_memfd_free(inflight->addr, inflight->size, inflight->fd); + inflight->addr = NULL; + inflight->fd = -1; + } +} + +static int vhost_dev_resize_inflight(struct vhost_inflight *inflight, + uint64_t new_size) +{ + Error *err = NULL; + int fd = -1; + void *addr = qemu_memfd_alloc("vhost-inflight", new_size, + F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL, + &fd, &err); + + if (err) { + error_report_err(err); + return -ENOMEM; + } + + vhost_dev_free_inflight(inflight); + inflight->offset = 0; + inflight->addr = addr; + inflight->fd = fd; + inflight->size = new_size; + + return 0; +} + +void vhost_dev_save_inflight(struct vhost_inflight *inflight, QEMUFile *f) +{ + if (inflight->addr) { + qemu_put_be64(f, inflight->size); + qemu_put_be16(f, inflight->queue_size); + qemu_put_buffer(f, inflight->addr, inflight->size); + } else { + qemu_put_be64(f, 0); + } +} + +int vhost_dev_load_inflight(struct vhost_inflight *inflight, QEMUFile *f) +{ + uint64_t size; + + size = qemu_get_be64(f); + if (!size) { + return 0; + } + + if (inflight->size != size) { + int ret = vhost_dev_resize_inflight(inflight, size); + if (ret < 0) { + return ret; + } + } + inflight->queue_size = qemu_get_be16(f); + + qemu_get_buffer(f, inflight->addr, size); + + return 0; +} + +int vhost_dev_prepare_inflight(struct vhost_dev *hdev, VirtIODevice *vdev) +{ + int r; + + if (hdev->vhost_ops->vhost_get_inflight_fd == NULL || + hdev->vhost_ops->vhost_set_inflight_fd == NULL) { + return 0; + } + + hdev->vdev = vdev; + + r = vhost_dev_set_features(hdev, hdev->log_enabled); + if (r < 0) { + VHOST_OPS_DEBUG(r, "vhost_dev_prepare_inflight failed"); + return r; + } + + return 0; +} + +int vhost_dev_set_inflight(struct vhost_dev *dev, + struct vhost_inflight *inflight) +{ + int r; + + if (dev->vhost_ops->vhost_set_inflight_fd && inflight->addr) { + r = dev->vhost_ops->vhost_set_inflight_fd(dev, inflight); + if (r) { + VHOST_OPS_DEBUG(r, "vhost_set_inflight_fd failed"); + return r; + } + } + + return 0; +} + +int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size, + struct vhost_inflight *inflight) +{ + int r; + + if (dev->vhost_ops->vhost_get_inflight_fd) { + r = dev->vhost_ops->vhost_get_inflight_fd(dev, queue_size, inflight); + if (r) { + VHOST_OPS_DEBUG(r, "vhost_get_inflight_fd failed"); + return r; + } + } + + return 0; +} + +static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable) +{ + if (!hdev->vhost_ops->vhost_set_vring_enable) { + return 0; + } + + /* + * For vhost-user devices, if VHOST_USER_F_PROTOCOL_FEATURES has not + * been negotiated, the rings start directly in the enabled state, and + * .vhost_set_vring_enable callback will fail since + * VHOST_USER_SET_VRING_ENABLE is not supported. + */ + if (hdev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER && + !virtio_has_feature(hdev->backend_features, + VHOST_USER_F_PROTOCOL_FEATURES)) { + return 0; + } + + return hdev->vhost_ops->vhost_set_vring_enable(hdev, enable); +} + +/* Host notifiers must be enabled at this point. */ +int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) +{ + int i, r; + + /* should only be called after backend is connected */ + assert(hdev->vhost_ops); + + trace_vhost_dev_start(hdev, vdev->name, vrings); + + vdev->vhost_started = true; + hdev->started = true; + hdev->vdev = vdev; + + r = vhost_dev_set_features(hdev, hdev->log_enabled); + if (r < 0) { + goto fail_features; + } + + if (vhost_dev_has_iommu(hdev)) { + memory_listener_register(&hdev->iommu_listener, vdev->dma_as); + } + + r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem); + if (r < 0) { + VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed"); + goto fail_mem; + } + for (i = 0; i < hdev->nvqs; ++i) { + r = vhost_virtqueue_start(hdev, + vdev, + hdev->vqs + i, + hdev->vq_index + i); + if (r < 0) { + goto fail_vq; + } + } + + if (hdev->log_enabled) { + uint64_t log_base; + + hdev->log_size = vhost_get_log_size(hdev); + hdev->log = vhost_log_get(hdev->log_size, + vhost_dev_log_is_shared(hdev)); + log_base = (uintptr_t)hdev->log->log; + r = hdev->vhost_ops->vhost_set_log_base(hdev, + hdev->log_size ? log_base : 0, + hdev->log); + if (r < 0) { + VHOST_OPS_DEBUG(r, "vhost_set_log_base failed"); + goto fail_log; + } + } + if (vrings) { + r = vhost_dev_set_vring_enable(hdev, true); + if (r) { + goto fail_log; + } + } + if (hdev->vhost_ops->vhost_dev_start) { + r = hdev->vhost_ops->vhost_dev_start(hdev, true); + if (r) { + goto fail_start; + } + } + if (vhost_dev_has_iommu(hdev) && + hdev->vhost_ops->vhost_set_iotlb_callback) { + hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true); + + /* Update used ring information for IOTLB to work correctly, + * vhost-kernel code requires for this.*/ + for (i = 0; i < hdev->nvqs; ++i) { + struct vhost_virtqueue *vq = hdev->vqs + i; + vhost_device_iotlb_miss(hdev, vq->used_phys, true); + } + } + return 0; +fail_start: + if (vrings) { + vhost_dev_set_vring_enable(hdev, false); + } +fail_log: + vhost_log_put(hdev, false); +fail_vq: + while (--i >= 0) { + vhost_virtqueue_stop(hdev, + vdev, + hdev->vqs + i, + hdev->vq_index + i); + } + +fail_mem: +fail_features: + vdev->vhost_started = false; + hdev->started = false; + return r; +} + +/* Host notifiers must be enabled at this point. */ +void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings) +{ + int i; + + /* should only be called after backend is connected */ + assert(hdev->vhost_ops); + + trace_vhost_dev_stop(hdev, vdev->name, vrings); + + if (hdev->vhost_ops->vhost_dev_start) { + hdev->vhost_ops->vhost_dev_start(hdev, false); + } + if (vrings) { + vhost_dev_set_vring_enable(hdev, false); + } + for (i = 0; i < hdev->nvqs; ++i) { + vhost_virtqueue_stop(hdev, + vdev, + hdev->vqs + i, + hdev->vq_index + i); + } + + if (vhost_dev_has_iommu(hdev)) { + if (hdev->vhost_ops->vhost_set_iotlb_callback) { + hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false); + } + memory_listener_unregister(&hdev->iommu_listener); + } + vhost_log_put(hdev, true); + hdev->started = false; + vdev->vhost_started = false; + hdev->vdev = NULL; +} + +int vhost_net_set_backend(struct vhost_dev *hdev, + struct vhost_vring_file *file) +{ + if (hdev->vhost_ops->vhost_net_set_backend) { + return hdev->vhost_ops->vhost_net_set_backend(hdev, file); + } + + return -ENOSYS; +} diff --git a/hw/virtio/virtio-9p-pci.c b/hw/virtio/virtio-9p-pci.c new file mode 100644 index 00000000..94c14f0b --- /dev/null +++ b/hw/virtio/virtio-9p-pci.c @@ -0,0 +1,91 @@ +/* + * Virtio 9p PCI Bindings + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include "qemu/osdep.h" + +#include "hw/virtio/virtio-pci.h" +#include "hw/9pfs/virtio-9p.h" +#include "hw/qdev-properties.h" +#include "qemu/module.h" +#include "qom/object.h" + +/* + * virtio-9p-pci: This extends VirtioPCIProxy. + */ + +#define TYPE_VIRTIO_9P_PCI "virtio-9p-pci-base" +typedef struct V9fsPCIState V9fsPCIState; +DECLARE_INSTANCE_CHECKER(V9fsPCIState, VIRTIO_9P_PCI, + TYPE_VIRTIO_9P_PCI) + +struct V9fsPCIState { + VirtIOPCIProxy parent_obj; + V9fsVirtioState vdev; +}; + +static void virtio_9p_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + V9fsPCIState *dev = VIRTIO_9P_PCI(vpci_dev); + DeviceState *vdev = DEVICE(&dev->vdev); + + qdev_realize(vdev, BUS(&vpci_dev->bus), errp); +} + +static Property virtio_9p_pci_properties[] = { + DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true), + DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 2), + DEFINE_PROP_END_OF_LIST(), +}; + +static void virtio_9p_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + + k->realize = virtio_9p_pci_realize; + pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; + pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_9P; + pcidev_k->revision = VIRTIO_PCI_ABI_VERSION; + pcidev_k->class_id = 0x2; + set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); + device_class_set_props(dc, virtio_9p_pci_properties); +} + +static void virtio_9p_pci_instance_init(Object *obj) +{ + V9fsPCIState *dev = VIRTIO_9P_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VIRTIO_9P); +} + +static const VirtioPCIDeviceTypeInfo virtio_9p_pci_info = { + .base_name = TYPE_VIRTIO_9P_PCI, + .generic_name = "virtio-9p-pci", + .transitional_name = "virtio-9p-pci-transitional", + .non_transitional_name = "virtio-9p-pci-non-transitional", + .instance_size = sizeof(V9fsPCIState), + .instance_init = virtio_9p_pci_instance_init, + .class_init = virtio_9p_pci_class_init, +}; + +static void virtio_9p_pci_register(void) +{ + virtio_pci_types_register(&virtio_9p_pci_info); +} + +type_init(virtio_9p_pci_register) diff --git a/hw/virtio/virtio-balloon-pci.c b/hw/virtio/virtio-balloon-pci.c new file mode 100644 index 00000000..ce2645ba --- /dev/null +++ b/hw/virtio/virtio-balloon-pci.c @@ -0,0 +1,88 @@ +/* + * Virtio balloon PCI Bindings + * + * Copyright IBM, Corp. 2007 + * Copyright (c) 2009 CodeSourcery + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * Paul Brook <paul@codesourcery.com> + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include "qemu/osdep.h" + +#include "hw/virtio/virtio-pci.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/virtio-balloon.h" +#include "qapi/error.h" +#include "qemu/module.h" +#include "qom/object.h" + +typedef struct VirtIOBalloonPCI VirtIOBalloonPCI; + +/* + * virtio-balloon-pci: This extends VirtioPCIProxy. + */ +#define TYPE_VIRTIO_BALLOON_PCI "virtio-balloon-pci-base" +DECLARE_INSTANCE_CHECKER(VirtIOBalloonPCI, VIRTIO_BALLOON_PCI, + TYPE_VIRTIO_BALLOON_PCI) + +struct VirtIOBalloonPCI { + VirtIOPCIProxy parent_obj; + VirtIOBalloon vdev; +}; + +static void virtio_balloon_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VirtIOBalloonPCI *dev = VIRTIO_BALLOON_PCI(vpci_dev); + DeviceState *vdev = DEVICE(&dev->vdev); + + vpci_dev->class_code = PCI_CLASS_OTHERS; + qdev_realize(vdev, BUS(&vpci_dev->bus), errp); +} + +static void virtio_balloon_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + k->realize = virtio_balloon_pci_realize; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; + pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_BALLOON; + pcidev_k->revision = VIRTIO_PCI_ABI_VERSION; + pcidev_k->class_id = PCI_CLASS_OTHERS; +} + +static void virtio_balloon_pci_instance_init(Object *obj) +{ + VirtIOBalloonPCI *dev = VIRTIO_BALLOON_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VIRTIO_BALLOON); + object_property_add_alias(obj, "guest-stats", OBJECT(&dev->vdev), + "guest-stats"); + object_property_add_alias(obj, "guest-stats-polling-interval", + OBJECT(&dev->vdev), + "guest-stats-polling-interval"); +} + +static const VirtioPCIDeviceTypeInfo virtio_balloon_pci_info = { + .base_name = TYPE_VIRTIO_BALLOON_PCI, + .generic_name = "virtio-balloon-pci", + .transitional_name = "virtio-balloon-pci-transitional", + .non_transitional_name = "virtio-balloon-pci-non-transitional", + .instance_size = sizeof(VirtIOBalloonPCI), + .instance_init = virtio_balloon_pci_instance_init, + .class_init = virtio_balloon_pci_class_init, +}; + +static void virtio_balloon_pci_register(void) +{ + virtio_pci_types_register(&virtio_balloon_pci_info); +} + +type_init(virtio_balloon_pci_register) diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c new file mode 100644 index 00000000..73ac5eb6 --- /dev/null +++ b/hw/virtio/virtio-balloon.c @@ -0,0 +1,1079 @@ +/* + * Virtio Balloon Device + * + * Copyright IBM, Corp. 2008 + * Copyright (C) 2011 Red Hat, Inc. + * Copyright (C) 2011 Amit Shah <amit.shah@redhat.com> + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "qemu/iov.h" +#include "qemu/module.h" +#include "qemu/timer.h" +#include "qemu/madvise.h" +#include "hw/virtio/virtio.h" +#include "hw/mem/pc-dimm.h" +#include "hw/qdev-properties.h" +#include "hw/boards.h" +#include "sysemu/balloon.h" +#include "hw/virtio/virtio-balloon.h" +#include "exec/address-spaces.h" +#include "qapi/error.h" +#include "qapi/qapi-events-machine.h" +#include "qapi/visitor.h" +#include "trace.h" +#include "qemu/error-report.h" +#include "migration/misc.h" +#include "migration/migration.h" + +#include "hw/virtio/virtio-bus.h" +#include "hw/virtio/virtio-access.h" + +#define BALLOON_PAGE_SIZE (1 << VIRTIO_BALLOON_PFN_SHIFT) + +typedef struct PartiallyBalloonedPage { + ram_addr_t base_gpa; + unsigned long *bitmap; +} PartiallyBalloonedPage; + +static void virtio_balloon_pbp_free(PartiallyBalloonedPage *pbp) +{ + if (!pbp->bitmap) { + return; + } + g_free(pbp->bitmap); + pbp->bitmap = NULL; +} + +static void virtio_balloon_pbp_alloc(PartiallyBalloonedPage *pbp, + ram_addr_t base_gpa, + long subpages) +{ + pbp->base_gpa = base_gpa; + pbp->bitmap = bitmap_new(subpages); +} + +static bool virtio_balloon_pbp_matches(PartiallyBalloonedPage *pbp, + ram_addr_t base_gpa) +{ + return pbp->base_gpa == base_gpa; +} + +static bool virtio_balloon_inhibited(void) +{ + /* + * Postcopy cannot deal with concurrent discards, + * so it's special, as well as background snapshots. + */ + return ram_block_discard_is_disabled() || migration_in_incoming_postcopy() || + migration_in_bg_snapshot(); +} + +static void balloon_inflate_page(VirtIOBalloon *balloon, + MemoryRegion *mr, hwaddr mr_offset, + PartiallyBalloonedPage *pbp) +{ + void *addr = memory_region_get_ram_ptr(mr) + mr_offset; + ram_addr_t rb_offset, rb_aligned_offset, base_gpa; + RAMBlock *rb; + size_t rb_page_size; + int subpages; + + /* XXX is there a better way to get to the RAMBlock than via a + * host address? */ + rb = qemu_ram_block_from_host(addr, false, &rb_offset); + rb_page_size = qemu_ram_pagesize(rb); + + if (rb_page_size == BALLOON_PAGE_SIZE) { + /* Easy case */ + + ram_block_discard_range(rb, rb_offset, rb_page_size); + /* We ignore errors from ram_block_discard_range(), because it + * has already reported them, and failing to discard a balloon + * page is not fatal */ + return; + } + + /* Hard case + * + * We've put a piece of a larger host page into the balloon - we + * need to keep track until we have a whole host page to + * discard + */ + warn_report_once( +"Balloon used with backing page size > 4kiB, this may not be reliable"); + + rb_aligned_offset = QEMU_ALIGN_DOWN(rb_offset, rb_page_size); + subpages = rb_page_size / BALLOON_PAGE_SIZE; + base_gpa = memory_region_get_ram_addr(mr) + mr_offset - + (rb_offset - rb_aligned_offset); + + if (pbp->bitmap && !virtio_balloon_pbp_matches(pbp, base_gpa)) { + /* We've partially ballooned part of a host page, but now + * we're trying to balloon part of a different one. Too hard, + * give up on the old partial page */ + virtio_balloon_pbp_free(pbp); + } + + if (!pbp->bitmap) { + virtio_balloon_pbp_alloc(pbp, base_gpa, subpages); + } + + set_bit((rb_offset - rb_aligned_offset) / BALLOON_PAGE_SIZE, + pbp->bitmap); + + if (bitmap_full(pbp->bitmap, subpages)) { + /* We've accumulated a full host page, we can actually discard + * it now */ + + ram_block_discard_range(rb, rb_aligned_offset, rb_page_size); + /* We ignore errors from ram_block_discard_range(), because it + * has already reported them, and failing to discard a balloon + * page is not fatal */ + virtio_balloon_pbp_free(pbp); + } +} + +static void balloon_deflate_page(VirtIOBalloon *balloon, + MemoryRegion *mr, hwaddr mr_offset) +{ + void *addr = memory_region_get_ram_ptr(mr) + mr_offset; + ram_addr_t rb_offset; + RAMBlock *rb; + size_t rb_page_size; + void *host_addr; + int ret; + + /* XXX is there a better way to get to the RAMBlock than via a + * host address? */ + rb = qemu_ram_block_from_host(addr, false, &rb_offset); + rb_page_size = qemu_ram_pagesize(rb); + + host_addr = (void *)((uintptr_t)addr & ~(rb_page_size - 1)); + + /* When a page is deflated, we hint the whole host page it lives + * on, since we can't do anything smaller */ + ret = qemu_madvise(host_addr, rb_page_size, QEMU_MADV_WILLNEED); + if (ret != 0) { + warn_report("Couldn't MADV_WILLNEED on balloon deflate: %s", + strerror(errno)); + /* Otherwise ignore, failing to page hint shouldn't be fatal */ + } +} + +static const char *balloon_stat_names[] = { + [VIRTIO_BALLOON_S_SWAP_IN] = "stat-swap-in", + [VIRTIO_BALLOON_S_SWAP_OUT] = "stat-swap-out", + [VIRTIO_BALLOON_S_MAJFLT] = "stat-major-faults", + [VIRTIO_BALLOON_S_MINFLT] = "stat-minor-faults", + [VIRTIO_BALLOON_S_MEMFREE] = "stat-free-memory", + [VIRTIO_BALLOON_S_MEMTOT] = "stat-total-memory", + [VIRTIO_BALLOON_S_AVAIL] = "stat-available-memory", + [VIRTIO_BALLOON_S_CACHES] = "stat-disk-caches", + [VIRTIO_BALLOON_S_HTLB_PGALLOC] = "stat-htlb-pgalloc", + [VIRTIO_BALLOON_S_HTLB_PGFAIL] = "stat-htlb-pgfail", + [VIRTIO_BALLOON_S_NR] = NULL +}; + +/* + * reset_stats - Mark all items in the stats array as unset + * + * This function needs to be called at device initialization and before + * updating to a set of newly-generated stats. This will ensure that no + * stale values stick around in case the guest reports a subset of the supported + * statistics. + */ +static inline void reset_stats(VirtIOBalloon *dev) +{ + int i; + for (i = 0; i < VIRTIO_BALLOON_S_NR; dev->stats[i++] = -1); +} + +static bool balloon_stats_supported(const VirtIOBalloon *s) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(s); + return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ); +} + +static bool balloon_stats_enabled(const VirtIOBalloon *s) +{ + return s->stats_poll_interval > 0; +} + +static void balloon_stats_destroy_timer(VirtIOBalloon *s) +{ + if (balloon_stats_enabled(s)) { + timer_free(s->stats_timer); + s->stats_timer = NULL; + s->stats_poll_interval = 0; + } +} + +static void balloon_stats_change_timer(VirtIOBalloon *s, int64_t secs) +{ + timer_mod(s->stats_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + secs * 1000); +} + +static void balloon_stats_poll_cb(void *opaque) +{ + VirtIOBalloon *s = opaque; + VirtIODevice *vdev = VIRTIO_DEVICE(s); + + if (s->stats_vq_elem == NULL || !balloon_stats_supported(s)) { + /* re-schedule */ + balloon_stats_change_timer(s, s->stats_poll_interval); + return; + } + + virtqueue_push(s->svq, s->stats_vq_elem, 0); + virtio_notify(vdev, s->svq); + g_free(s->stats_vq_elem); + s->stats_vq_elem = NULL; +} + +static void balloon_stats_get_all(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + Error *err = NULL; + VirtIOBalloon *s = VIRTIO_BALLOON(obj); + int i; + + if (!visit_start_struct(v, name, NULL, 0, &err)) { + goto out; + } + if (!visit_type_int(v, "last-update", &s->stats_last_update, &err)) { + goto out_end; + } + + if (!visit_start_struct(v, "stats", NULL, 0, &err)) { + goto out_end; + } + for (i = 0; i < VIRTIO_BALLOON_S_NR; i++) { + if (!visit_type_uint64(v, balloon_stat_names[i], &s->stats[i], &err)) { + goto out_nested; + } + } + visit_check_struct(v, &err); +out_nested: + visit_end_struct(v, NULL); + + if (!err) { + visit_check_struct(v, &err); + } +out_end: + visit_end_struct(v, NULL); +out: + error_propagate(errp, err); +} + +static void balloon_stats_get_poll_interval(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + VirtIOBalloon *s = VIRTIO_BALLOON(obj); + visit_type_int(v, name, &s->stats_poll_interval, errp); +} + +static void balloon_stats_set_poll_interval(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + VirtIOBalloon *s = VIRTIO_BALLOON(obj); + int64_t value; + + if (!visit_type_int(v, name, &value, errp)) { + return; + } + + if (value < 0) { + error_setg(errp, "timer value must be greater than zero"); + return; + } + + if (value > UINT32_MAX) { + error_setg(errp, "timer value is too big"); + return; + } + + if (value == s->stats_poll_interval) { + return; + } + + if (value == 0) { + /* timer=0 disables the timer */ + balloon_stats_destroy_timer(s); + return; + } + + if (balloon_stats_enabled(s)) { + /* timer interval change */ + s->stats_poll_interval = value; + balloon_stats_change_timer(s, value); + return; + } + + /* create a new timer */ + g_assert(s->stats_timer == NULL); + s->stats_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, balloon_stats_poll_cb, s); + s->stats_poll_interval = value; + balloon_stats_change_timer(s, 0); +} + +static void virtio_balloon_handle_report(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); + VirtQueueElement *elem; + + while ((elem = virtqueue_pop(vq, sizeof(VirtQueueElement)))) { + unsigned int i; + + /* + * When we discard the page it has the effect of removing the page + * from the hypervisor itself and causing it to be zeroed when it + * is returned to us. So we must not discard the page if it is + * accessible by another device or process, or if the guest is + * expecting it to retain a non-zero value. + */ + if (virtio_balloon_inhibited() || dev->poison_val) { + goto skip_element; + } + + for (i = 0; i < elem->in_num; i++) { + void *addr = elem->in_sg[i].iov_base; + size_t size = elem->in_sg[i].iov_len; + ram_addr_t ram_offset; + RAMBlock *rb; + + /* + * There is no need to check the memory section to see if + * it is ram/readonly/romd like there is for handle_output + * below. If the region is not meant to be written to then + * address_space_map will have allocated a bounce buffer + * and it will be freed in address_space_unmap and trigger + * and unassigned_mem_write before failing to copy over the + * buffer. If more than one bad descriptor is provided it + * will return NULL after the first bounce buffer and fail + * to map any resources. + */ + rb = qemu_ram_block_from_host(addr, false, &ram_offset); + if (!rb) { + trace_virtio_balloon_bad_addr(elem->in_addr[i]); + continue; + } + + /* + * For now we will simply ignore unaligned memory regions, or + * regions that overrun the end of the RAMBlock. + */ + if (!QEMU_IS_ALIGNED(ram_offset | size, qemu_ram_pagesize(rb)) || + (ram_offset + size) > qemu_ram_get_used_length(rb)) { + continue; + } + + ram_block_discard_range(rb, ram_offset, size); + } + +skip_element: + virtqueue_push(vq, elem, 0); + virtio_notify(vdev, vq); + g_free(elem); + } +} + +static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIOBalloon *s = VIRTIO_BALLOON(vdev); + VirtQueueElement *elem; + MemoryRegionSection section; + + for (;;) { + PartiallyBalloonedPage pbp = {}; + size_t offset = 0; + uint32_t pfn; + + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); + if (!elem) { + break; + } + + while (iov_to_buf(elem->out_sg, elem->out_num, offset, &pfn, 4) == 4) { + unsigned int p = virtio_ldl_p(vdev, &pfn); + hwaddr pa; + + pa = (hwaddr) p << VIRTIO_BALLOON_PFN_SHIFT; + offset += 4; + + section = memory_region_find(get_system_memory(), pa, + BALLOON_PAGE_SIZE); + if (!section.mr) { + trace_virtio_balloon_bad_addr(pa); + continue; + } + if (!memory_region_is_ram(section.mr) || + memory_region_is_rom(section.mr) || + memory_region_is_romd(section.mr)) { + trace_virtio_balloon_bad_addr(pa); + memory_region_unref(section.mr); + continue; + } + + trace_virtio_balloon_handle_output(memory_region_name(section.mr), + pa); + if (!virtio_balloon_inhibited()) { + if (vq == s->ivq) { + balloon_inflate_page(s, section.mr, + section.offset_within_region, &pbp); + } else if (vq == s->dvq) { + balloon_deflate_page(s, section.mr, section.offset_within_region); + } else { + g_assert_not_reached(); + } + } + memory_region_unref(section.mr); + } + + virtqueue_push(vq, elem, 0); + virtio_notify(vdev, vq); + g_free(elem); + virtio_balloon_pbp_free(&pbp); + } +} + +static void virtio_balloon_receive_stats(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIOBalloon *s = VIRTIO_BALLOON(vdev); + VirtQueueElement *elem; + VirtIOBalloonStat stat; + size_t offset = 0; + + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); + if (!elem) { + goto out; + } + + if (s->stats_vq_elem != NULL) { + /* This should never happen if the driver follows the spec. */ + virtqueue_push(vq, s->stats_vq_elem, 0); + virtio_notify(vdev, vq); + g_free(s->stats_vq_elem); + } + + s->stats_vq_elem = elem; + + /* Initialize the stats to get rid of any stale values. This is only + * needed to handle the case where a guest supports fewer stats than it + * used to (ie. it has booted into an old kernel). + */ + reset_stats(s); + + while (iov_to_buf(elem->out_sg, elem->out_num, offset, &stat, sizeof(stat)) + == sizeof(stat)) { + uint16_t tag = virtio_tswap16(vdev, stat.tag); + uint64_t val = virtio_tswap64(vdev, stat.val); + + offset += sizeof(stat); + if (tag < VIRTIO_BALLOON_S_NR) + s->stats[tag] = val; + } + s->stats_vq_offset = offset; + s->stats_last_update = g_get_real_time() / G_USEC_PER_SEC; + +out: + if (balloon_stats_enabled(s)) { + balloon_stats_change_timer(s, s->stats_poll_interval); + } +} + +static void virtio_balloon_handle_free_page_vq(VirtIODevice *vdev, + VirtQueue *vq) +{ + VirtIOBalloon *s = VIRTIO_BALLOON(vdev); + qemu_bh_schedule(s->free_page_bh); +} + +static bool get_free_page_hints(VirtIOBalloon *dev) +{ + VirtQueueElement *elem; + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VirtQueue *vq = dev->free_page_vq; + bool ret = true; + int i; + + while (dev->block_iothread) { + qemu_cond_wait(&dev->free_page_cond, &dev->free_page_lock); + } + + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); + if (!elem) { + return false; + } + + if (elem->out_num) { + uint32_t id; + size_t size = iov_to_buf(elem->out_sg, elem->out_num, 0, + &id, sizeof(id)); + + virtio_tswap32s(vdev, &id); + if (unlikely(size != sizeof(id))) { + virtio_error(vdev, "received an incorrect cmd id"); + ret = false; + goto out; + } + if (dev->free_page_hint_status == FREE_PAGE_HINT_S_REQUESTED && + id == dev->free_page_hint_cmd_id) { + dev->free_page_hint_status = FREE_PAGE_HINT_S_START; + } else if (dev->free_page_hint_status == FREE_PAGE_HINT_S_START) { + /* + * Stop the optimization only when it has started. This + * avoids a stale stop sign for the previous command. + */ + dev->free_page_hint_status = FREE_PAGE_HINT_S_STOP; + } + } + + if (elem->in_num && dev->free_page_hint_status == FREE_PAGE_HINT_S_START) { + for (i = 0; i < elem->in_num; i++) { + qemu_guest_free_page_hint(elem->in_sg[i].iov_base, + elem->in_sg[i].iov_len); + } + } + +out: + virtqueue_push(vq, elem, 0); + g_free(elem); + return ret; +} + +static void virtio_ballloon_get_free_page_hints(void *opaque) +{ + VirtIOBalloon *dev = opaque; + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VirtQueue *vq = dev->free_page_vq; + bool continue_to_get_hints; + + do { + qemu_mutex_lock(&dev->free_page_lock); + virtio_queue_set_notification(vq, 0); + continue_to_get_hints = get_free_page_hints(dev); + qemu_mutex_unlock(&dev->free_page_lock); + virtio_notify(vdev, vq); + /* + * Start to poll the vq once the hinting started. Otherwise, continue + * only when there are entries on the vq, which need to be given back. + */ + } while (continue_to_get_hints || + dev->free_page_hint_status == FREE_PAGE_HINT_S_START); + virtio_queue_set_notification(vq, 1); +} + +static bool virtio_balloon_free_page_support(void *opaque) +{ + VirtIOBalloon *s = opaque; + VirtIODevice *vdev = VIRTIO_DEVICE(s); + + return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT); +} + +static void virtio_balloon_free_page_start(VirtIOBalloon *s) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(s); + + qemu_mutex_lock(&s->free_page_lock); + + if (s->free_page_hint_cmd_id == UINT_MAX) { + s->free_page_hint_cmd_id = VIRTIO_BALLOON_FREE_PAGE_HINT_CMD_ID_MIN; + } else { + s->free_page_hint_cmd_id++; + } + + s->free_page_hint_status = FREE_PAGE_HINT_S_REQUESTED; + qemu_mutex_unlock(&s->free_page_lock); + + virtio_notify_config(vdev); +} + +static void virtio_balloon_free_page_stop(VirtIOBalloon *s) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(s); + + if (s->free_page_hint_status != FREE_PAGE_HINT_S_STOP) { + /* + * The lock also guarantees us that the + * virtio_ballloon_get_free_page_hints exits after the + * free_page_hint_status is set to S_STOP. + */ + qemu_mutex_lock(&s->free_page_lock); + /* + * The guest isn't done hinting, so send a notification + * to the guest to actively stop the hinting. + */ + s->free_page_hint_status = FREE_PAGE_HINT_S_STOP; + qemu_mutex_unlock(&s->free_page_lock); + virtio_notify_config(vdev); + } +} + +static void virtio_balloon_free_page_done(VirtIOBalloon *s) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(s); + + if (s->free_page_hint_status != FREE_PAGE_HINT_S_DONE) { + /* See virtio_balloon_free_page_stop() */ + qemu_mutex_lock(&s->free_page_lock); + s->free_page_hint_status = FREE_PAGE_HINT_S_DONE; + qemu_mutex_unlock(&s->free_page_lock); + virtio_notify_config(vdev); + } +} + +static int +virtio_balloon_free_page_hint_notify(NotifierWithReturn *n, void *data) +{ + VirtIOBalloon *dev = container_of(n, VirtIOBalloon, free_page_hint_notify); + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + PrecopyNotifyData *pnd = data; + + if (!virtio_balloon_free_page_support(dev)) { + /* + * This is an optimization provided to migration, so just return 0 to + * have the normal migration process not affected when this feature is + * not supported. + */ + return 0; + } + + /* + * Pages hinted via qemu_guest_free_page_hint() are cleared from the dirty + * bitmap and will not get migrated, especially also not when the postcopy + * destination starts using them and requests migration from the source; the + * faulting thread will stall until postcopy migration finishes and + * all threads are woken up. Let's not start free page hinting if postcopy + * is possible. + */ + if (migrate_postcopy_ram()) { + return 0; + } + + switch (pnd->reason) { + case PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC: + virtio_balloon_free_page_stop(dev); + break; + case PRECOPY_NOTIFY_AFTER_BITMAP_SYNC: + if (vdev->vm_running) { + virtio_balloon_free_page_start(dev); + break; + } + /* + * Set S_DONE before migrating the vmstate, so the guest will reuse + * all hinted pages once running on the destination. Fall through. + */ + case PRECOPY_NOTIFY_CLEANUP: + /* + * Especially, if something goes wrong during precopy or if migration + * is canceled, we have to properly communicate S_DONE to the VM. + */ + virtio_balloon_free_page_done(dev); + break; + case PRECOPY_NOTIFY_SETUP: + case PRECOPY_NOTIFY_COMPLETE: + break; + default: + virtio_error(vdev, "%s: %d reason unknown", __func__, pnd->reason); + } + + return 0; +} + +static size_t virtio_balloon_config_size(VirtIOBalloon *s) +{ + uint64_t features = s->host_features; + + if (s->qemu_4_0_config_size) { + return sizeof(struct virtio_balloon_config); + } + if (virtio_has_feature(features, VIRTIO_BALLOON_F_PAGE_POISON)) { + return sizeof(struct virtio_balloon_config); + } + if (virtio_has_feature(features, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { + return offsetof(struct virtio_balloon_config, poison_val); + } + return offsetof(struct virtio_balloon_config, free_page_hint_cmd_id); +} + +static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data) +{ + VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); + struct virtio_balloon_config config = {}; + + config.num_pages = cpu_to_le32(dev->num_pages); + config.actual = cpu_to_le32(dev->actual); + config.poison_val = cpu_to_le32(dev->poison_val); + + if (dev->free_page_hint_status == FREE_PAGE_HINT_S_REQUESTED) { + config.free_page_hint_cmd_id = + cpu_to_le32(dev->free_page_hint_cmd_id); + } else if (dev->free_page_hint_status == FREE_PAGE_HINT_S_STOP) { + config.free_page_hint_cmd_id = + cpu_to_le32(VIRTIO_BALLOON_CMD_ID_STOP); + } else if (dev->free_page_hint_status == FREE_PAGE_HINT_S_DONE) { + config.free_page_hint_cmd_id = + cpu_to_le32(VIRTIO_BALLOON_CMD_ID_DONE); + } + + trace_virtio_balloon_get_config(config.num_pages, config.actual); + memcpy(config_data, &config, virtio_balloon_config_size(dev)); +} + +static int build_dimm_list(Object *obj, void *opaque) +{ + GSList **list = opaque; + + if (object_dynamic_cast(obj, TYPE_PC_DIMM)) { + DeviceState *dev = DEVICE(obj); + if (dev->realized) { /* only realized DIMMs matter */ + *list = g_slist_prepend(*list, dev); + } + } + + object_child_foreach(obj, build_dimm_list, opaque); + return 0; +} + +static ram_addr_t get_current_ram_size(void) +{ + GSList *list = NULL, *item; + ram_addr_t size = current_machine->ram_size; + + build_dimm_list(qdev_get_machine(), &list); + for (item = list; item; item = g_slist_next(item)) { + Object *obj = OBJECT(item->data); + if (!strcmp(object_get_typename(obj), TYPE_PC_DIMM)) { + size += object_property_get_int(obj, PC_DIMM_SIZE_PROP, + &error_abort); + } + } + g_slist_free(list); + + return size; +} + +static bool virtio_balloon_page_poison_support(void *opaque) +{ + VirtIOBalloon *s = opaque; + VirtIODevice *vdev = VIRTIO_DEVICE(s); + + return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON); +} + +static void virtio_balloon_set_config(VirtIODevice *vdev, + const uint8_t *config_data) +{ + VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); + struct virtio_balloon_config config; + uint32_t oldactual = dev->actual; + ram_addr_t vm_ram_size = get_current_ram_size(); + + memcpy(&config, config_data, virtio_balloon_config_size(dev)); + dev->actual = le32_to_cpu(config.actual); + if (dev->actual != oldactual) { + qapi_event_send_balloon_change(vm_ram_size - + ((ram_addr_t) dev->actual << VIRTIO_BALLOON_PFN_SHIFT)); + } + dev->poison_val = 0; + if (virtio_balloon_page_poison_support(dev)) { + dev->poison_val = le32_to_cpu(config.poison_val); + } + trace_virtio_balloon_set_config(dev->actual, oldactual); +} + +static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f, + Error **errp) +{ + VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); + f |= dev->host_features; + virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ); + + return f; +} + +static void virtio_balloon_stat(void *opaque, BalloonInfo *info) +{ + VirtIOBalloon *dev = opaque; + info->actual = get_current_ram_size() - ((uint64_t) dev->actual << + VIRTIO_BALLOON_PFN_SHIFT); +} + +static void virtio_balloon_to_target(void *opaque, ram_addr_t target) +{ + VirtIOBalloon *dev = VIRTIO_BALLOON(opaque); + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + ram_addr_t vm_ram_size = get_current_ram_size(); + + if (target > vm_ram_size) { + target = vm_ram_size; + } + if (target) { + dev->num_pages = (vm_ram_size - target) >> VIRTIO_BALLOON_PFN_SHIFT; + virtio_notify_config(vdev); + } + trace_virtio_balloon_to_target(target, dev->num_pages); +} + +static int virtio_balloon_post_load_device(void *opaque, int version_id) +{ + VirtIOBalloon *s = VIRTIO_BALLOON(opaque); + + if (balloon_stats_enabled(s)) { + balloon_stats_change_timer(s, s->stats_poll_interval); + } + return 0; +} + +static const VMStateDescription vmstate_virtio_balloon_free_page_hint = { + .name = "virtio-balloon-device/free-page-report", + .version_id = 1, + .minimum_version_id = 1, + .needed = virtio_balloon_free_page_support, + .fields = (VMStateField[]) { + VMSTATE_UINT32(free_page_hint_cmd_id, VirtIOBalloon), + VMSTATE_UINT32(free_page_hint_status, VirtIOBalloon), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_virtio_balloon_page_poison = { + .name = "virtio-balloon-device/page-poison", + .version_id = 1, + .minimum_version_id = 1, + .needed = virtio_balloon_page_poison_support, + .fields = (VMStateField[]) { + VMSTATE_UINT32(poison_val, VirtIOBalloon), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_virtio_balloon_device = { + .name = "virtio-balloon-device", + .version_id = 1, + .minimum_version_id = 1, + .post_load = virtio_balloon_post_load_device, + .fields = (VMStateField[]) { + VMSTATE_UINT32(num_pages, VirtIOBalloon), + VMSTATE_UINT32(actual, VirtIOBalloon), + VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription * []) { + &vmstate_virtio_balloon_free_page_hint, + &vmstate_virtio_balloon_page_poison, + NULL + } +}; + +static void virtio_balloon_device_realize(DeviceState *dev, Error **errp) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VirtIOBalloon *s = VIRTIO_BALLOON(dev); + int ret; + + virtio_init(vdev, VIRTIO_ID_BALLOON, virtio_balloon_config_size(s)); + + ret = qemu_add_balloon_handler(virtio_balloon_to_target, + virtio_balloon_stat, s); + + if (ret < 0) { + error_setg(errp, "Only one balloon device is supported"); + virtio_cleanup(vdev); + return; + } + + if (virtio_has_feature(s->host_features, VIRTIO_BALLOON_F_FREE_PAGE_HINT) && + !s->iothread) { + error_setg(errp, "'free-page-hint' requires 'iothread' to be set"); + virtio_cleanup(vdev); + return; + } + + s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); + s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output); + s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats); + + if (virtio_has_feature(s->host_features, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { + s->free_page_vq = virtio_add_queue(vdev, VIRTQUEUE_MAX_SIZE, + virtio_balloon_handle_free_page_vq); + precopy_add_notifier(&s->free_page_hint_notify); + + object_ref(OBJECT(s->iothread)); + s->free_page_bh = aio_bh_new(iothread_get_aio_context(s->iothread), + virtio_ballloon_get_free_page_hints, s); + } + + if (virtio_has_feature(s->host_features, VIRTIO_BALLOON_F_REPORTING)) { + s->reporting_vq = virtio_add_queue(vdev, 32, + virtio_balloon_handle_report); + } + + reset_stats(s); +} + +static void virtio_balloon_device_unrealize(DeviceState *dev) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VirtIOBalloon *s = VIRTIO_BALLOON(dev); + + if (s->free_page_bh) { + qemu_bh_delete(s->free_page_bh); + object_unref(OBJECT(s->iothread)); + virtio_balloon_free_page_stop(s); + precopy_remove_notifier(&s->free_page_hint_notify); + } + balloon_stats_destroy_timer(s); + qemu_remove_balloon_handler(s); + + virtio_delete_queue(s->ivq); + virtio_delete_queue(s->dvq); + virtio_delete_queue(s->svq); + if (s->free_page_vq) { + virtio_delete_queue(s->free_page_vq); + } + if (s->reporting_vq) { + virtio_delete_queue(s->reporting_vq); + } + virtio_cleanup(vdev); +} + +static void virtio_balloon_device_reset(VirtIODevice *vdev) +{ + VirtIOBalloon *s = VIRTIO_BALLOON(vdev); + + if (virtio_balloon_free_page_support(s)) { + virtio_balloon_free_page_stop(s); + } + + if (s->stats_vq_elem != NULL) { + virtqueue_unpop(s->svq, s->stats_vq_elem, 0); + g_free(s->stats_vq_elem); + s->stats_vq_elem = NULL; + } + + s->poison_val = 0; +} + +static void virtio_balloon_set_status(VirtIODevice *vdev, uint8_t status) +{ + VirtIOBalloon *s = VIRTIO_BALLOON(vdev); + + if (!s->stats_vq_elem && vdev->vm_running && + (status & VIRTIO_CONFIG_S_DRIVER_OK) && virtqueue_rewind(s->svq, 1)) { + /* poll stats queue for the element we have discarded when the VM + * was stopped */ + virtio_balloon_receive_stats(vdev, s->svq); + } + + if (virtio_balloon_free_page_support(s)) { + /* + * The VM is woken up and the iothread was blocked, so signal it to + * continue. + */ + if (vdev->vm_running && s->block_iothread) { + qemu_mutex_lock(&s->free_page_lock); + s->block_iothread = false; + qemu_cond_signal(&s->free_page_cond); + qemu_mutex_unlock(&s->free_page_lock); + } + + /* The VM is stopped, block the iothread. */ + if (!vdev->vm_running) { + qemu_mutex_lock(&s->free_page_lock); + s->block_iothread = true; + qemu_mutex_unlock(&s->free_page_lock); + } + } +} + +static void virtio_balloon_instance_init(Object *obj) +{ + VirtIOBalloon *s = VIRTIO_BALLOON(obj); + + qemu_mutex_init(&s->free_page_lock); + qemu_cond_init(&s->free_page_cond); + s->free_page_hint_cmd_id = VIRTIO_BALLOON_FREE_PAGE_HINT_CMD_ID_MIN; + s->free_page_hint_notify.notify = virtio_balloon_free_page_hint_notify; + + object_property_add(obj, "guest-stats", "guest statistics", + balloon_stats_get_all, NULL, NULL, NULL); + + object_property_add(obj, "guest-stats-polling-interval", "int", + balloon_stats_get_poll_interval, + balloon_stats_set_poll_interval, + NULL, NULL); +} + +static const VMStateDescription vmstate_virtio_balloon = { + .name = "virtio-balloon", + .minimum_version_id = 1, + .version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_VIRTIO_DEVICE, + VMSTATE_END_OF_LIST() + }, +}; + +static Property virtio_balloon_properties[] = { + DEFINE_PROP_BIT("deflate-on-oom", VirtIOBalloon, host_features, + VIRTIO_BALLOON_F_DEFLATE_ON_OOM, false), + DEFINE_PROP_BIT("free-page-hint", VirtIOBalloon, host_features, + VIRTIO_BALLOON_F_FREE_PAGE_HINT, false), + DEFINE_PROP_BIT("page-poison", VirtIOBalloon, host_features, + VIRTIO_BALLOON_F_PAGE_POISON, true), + DEFINE_PROP_BIT("free-page-reporting", VirtIOBalloon, host_features, + VIRTIO_BALLOON_F_REPORTING, false), + /* QEMU 4.0 accidentally changed the config size even when free-page-hint + * is disabled, resulting in QEMU 3.1 migration incompatibility. This + * property retains this quirk for QEMU 4.1 machine types. + */ + DEFINE_PROP_BOOL("qemu-4-0-config-size", VirtIOBalloon, + qemu_4_0_config_size, false), + DEFINE_PROP_LINK("iothread", VirtIOBalloon, iothread, TYPE_IOTHREAD, + IOThread *), + DEFINE_PROP_END_OF_LIST(), +}; + +static void virtio_balloon_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + + device_class_set_props(dc, virtio_balloon_properties); + dc->vmsd = &vmstate_virtio_balloon; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + vdc->realize = virtio_balloon_device_realize; + vdc->unrealize = virtio_balloon_device_unrealize; + vdc->reset = virtio_balloon_device_reset; + vdc->get_config = virtio_balloon_get_config; + vdc->set_config = virtio_balloon_set_config; + vdc->get_features = virtio_balloon_get_features; + vdc->set_status = virtio_balloon_set_status; + vdc->vmsd = &vmstate_virtio_balloon_device; +} + +static const TypeInfo virtio_balloon_info = { + .name = TYPE_VIRTIO_BALLOON, + .parent = TYPE_VIRTIO_DEVICE, + .instance_size = sizeof(VirtIOBalloon), + .instance_init = virtio_balloon_instance_init, + .class_init = virtio_balloon_class_init, +}; + +static void virtio_register_types(void) +{ + type_register_static(&virtio_balloon_info); +} + +type_init(virtio_register_types) diff --git a/hw/virtio/virtio-blk-pci.c b/hw/virtio/virtio-blk-pci.c new file mode 100644 index 00000000..9743bee9 --- /dev/null +++ b/hw/virtio/virtio-blk-pci.c @@ -0,0 +1,107 @@ +/* + * Virtio blk PCI Bindings + * + * Copyright IBM, Corp. 2007 + * Copyright (c) 2009 CodeSourcery + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * Paul Brook <paul@codesourcery.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include "qemu/osdep.h" + +#include "hw/qdev-properties.h" +#include "hw/virtio/virtio-blk.h" +#include "hw/virtio/virtio-pci.h" +#include "qapi/error.h" +#include "qemu/module.h" +#include "qom/object.h" + +typedef struct VirtIOBlkPCI VirtIOBlkPCI; + +/* + * virtio-blk-pci: This extends VirtioPCIProxy. + */ +#define TYPE_VIRTIO_BLK_PCI "virtio-blk-pci-base" +DECLARE_INSTANCE_CHECKER(VirtIOBlkPCI, VIRTIO_BLK_PCI, + TYPE_VIRTIO_BLK_PCI) + +struct VirtIOBlkPCI { + VirtIOPCIProxy parent_obj; + VirtIOBlock vdev; +}; + +static Property virtio_blk_pci_properties[] = { + DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0), + DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true), + DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, + DEV_NVECTORS_UNSPECIFIED), + DEFINE_PROP_END_OF_LIST(), +}; + +static void virtio_blk_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VirtIOBlkPCI *dev = VIRTIO_BLK_PCI(vpci_dev); + DeviceState *vdev = DEVICE(&dev->vdev); + VirtIOBlkConf *conf = &dev->vdev.conf; + + if (conf->num_queues == VIRTIO_BLK_AUTO_NUM_QUEUES) { + conf->num_queues = virtio_pci_optimal_num_queues(0); + } + + if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) { + vpci_dev->nvectors = conf->num_queues + 1; + } + + qdev_realize(vdev, BUS(&vpci_dev->bus), errp); +} + +static void virtio_blk_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + + set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); + device_class_set_props(dc, virtio_blk_pci_properties); + k->realize = virtio_blk_pci_realize; + pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; + pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_BLOCK; + pcidev_k->revision = VIRTIO_PCI_ABI_VERSION; + pcidev_k->class_id = PCI_CLASS_STORAGE_SCSI; +} + +static void virtio_blk_pci_instance_init(Object *obj) +{ + VirtIOBlkPCI *dev = VIRTIO_BLK_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VIRTIO_BLK); + object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev), + "bootindex"); +} + +static const VirtioPCIDeviceTypeInfo virtio_blk_pci_info = { + .base_name = TYPE_VIRTIO_BLK_PCI, + .generic_name = "virtio-blk-pci", + .transitional_name = "virtio-blk-pci-transitional", + .non_transitional_name = "virtio-blk-pci-non-transitional", + .instance_size = sizeof(VirtIOBlkPCI), + .instance_init = virtio_blk_pci_instance_init, + .class_init = virtio_blk_pci_class_init, +}; + +static void virtio_blk_pci_register(void) +{ + virtio_pci_types_register(&virtio_blk_pci_info); +} + +type_init(virtio_blk_pci_register) diff --git a/hw/virtio/virtio-bus.c b/hw/virtio/virtio-bus.c new file mode 100644 index 00000000..896feb37 --- /dev/null +++ b/hw/virtio/virtio-bus.c @@ -0,0 +1,372 @@ +/* + * VirtioBus + * + * Copyright (C) 2012 : GreenSocs Ltd + * http://www.greensocs.com/ , email: info@greensocs.com + * + * Developed by : + * Frederic Konrad <fred.konrad@greensocs.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. + * + */ + +#include "qemu/osdep.h" +#include "qemu/error-report.h" +#include "qemu/module.h" +#include "qapi/error.h" +#include "hw/virtio/virtio-bus.h" +#include "hw/virtio/virtio.h" +#include "exec/address-spaces.h" + +/* #define DEBUG_VIRTIO_BUS */ + +#ifdef DEBUG_VIRTIO_BUS +#define DPRINTF(fmt, ...) \ +do { printf("virtio_bus: " fmt , ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) do { } while (0) +#endif + +/* A VirtIODevice is being plugged */ +void virtio_bus_device_plugged(VirtIODevice *vdev, Error **errp) +{ + DeviceState *qdev = DEVICE(vdev); + BusState *qbus = BUS(qdev_get_parent_bus(qdev)); + VirtioBusState *bus = VIRTIO_BUS(qbus); + VirtioBusClass *klass = VIRTIO_BUS_GET_CLASS(bus); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev); + bool has_iommu = virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); + bool vdev_has_iommu; + Error *local_err = NULL; + + DPRINTF("%s: plug device.\n", qbus->name); + + if (klass->pre_plugged != NULL) { + klass->pre_plugged(qbus->parent, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + } + + /* Get the features of the plugged device. */ + assert(vdc->get_features != NULL); + vdev->host_features = vdc->get_features(vdev, vdev->host_features, + &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + + if (klass->device_plugged != NULL) { + klass->device_plugged(qbus->parent, &local_err); + } + if (local_err) { + error_propagate(errp, local_err); + return; + } + + vdev->dma_as = &address_space_memory; + if (has_iommu) { + vdev_has_iommu = virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); + /* + * Present IOMMU_PLATFORM to the driver iff iommu_plattform=on and + * device operational. If the driver does not accept IOMMU_PLATFORM + * we fail the device. + */ + virtio_add_feature(&vdev->host_features, VIRTIO_F_IOMMU_PLATFORM); + if (klass->get_dma_as) { + vdev->dma_as = klass->get_dma_as(qbus->parent); + if (!vdev_has_iommu && vdev->dma_as != &address_space_memory) { + error_setg(errp, + "iommu_platform=true is not supported by the device"); + return; + } + } + } +} + +/* Reset the virtio_bus */ +void virtio_bus_reset(VirtioBusState *bus) +{ + VirtIODevice *vdev = virtio_bus_get_device(bus); + + DPRINTF("%s: reset device.\n", BUS(bus)->name); + virtio_bus_stop_ioeventfd(bus); + if (vdev != NULL) { + virtio_reset(vdev); + } +} + +/* A VirtIODevice is being unplugged */ +void virtio_bus_device_unplugged(VirtIODevice *vdev) +{ + DeviceState *qdev = DEVICE(vdev); + BusState *qbus = BUS(qdev_get_parent_bus(qdev)); + VirtioBusClass *klass = VIRTIO_BUS_GET_CLASS(qbus); + + DPRINTF("%s: remove device.\n", qbus->name); + + if (vdev != NULL) { + if (klass->device_unplugged != NULL) { + klass->device_unplugged(qbus->parent); + } + } +} + +/* Get the device id of the plugged device. */ +uint16_t virtio_bus_get_vdev_id(VirtioBusState *bus) +{ + VirtIODevice *vdev = virtio_bus_get_device(bus); + assert(vdev != NULL); + return vdev->device_id; +} + +/* Get the config_len field of the plugged device. */ +size_t virtio_bus_get_vdev_config_len(VirtioBusState *bus) +{ + VirtIODevice *vdev = virtio_bus_get_device(bus); + assert(vdev != NULL); + return vdev->config_len; +} + +/* Get bad features of the plugged device. */ +uint32_t virtio_bus_get_vdev_bad_features(VirtioBusState *bus) +{ + VirtIODevice *vdev = virtio_bus_get_device(bus); + VirtioDeviceClass *k; + + assert(vdev != NULL); + k = VIRTIO_DEVICE_GET_CLASS(vdev); + if (k->bad_features != NULL) { + return k->bad_features(vdev); + } else { + return 0; + } +} + +/* Get config of the plugged device. */ +void virtio_bus_get_vdev_config(VirtioBusState *bus, uint8_t *config) +{ + VirtIODevice *vdev = virtio_bus_get_device(bus); + VirtioDeviceClass *k; + + assert(vdev != NULL); + k = VIRTIO_DEVICE_GET_CLASS(vdev); + if (k->get_config != NULL) { + k->get_config(vdev, config); + } +} + +/* Set config of the plugged device. */ +void virtio_bus_set_vdev_config(VirtioBusState *bus, uint8_t *config) +{ + VirtIODevice *vdev = virtio_bus_get_device(bus); + VirtioDeviceClass *k; + + assert(vdev != NULL); + k = VIRTIO_DEVICE_GET_CLASS(vdev); + if (k->set_config != NULL) { + k->set_config(vdev, config); + } +} + +/* On success, ioeventfd ownership belongs to the caller. */ +int virtio_bus_grab_ioeventfd(VirtioBusState *bus) +{ + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(bus); + + /* vhost can be used even if ioeventfd=off in the proxy device, + * so do not check k->ioeventfd_enabled. + */ + if (!k->ioeventfd_assign) { + return -ENOSYS; + } + + if (bus->ioeventfd_grabbed == 0 && bus->ioeventfd_started) { + virtio_bus_stop_ioeventfd(bus); + /* Remember that we need to restart ioeventfd + * when ioeventfd_grabbed becomes zero. + */ + bus->ioeventfd_started = true; + } + bus->ioeventfd_grabbed++; + return 0; +} + +void virtio_bus_release_ioeventfd(VirtioBusState *bus) +{ + assert(bus->ioeventfd_grabbed != 0); + if (--bus->ioeventfd_grabbed == 0 && bus->ioeventfd_started) { + /* Force virtio_bus_start_ioeventfd to act. */ + bus->ioeventfd_started = false; + virtio_bus_start_ioeventfd(bus); + } +} + +int virtio_bus_start_ioeventfd(VirtioBusState *bus) +{ + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(bus); + DeviceState *proxy = DEVICE(BUS(bus)->parent); + VirtIODevice *vdev = virtio_bus_get_device(bus); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev); + int r; + + if (!k->ioeventfd_assign || !k->ioeventfd_enabled(proxy)) { + return -ENOSYS; + } + if (bus->ioeventfd_started) { + return 0; + } + + /* Only set our notifier if we have ownership. */ + if (!bus->ioeventfd_grabbed) { + r = vdc->start_ioeventfd(vdev); + if (r < 0) { + error_report("%s: failed. Fallback to userspace (slower).", __func__); + return r; + } + } + bus->ioeventfd_started = true; + return 0; +} + +void virtio_bus_stop_ioeventfd(VirtioBusState *bus) +{ + VirtIODevice *vdev; + VirtioDeviceClass *vdc; + + if (!bus->ioeventfd_started) { + return; + } + + /* Only remove our notifier if we have ownership. */ + if (!bus->ioeventfd_grabbed) { + vdev = virtio_bus_get_device(bus); + vdc = VIRTIO_DEVICE_GET_CLASS(vdev); + vdc->stop_ioeventfd(vdev); + } + bus->ioeventfd_started = false; +} + +bool virtio_bus_ioeventfd_enabled(VirtioBusState *bus) +{ + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(bus); + DeviceState *proxy = DEVICE(BUS(bus)->parent); + + return k->ioeventfd_assign && k->ioeventfd_enabled(proxy); +} + +/* + * This function switches ioeventfd on/off in the device. + * The caller must set or clear the handlers for the EventNotifier. + */ +int virtio_bus_set_host_notifier(VirtioBusState *bus, int n, bool assign) +{ + VirtIODevice *vdev = virtio_bus_get_device(bus); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(bus); + DeviceState *proxy = DEVICE(BUS(bus)->parent); + VirtQueue *vq = virtio_get_queue(vdev, n); + EventNotifier *notifier = virtio_queue_get_host_notifier(vq); + int r = 0; + + if (!k->ioeventfd_assign) { + return -ENOSYS; + } + + if (assign) { + r = event_notifier_init(notifier, 1); + if (r < 0) { + error_report("%s: unable to init event notifier: %s (%d)", + __func__, strerror(-r), r); + return r; + } + r = k->ioeventfd_assign(proxy, notifier, n, true); + if (r < 0) { + error_report("%s: unable to assign ioeventfd: %d", __func__, r); + virtio_bus_cleanup_host_notifier(bus, n); + } + } else { + k->ioeventfd_assign(proxy, notifier, n, false); + } + + if (r == 0) { + virtio_queue_set_host_notifier_enabled(vq, assign); + } + + return r; +} + +void virtio_bus_cleanup_host_notifier(VirtioBusState *bus, int n) +{ + VirtIODevice *vdev = virtio_bus_get_device(bus); + VirtQueue *vq = virtio_get_queue(vdev, n); + EventNotifier *notifier = virtio_queue_get_host_notifier(vq); + + /* Test and clear notifier after disabling event, + * in case poll callback didn't have time to run. + */ + virtio_queue_host_notifier_read(notifier); + event_notifier_cleanup(notifier); +} + +static char *virtio_bus_get_dev_path(DeviceState *dev) +{ + BusState *bus = qdev_get_parent_bus(dev); + DeviceState *proxy = DEVICE(bus->parent); + return qdev_get_dev_path(proxy); +} + +static char *virtio_bus_get_fw_dev_path(DeviceState *dev) +{ + return NULL; +} + +bool virtio_bus_device_iommu_enabled(VirtIODevice *vdev) +{ + DeviceState *qdev = DEVICE(vdev); + BusState *qbus = BUS(qdev_get_parent_bus(qdev)); + VirtioBusState *bus = VIRTIO_BUS(qbus); + VirtioBusClass *klass = VIRTIO_BUS_GET_CLASS(bus); + + if (!klass->iommu_enabled) { + return false; + } + + return klass->iommu_enabled(qbus->parent); +} + +static void virtio_bus_class_init(ObjectClass *klass, void *data) +{ + BusClass *bus_class = BUS_CLASS(klass); + bus_class->get_dev_path = virtio_bus_get_dev_path; + bus_class->get_fw_dev_path = virtio_bus_get_fw_dev_path; +} + +static const TypeInfo virtio_bus_info = { + .name = TYPE_VIRTIO_BUS, + .parent = TYPE_BUS, + .instance_size = sizeof(VirtioBusState), + .abstract = true, + .class_size = sizeof(VirtioBusClass), + .class_init = virtio_bus_class_init +}; + +static void virtio_register_types(void) +{ + type_register_static(&virtio_bus_info); +} + +type_init(virtio_register_types) diff --git a/hw/virtio/virtio-crypto-pci.c b/hw/virtio/virtio-crypto-pci.c new file mode 100644 index 00000000..0783dc2f --- /dev/null +++ b/hw/virtio/virtio-crypto-pci.c @@ -0,0 +1,94 @@ +/* + * Virtio crypto device + * + * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD. + * + * Authors: + * Gonglei <arei.gonglei@huawei.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. See the COPYING file in the + * top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "hw/pci/pci.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/virtio.h" +#include "hw/virtio/virtio-bus.h" +#include "hw/virtio/virtio-pci.h" +#include "hw/virtio/virtio-crypto.h" +#include "qapi/error.h" +#include "qemu/module.h" +#include "qom/object.h" + +typedef struct VirtIOCryptoPCI VirtIOCryptoPCI; + +/* + * virtio-crypto-pci: This extends VirtioPCIProxy. + */ +#define TYPE_VIRTIO_CRYPTO_PCI "virtio-crypto-pci" +DECLARE_INSTANCE_CHECKER(VirtIOCryptoPCI, VIRTIO_CRYPTO_PCI, + TYPE_VIRTIO_CRYPTO_PCI) + +struct VirtIOCryptoPCI { + VirtIOPCIProxy parent_obj; + VirtIOCrypto vdev; +}; + +static Property virtio_crypto_pci_properties[] = { + DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true), + DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 2), + DEFINE_PROP_END_OF_LIST(), +}; + +static void virtio_crypto_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VirtIOCryptoPCI *vcrypto = VIRTIO_CRYPTO_PCI(vpci_dev); + DeviceState *vdev = DEVICE(&vcrypto->vdev); + + if (vcrypto->vdev.conf.cryptodev == NULL) { + error_setg(errp, "'cryptodev' parameter expects a valid object"); + return; + } + + virtio_pci_force_virtio_1(vpci_dev); + if (!qdev_realize(vdev, BUS(&vpci_dev->bus), errp)) { + return; + } +} + +static void virtio_crypto_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + + k->realize = virtio_crypto_pci_realize; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + device_class_set_props(dc, virtio_crypto_pci_properties); + pcidev_k->class_id = PCI_CLASS_OTHERS; +} + +static void virtio_crypto_initfn(Object *obj) +{ + VirtIOCryptoPCI *dev = VIRTIO_CRYPTO_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VIRTIO_CRYPTO); +} + +static const VirtioPCIDeviceTypeInfo virtio_crypto_pci_info = { + .generic_name = TYPE_VIRTIO_CRYPTO_PCI, + .instance_size = sizeof(VirtIOCryptoPCI), + .instance_init = virtio_crypto_initfn, + .class_init = virtio_crypto_pci_class_init, +}; + +static void virtio_crypto_pci_register_types(void) +{ + virtio_pci_types_register(&virtio_crypto_pci_info); +} +type_init(virtio_crypto_pci_register_types) diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c new file mode 100644 index 00000000..97da74e7 --- /dev/null +++ b/hw/virtio/virtio-crypto.c @@ -0,0 +1,1250 @@ +/* + * Virtio crypto Support + * + * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD. + * + * Authors: + * Gonglei <arei.gonglei@huawei.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. See the COPYING file in the + * top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/iov.h" +#include "qemu/main-loop.h" +#include "qemu/module.h" +#include "qapi/error.h" +#include "qemu/error-report.h" + +#include "hw/virtio/virtio.h" +#include "hw/virtio/virtio-crypto.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/virtio-access.h" +#include "standard-headers/linux/virtio_ids.h" +#include "sysemu/cryptodev-vhost.h" + +#define VIRTIO_CRYPTO_VM_VERSION 1 + +typedef struct VirtIOCryptoSessionReq { + VirtIODevice *vdev; + VirtQueue *vq; + VirtQueueElement *elem; + CryptoDevBackendSessionInfo info; + CryptoDevCompletionFunc cb; +} VirtIOCryptoSessionReq; + +static void virtio_crypto_free_create_session_req(VirtIOCryptoSessionReq *sreq) +{ + switch (sreq->info.op_code) { + case VIRTIO_CRYPTO_CIPHER_CREATE_SESSION: + g_free(sreq->info.u.sym_sess_info.cipher_key); + g_free(sreq->info.u.sym_sess_info.auth_key); + break; + + case VIRTIO_CRYPTO_AKCIPHER_CREATE_SESSION: + g_free(sreq->info.u.asym_sess_info.key); + break; + + case VIRTIO_CRYPTO_CIPHER_DESTROY_SESSION: + case VIRTIO_CRYPTO_HASH_DESTROY_SESSION: + case VIRTIO_CRYPTO_MAC_DESTROY_SESSION: + case VIRTIO_CRYPTO_AEAD_DESTROY_SESSION: + case VIRTIO_CRYPTO_AKCIPHER_DESTROY_SESSION: + break; + + default: + error_report("Unknown opcode: %u", sreq->info.op_code); + } + g_free(sreq); +} + +/* + * Transfer virtqueue index to crypto queue index. + * The control virtqueue is after the data virtqueues + * so the input value doesn't need to be adjusted + */ +static inline int virtio_crypto_vq2q(int queue_index) +{ + return queue_index; +} + +static int +virtio_crypto_cipher_session_helper(VirtIODevice *vdev, + CryptoDevBackendSymSessionInfo *info, + struct virtio_crypto_cipher_session_para *cipher_para, + struct iovec **iov, unsigned int *out_num) +{ + VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev); + unsigned int num = *out_num; + + info->cipher_alg = ldl_le_p(&cipher_para->algo); + info->key_len = ldl_le_p(&cipher_para->keylen); + info->direction = ldl_le_p(&cipher_para->op); + DPRINTF("cipher_alg=%" PRIu32 ", info->direction=%" PRIu32 "\n", + info->cipher_alg, info->direction); + + if (info->key_len > vcrypto->conf.max_cipher_key_len) { + error_report("virtio-crypto length of cipher key is too big: %u", + info->key_len); + return -VIRTIO_CRYPTO_ERR; + } + /* Get cipher key */ + if (info->key_len > 0) { + size_t s; + DPRINTF("keylen=%" PRIu32 "\n", info->key_len); + + info->cipher_key = g_malloc(info->key_len); + s = iov_to_buf(*iov, num, 0, info->cipher_key, info->key_len); + if (unlikely(s != info->key_len)) { + virtio_error(vdev, "virtio-crypto cipher key incorrect"); + return -EFAULT; + } + iov_discard_front(iov, &num, info->key_len); + *out_num = num; + } + + return 0; +} + +static int +virtio_crypto_create_sym_session(VirtIOCrypto *vcrypto, + struct virtio_crypto_sym_create_session_req *sess_req, + uint32_t queue_id, + uint32_t opcode, + struct iovec *iov, unsigned int out_num, + VirtIOCryptoSessionReq *sreq) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(vcrypto); + CryptoDevBackendSymSessionInfo *sym_info = &sreq->info.u.sym_sess_info; + int queue_index; + uint32_t op_type; + int ret; + + op_type = ldl_le_p(&sess_req->op_type); + sreq->info.op_code = opcode; + + sym_info = &sreq->info.u.sym_sess_info; + sym_info->op_type = op_type; + + if (op_type == VIRTIO_CRYPTO_SYM_OP_CIPHER) { + ret = virtio_crypto_cipher_session_helper(vdev, sym_info, + &sess_req->u.cipher.para, + &iov, &out_num); + if (ret < 0) { + return ret; + } + } else if (op_type == VIRTIO_CRYPTO_SYM_OP_ALGORITHM_CHAINING) { + size_t s; + /* cipher part */ + ret = virtio_crypto_cipher_session_helper(vdev, sym_info, + &sess_req->u.chain.para.cipher_param, + &iov, &out_num); + if (ret < 0) { + return ret; + } + /* hash part */ + sym_info->alg_chain_order = ldl_le_p( + &sess_req->u.chain.para.alg_chain_order); + sym_info->add_len = ldl_le_p(&sess_req->u.chain.para.aad_len); + sym_info->hash_mode = ldl_le_p(&sess_req->u.chain.para.hash_mode); + if (sym_info->hash_mode == VIRTIO_CRYPTO_SYM_HASH_MODE_AUTH) { + sym_info->hash_alg = + ldl_le_p(&sess_req->u.chain.para.u.mac_param.algo); + sym_info->auth_key_len = ldl_le_p( + &sess_req->u.chain.para.u.mac_param.auth_key_len); + sym_info->hash_result_len = ldl_le_p( + &sess_req->u.chain.para.u.mac_param.hash_result_len); + if (sym_info->auth_key_len > vcrypto->conf.max_auth_key_len) { + error_report("virtio-crypto length of auth key is too big: %u", + sym_info->auth_key_len); + return -VIRTIO_CRYPTO_ERR; + } + /* get auth key */ + if (sym_info->auth_key_len > 0) { + sym_info->auth_key = g_malloc(sym_info->auth_key_len); + s = iov_to_buf(iov, out_num, 0, sym_info->auth_key, + sym_info->auth_key_len); + if (unlikely(s != sym_info->auth_key_len)) { + virtio_error(vdev, + "virtio-crypto authenticated key incorrect"); + return -EFAULT; + } + iov_discard_front(&iov, &out_num, sym_info->auth_key_len); + } + } else if (sym_info->hash_mode == VIRTIO_CRYPTO_SYM_HASH_MODE_PLAIN) { + sym_info->hash_alg = ldl_le_p( + &sess_req->u.chain.para.u.hash_param.algo); + sym_info->hash_result_len = ldl_le_p( + &sess_req->u.chain.para.u.hash_param.hash_result_len); + } else { + /* VIRTIO_CRYPTO_SYM_HASH_MODE_NESTED */ + error_report("unsupported hash mode"); + return -VIRTIO_CRYPTO_NOTSUPP; + } + } else { + /* VIRTIO_CRYPTO_SYM_OP_NONE */ + error_report("unsupported cipher op_type: VIRTIO_CRYPTO_SYM_OP_NONE"); + return -VIRTIO_CRYPTO_NOTSUPP; + } + + queue_index = virtio_crypto_vq2q(queue_id); + return cryptodev_backend_create_session(vcrypto->cryptodev, &sreq->info, + queue_index, sreq->cb, sreq); +} + +static int +virtio_crypto_create_asym_session(VirtIOCrypto *vcrypto, + struct virtio_crypto_akcipher_create_session_req *sess_req, + uint32_t queue_id, uint32_t opcode, + struct iovec *iov, unsigned int out_num, + VirtIOCryptoSessionReq *sreq) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(vcrypto); + CryptoDevBackendAsymSessionInfo *asym_info = &sreq->info.u.asym_sess_info; + int queue_index; + uint32_t algo, keytype, keylen; + + algo = ldl_le_p(&sess_req->para.algo); + keytype = ldl_le_p(&sess_req->para.keytype); + keylen = ldl_le_p(&sess_req->para.keylen); + + if ((keytype != VIRTIO_CRYPTO_AKCIPHER_KEY_TYPE_PUBLIC) + && (keytype != VIRTIO_CRYPTO_AKCIPHER_KEY_TYPE_PRIVATE)) { + error_report("unsupported asym keytype: %d", keytype); + return -VIRTIO_CRYPTO_NOTSUPP; + } + + if (keylen) { + asym_info->key = g_malloc(keylen); + if (iov_to_buf(iov, out_num, 0, asym_info->key, keylen) != keylen) { + virtio_error(vdev, "virtio-crypto asym key incorrect"); + return -EFAULT; + } + iov_discard_front(&iov, &out_num, keylen); + } + + sreq->info.op_code = opcode; + asym_info = &sreq->info.u.asym_sess_info; + asym_info->algo = algo; + asym_info->keytype = keytype; + asym_info->keylen = keylen; + switch (asym_info->algo) { + case VIRTIO_CRYPTO_AKCIPHER_RSA: + asym_info->u.rsa.padding_algo = + ldl_le_p(&sess_req->para.u.rsa.padding_algo); + asym_info->u.rsa.hash_algo = + ldl_le_p(&sess_req->para.u.rsa.hash_algo); + break; + + /* TODO DSA&ECDSA handling */ + + default: + return -VIRTIO_CRYPTO_ERR; + } + + queue_index = virtio_crypto_vq2q(queue_id); + return cryptodev_backend_create_session(vcrypto->cryptodev, &sreq->info, + queue_index, sreq->cb, sreq); +} + +static int +virtio_crypto_handle_close_session(VirtIOCrypto *vcrypto, + struct virtio_crypto_destroy_session_req *close_sess_req, + uint32_t queue_id, + VirtIOCryptoSessionReq *sreq) +{ + uint64_t session_id; + + session_id = ldq_le_p(&close_sess_req->session_id); + DPRINTF("close session, id=%" PRIu64 "\n", session_id); + + return cryptodev_backend_close_session( + vcrypto->cryptodev, session_id, queue_id, sreq->cb, sreq); +} + +static void virtio_crypto_create_session_completion(void *opaque, int ret) +{ + VirtIOCryptoSessionReq *sreq = (VirtIOCryptoSessionReq *)opaque; + VirtQueue *vq = sreq->vq; + VirtQueueElement *elem = sreq->elem; + VirtIODevice *vdev = sreq->vdev; + struct virtio_crypto_session_input input; + struct iovec *in_iov = elem->in_sg; + unsigned in_num = elem->in_num; + size_t s; + + memset(&input, 0, sizeof(input)); + /* Serious errors, need to reset virtio crypto device */ + if (ret == -EFAULT) { + virtqueue_detach_element(vq, elem, 0); + goto out; + } else if (ret == -VIRTIO_CRYPTO_NOTSUPP) { + stl_le_p(&input.status, VIRTIO_CRYPTO_NOTSUPP); + } else if (ret == -VIRTIO_CRYPTO_KEY_REJECTED) { + stl_le_p(&input.status, VIRTIO_CRYPTO_KEY_REJECTED); + } else if (ret != VIRTIO_CRYPTO_OK) { + stl_le_p(&input.status, VIRTIO_CRYPTO_ERR); + } else { + /* Set the session id */ + stq_le_p(&input.session_id, sreq->info.session_id); + stl_le_p(&input.status, VIRTIO_CRYPTO_OK); + } + + s = iov_from_buf(in_iov, in_num, 0, &input, sizeof(input)); + if (unlikely(s != sizeof(input))) { + virtio_error(vdev, "virtio-crypto input incorrect"); + virtqueue_detach_element(vq, elem, 0); + goto out; + } + virtqueue_push(vq, elem, sizeof(input)); + virtio_notify(vdev, vq); + +out: + g_free(elem); + virtio_crypto_free_create_session_req(sreq); +} + +static void virtio_crypto_destroy_session_completion(void *opaque, int ret) +{ + VirtIOCryptoSessionReq *sreq = (VirtIOCryptoSessionReq *)opaque; + VirtQueue *vq = sreq->vq; + VirtQueueElement *elem = sreq->elem; + VirtIODevice *vdev = sreq->vdev; + struct iovec *in_iov = elem->in_sg; + unsigned in_num = elem->in_num; + uint8_t status; + size_t s; + + if (ret < 0) { + status = VIRTIO_CRYPTO_ERR; + } else { + status = VIRTIO_CRYPTO_OK; + } + s = iov_from_buf(in_iov, in_num, 0, &status, sizeof(status)); + if (unlikely(s != sizeof(status))) { + virtio_error(vdev, "virtio-crypto status incorrect"); + virtqueue_detach_element(vq, elem, 0); + goto out; + } + virtqueue_push(vq, elem, sizeof(status)); + virtio_notify(vdev, vq); + +out: + g_free(elem); + g_free(sreq); +} + +static void virtio_crypto_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev); + struct virtio_crypto_op_ctrl_req ctrl; + VirtQueueElement *elem; + VirtIOCryptoSessionReq *sreq; + unsigned out_num; + unsigned in_num; + uint32_t queue_id; + uint32_t opcode; + struct virtio_crypto_session_input input; + size_t s; + int ret; + struct iovec *out_iov; + struct iovec *in_iov; + + for (;;) { + g_autofree struct iovec *out_iov_copy = NULL; + + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); + if (!elem) { + break; + } + if (elem->out_num < 1 || elem->in_num < 1) { + virtio_error(vdev, "virtio-crypto ctrl missing headers"); + virtqueue_detach_element(vq, elem, 0); + g_free(elem); + break; + } + + out_num = elem->out_num; + out_iov_copy = g_memdup2(elem->out_sg, sizeof(out_iov[0]) * out_num); + out_iov = out_iov_copy; + + in_num = elem->in_num; + in_iov = elem->in_sg; + + if (unlikely(iov_to_buf(out_iov, out_num, 0, &ctrl, sizeof(ctrl)) + != sizeof(ctrl))) { + virtio_error(vdev, "virtio-crypto request ctrl_hdr too short"); + virtqueue_detach_element(vq, elem, 0); + g_free(elem); + break; + } + iov_discard_front(&out_iov, &out_num, sizeof(ctrl)); + + opcode = ldl_le_p(&ctrl.header.opcode); + queue_id = ldl_le_p(&ctrl.header.queue_id); + + sreq = g_new0(VirtIOCryptoSessionReq, 1); + sreq->vdev = vdev; + sreq->vq = vq; + sreq->elem = elem; + + switch (opcode) { + case VIRTIO_CRYPTO_CIPHER_CREATE_SESSION: + sreq->cb = virtio_crypto_create_session_completion; + ret = virtio_crypto_create_sym_session(vcrypto, + &ctrl.u.sym_create_session, + queue_id, opcode, + out_iov, out_num, + sreq); + if (ret < 0) { + virtio_crypto_create_session_completion(sreq, ret); + } + break; + + case VIRTIO_CRYPTO_AKCIPHER_CREATE_SESSION: + sreq->cb = virtio_crypto_create_session_completion; + ret = virtio_crypto_create_asym_session(vcrypto, + &ctrl.u.akcipher_create_session, + queue_id, opcode, + out_iov, out_num, + sreq); + if (ret < 0) { + virtio_crypto_create_session_completion(sreq, ret); + } + break; + + case VIRTIO_CRYPTO_CIPHER_DESTROY_SESSION: + case VIRTIO_CRYPTO_HASH_DESTROY_SESSION: + case VIRTIO_CRYPTO_MAC_DESTROY_SESSION: + case VIRTIO_CRYPTO_AEAD_DESTROY_SESSION: + case VIRTIO_CRYPTO_AKCIPHER_DESTROY_SESSION: + sreq->cb = virtio_crypto_destroy_session_completion; + ret = virtio_crypto_handle_close_session(vcrypto, + &ctrl.u.destroy_session, queue_id, + sreq); + if (ret < 0) { + virtio_crypto_destroy_session_completion(sreq, ret); + } + break; + + case VIRTIO_CRYPTO_HASH_CREATE_SESSION: + case VIRTIO_CRYPTO_MAC_CREATE_SESSION: + case VIRTIO_CRYPTO_AEAD_CREATE_SESSION: + default: + memset(&input, 0, sizeof(input)); + error_report("virtio-crypto unsupported ctrl opcode: %d", opcode); + stl_le_p(&input.status, VIRTIO_CRYPTO_NOTSUPP); + s = iov_from_buf(in_iov, in_num, 0, &input, sizeof(input)); + if (unlikely(s != sizeof(input))) { + virtio_error(vdev, "virtio-crypto input incorrect"); + virtqueue_detach_element(vq, elem, 0); + } else { + virtqueue_push(vq, elem, sizeof(input)); + virtio_notify(vdev, vq); + } + g_free(sreq); + g_free(elem); + + break; + } /* end switch case */ + + } /* end for loop */ +} + +static void virtio_crypto_init_request(VirtIOCrypto *vcrypto, VirtQueue *vq, + VirtIOCryptoReq *req) +{ + req->vcrypto = vcrypto; + req->vq = vq; + req->in = NULL; + req->in_iov = NULL; + req->in_num = 0; + req->in_len = 0; + req->flags = CRYPTODEV_BACKEND_ALG__MAX; + memset(&req->op_info, 0x00, sizeof(req->op_info)); +} + +static void virtio_crypto_free_request(VirtIOCryptoReq *req) +{ + if (!req) { + return; + } + + if (req->flags == CRYPTODEV_BACKEND_ALG_SYM) { + size_t max_len; + CryptoDevBackendSymOpInfo *op_info = req->op_info.u.sym_op_info; + + max_len = op_info->iv_len + + op_info->aad_len + + op_info->src_len + + op_info->dst_len + + op_info->digest_result_len; + + /* Zeroize and free request data structure */ + memset(op_info, 0, sizeof(*op_info) + max_len); + g_free(op_info); + } else if (req->flags == CRYPTODEV_BACKEND_ALG_ASYM) { + CryptoDevBackendAsymOpInfo *op_info = req->op_info.u.asym_op_info; + if (op_info) { + g_free(op_info->src); + g_free(op_info->dst); + memset(op_info, 0, sizeof(*op_info)); + g_free(op_info); + } + } + + g_free(req->in_iov); + g_free(req); +} + +static void +virtio_crypto_sym_input_data_helper(VirtIODevice *vdev, + VirtIOCryptoReq *req, + uint32_t status, + CryptoDevBackendSymOpInfo *sym_op_info) +{ + size_t s, len; + struct iovec *in_iov = req->in_iov; + + if (status != VIRTIO_CRYPTO_OK) { + return; + } + + len = sym_op_info->src_len; + /* Save the cipher result */ + s = iov_from_buf(in_iov, req->in_num, 0, sym_op_info->dst, len); + if (s != len) { + virtio_error(vdev, "virtio-crypto dest data incorrect"); + return; + } + + iov_discard_front(&in_iov, &req->in_num, len); + + if (sym_op_info->op_type == + VIRTIO_CRYPTO_SYM_OP_ALGORITHM_CHAINING) { + /* Save the digest result */ + s = iov_from_buf(in_iov, req->in_num, 0, + sym_op_info->digest_result, + sym_op_info->digest_result_len); + if (s != sym_op_info->digest_result_len) { + virtio_error(vdev, "virtio-crypto digest result incorrect"); + } + } +} + +static void +virtio_crypto_akcipher_input_data_helper(VirtIODevice *vdev, + VirtIOCryptoReq *req, int32_t status, + CryptoDevBackendAsymOpInfo *asym_op_info) +{ + size_t s, len; + struct iovec *in_iov = req->in_iov; + + if (status != VIRTIO_CRYPTO_OK) { + return; + } + + len = asym_op_info->dst_len; + if (!len) { + return; + } + + s = iov_from_buf(in_iov, req->in_num, 0, asym_op_info->dst, len); + if (s != len) { + virtio_error(vdev, "virtio-crypto asym dest data incorrect"); + return; + } + + iov_discard_front(&in_iov, &req->in_num, len); + + /* For akcipher, dst_len may be changed after operation */ + req->in_len = sizeof(struct virtio_crypto_inhdr) + asym_op_info->dst_len; +} + +static void virtio_crypto_req_complete(void *opaque, int ret) +{ + VirtIOCryptoReq *req = (VirtIOCryptoReq *)opaque; + VirtIOCrypto *vcrypto = req->vcrypto; + VirtIODevice *vdev = VIRTIO_DEVICE(vcrypto); + uint8_t status = -ret; + + if (req->flags == CRYPTODEV_BACKEND_ALG_SYM) { + virtio_crypto_sym_input_data_helper(vdev, req, status, + req->op_info.u.sym_op_info); + } else if (req->flags == CRYPTODEV_BACKEND_ALG_ASYM) { + virtio_crypto_akcipher_input_data_helper(vdev, req, status, + req->op_info.u.asym_op_info); + } + stb_p(&req->in->status, status); + virtqueue_push(req->vq, &req->elem, req->in_len); + virtio_notify(vdev, req->vq); + virtio_crypto_free_request(req); +} + +static VirtIOCryptoReq * +virtio_crypto_get_request(VirtIOCrypto *s, VirtQueue *vq) +{ + VirtIOCryptoReq *req = virtqueue_pop(vq, sizeof(VirtIOCryptoReq)); + + if (req) { + virtio_crypto_init_request(s, vq, req); + } + return req; +} + +static CryptoDevBackendSymOpInfo * +virtio_crypto_sym_op_helper(VirtIODevice *vdev, + struct virtio_crypto_cipher_para *cipher_para, + struct virtio_crypto_alg_chain_data_para *alg_chain_para, + struct iovec *iov, unsigned int out_num) +{ + VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev); + CryptoDevBackendSymOpInfo *op_info; + uint32_t src_len = 0, dst_len = 0; + uint32_t iv_len = 0; + uint32_t aad_len = 0, hash_result_len = 0; + uint32_t hash_start_src_offset = 0, len_to_hash = 0; + uint32_t cipher_start_src_offset = 0, len_to_cipher = 0; + + uint64_t max_len, curr_size = 0; + size_t s; + + /* Plain cipher */ + if (cipher_para) { + iv_len = ldl_le_p(&cipher_para->iv_len); + src_len = ldl_le_p(&cipher_para->src_data_len); + dst_len = ldl_le_p(&cipher_para->dst_data_len); + } else if (alg_chain_para) { /* Algorithm chain */ + iv_len = ldl_le_p(&alg_chain_para->iv_len); + src_len = ldl_le_p(&alg_chain_para->src_data_len); + dst_len = ldl_le_p(&alg_chain_para->dst_data_len); + + aad_len = ldl_le_p(&alg_chain_para->aad_len); + hash_result_len = ldl_le_p(&alg_chain_para->hash_result_len); + hash_start_src_offset = ldl_le_p( + &alg_chain_para->hash_start_src_offset); + cipher_start_src_offset = ldl_le_p( + &alg_chain_para->cipher_start_src_offset); + len_to_cipher = ldl_le_p(&alg_chain_para->len_to_cipher); + len_to_hash = ldl_le_p(&alg_chain_para->len_to_hash); + } else { + return NULL; + } + + max_len = (uint64_t)iv_len + aad_len + src_len + dst_len + hash_result_len; + if (unlikely(max_len > vcrypto->conf.max_size)) { + virtio_error(vdev, "virtio-crypto too big length"); + return NULL; + } + + op_info = g_malloc0(sizeof(CryptoDevBackendSymOpInfo) + max_len); + op_info->iv_len = iv_len; + op_info->src_len = src_len; + op_info->dst_len = dst_len; + op_info->aad_len = aad_len; + op_info->digest_result_len = hash_result_len; + op_info->hash_start_src_offset = hash_start_src_offset; + op_info->len_to_hash = len_to_hash; + op_info->cipher_start_src_offset = cipher_start_src_offset; + op_info->len_to_cipher = len_to_cipher; + /* Handle the initilization vector */ + if (op_info->iv_len > 0) { + DPRINTF("iv_len=%" PRIu32 "\n", op_info->iv_len); + op_info->iv = op_info->data + curr_size; + + s = iov_to_buf(iov, out_num, 0, op_info->iv, op_info->iv_len); + if (unlikely(s != op_info->iv_len)) { + virtio_error(vdev, "virtio-crypto iv incorrect"); + goto err; + } + iov_discard_front(&iov, &out_num, op_info->iv_len); + curr_size += op_info->iv_len; + } + + /* Handle additional authentication data if exists */ + if (op_info->aad_len > 0) { + DPRINTF("aad_len=%" PRIu32 "\n", op_info->aad_len); + op_info->aad_data = op_info->data + curr_size; + + s = iov_to_buf(iov, out_num, 0, op_info->aad_data, op_info->aad_len); + if (unlikely(s != op_info->aad_len)) { + virtio_error(vdev, "virtio-crypto additional auth data incorrect"); + goto err; + } + iov_discard_front(&iov, &out_num, op_info->aad_len); + + curr_size += op_info->aad_len; + } + + /* Handle the source data */ + if (op_info->src_len > 0) { + DPRINTF("src_len=%" PRIu32 "\n", op_info->src_len); + op_info->src = op_info->data + curr_size; + + s = iov_to_buf(iov, out_num, 0, op_info->src, op_info->src_len); + if (unlikely(s != op_info->src_len)) { + virtio_error(vdev, "virtio-crypto source data incorrect"); + goto err; + } + iov_discard_front(&iov, &out_num, op_info->src_len); + + curr_size += op_info->src_len; + } + + /* Handle the destination data */ + op_info->dst = op_info->data + curr_size; + curr_size += op_info->dst_len; + + DPRINTF("dst_len=%" PRIu32 "\n", op_info->dst_len); + + /* Handle the hash digest result */ + if (hash_result_len > 0) { + DPRINTF("hash_result_len=%" PRIu32 "\n", hash_result_len); + op_info->digest_result = op_info->data + curr_size; + } + + return op_info; + +err: + g_free(op_info); + return NULL; +} + +static int +virtio_crypto_handle_sym_req(VirtIOCrypto *vcrypto, + struct virtio_crypto_sym_data_req *req, + CryptoDevBackendOpInfo *op_info, + struct iovec *iov, unsigned int out_num) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(vcrypto); + CryptoDevBackendSymOpInfo *sym_op_info; + uint32_t op_type; + + op_type = ldl_le_p(&req->op_type); + if (op_type == VIRTIO_CRYPTO_SYM_OP_CIPHER) { + sym_op_info = virtio_crypto_sym_op_helper(vdev, &req->u.cipher.para, + NULL, iov, out_num); + if (!sym_op_info) { + return -EFAULT; + } + } else if (op_type == VIRTIO_CRYPTO_SYM_OP_ALGORITHM_CHAINING) { + sym_op_info = virtio_crypto_sym_op_helper(vdev, NULL, + &req->u.chain.para, + iov, out_num); + if (!sym_op_info) { + return -EFAULT; + } + } else { + /* VIRTIO_CRYPTO_SYM_OP_NONE */ + error_report("virtio-crypto unsupported cipher type"); + return -VIRTIO_CRYPTO_NOTSUPP; + } + + sym_op_info->op_type = op_type; + op_info->u.sym_op_info = sym_op_info; + + return 0; +} + +static int +virtio_crypto_handle_asym_req(VirtIOCrypto *vcrypto, + struct virtio_crypto_akcipher_data_req *req, + CryptoDevBackendOpInfo *op_info, + struct iovec *iov, unsigned int out_num) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(vcrypto); + CryptoDevBackendAsymOpInfo *asym_op_info; + uint32_t src_len; + uint32_t dst_len; + uint32_t len; + uint8_t *src = NULL; + uint8_t *dst = NULL; + + asym_op_info = g_new0(CryptoDevBackendAsymOpInfo, 1); + src_len = ldl_le_p(&req->para.src_data_len); + dst_len = ldl_le_p(&req->para.dst_data_len); + + if (src_len > 0) { + src = g_malloc0(src_len); + len = iov_to_buf(iov, out_num, 0, src, src_len); + if (unlikely(len != src_len)) { + virtio_error(vdev, "virtio-crypto asym src data incorrect" + "expected %u, actual %u", src_len, len); + goto err; + } + + iov_discard_front(&iov, &out_num, src_len); + } + + if (dst_len > 0) { + dst = g_malloc0(dst_len); + + if (op_info->op_code == VIRTIO_CRYPTO_AKCIPHER_VERIFY) { + len = iov_to_buf(iov, out_num, 0, dst, dst_len); + if (unlikely(len != dst_len)) { + virtio_error(vdev, "virtio-crypto asym dst data incorrect" + "expected %u, actual %u", dst_len, len); + goto err; + } + + iov_discard_front(&iov, &out_num, dst_len); + } + } + + asym_op_info->src_len = src_len; + asym_op_info->dst_len = dst_len; + asym_op_info->src = src; + asym_op_info->dst = dst; + op_info->u.asym_op_info = asym_op_info; + + return 0; + + err: + g_free(asym_op_info); + g_free(src); + g_free(dst); + + return -EFAULT; +} + +static int +virtio_crypto_handle_request(VirtIOCryptoReq *request) +{ + VirtIOCrypto *vcrypto = request->vcrypto; + VirtIODevice *vdev = VIRTIO_DEVICE(vcrypto); + VirtQueueElement *elem = &request->elem; + int queue_index = virtio_crypto_vq2q(virtio_get_queue_index(request->vq)); + struct virtio_crypto_op_data_req req; + int ret; + g_autofree struct iovec *in_iov_copy = NULL; + g_autofree struct iovec *out_iov_copy = NULL; + struct iovec *in_iov; + struct iovec *out_iov; + unsigned in_num; + unsigned out_num; + uint32_t opcode; + CryptoDevBackendOpInfo *op_info = &request->op_info; + + if (elem->out_num < 1 || elem->in_num < 1) { + virtio_error(vdev, "virtio-crypto dataq missing headers"); + return -1; + } + + out_num = elem->out_num; + out_iov_copy = g_memdup2(elem->out_sg, sizeof(out_iov[0]) * out_num); + out_iov = out_iov_copy; + + in_num = elem->in_num; + in_iov_copy = g_memdup2(elem->in_sg, sizeof(in_iov[0]) * in_num); + in_iov = in_iov_copy; + + if (unlikely(iov_to_buf(out_iov, out_num, 0, &req, sizeof(req)) + != sizeof(req))) { + virtio_error(vdev, "virtio-crypto request outhdr too short"); + return -1; + } + iov_discard_front(&out_iov, &out_num, sizeof(req)); + + if (in_iov[in_num - 1].iov_len < + sizeof(struct virtio_crypto_inhdr)) { + virtio_error(vdev, "virtio-crypto request inhdr too short"); + return -1; + } + /* We always touch the last byte, so just see how big in_iov is. */ + request->in_len = iov_size(in_iov, in_num); + request->in = (void *)in_iov[in_num - 1].iov_base + + in_iov[in_num - 1].iov_len + - sizeof(struct virtio_crypto_inhdr); + iov_discard_back(in_iov, &in_num, sizeof(struct virtio_crypto_inhdr)); + + /* + * The length of operation result, including dest_data + * and digest_result if exists. + */ + request->in_num = in_num; + request->in_iov = in_iov; + /* now, we free the in_iov_copy inside virtio_crypto_free_request */ + in_iov_copy = NULL; + + opcode = ldl_le_p(&req.header.opcode); + op_info->session_id = ldq_le_p(&req.header.session_id); + op_info->op_code = opcode; + + switch (opcode) { + case VIRTIO_CRYPTO_CIPHER_ENCRYPT: + case VIRTIO_CRYPTO_CIPHER_DECRYPT: + op_info->algtype = request->flags = CRYPTODEV_BACKEND_ALG_SYM; + ret = virtio_crypto_handle_sym_req(vcrypto, + &req.u.sym_req, op_info, + out_iov, out_num); + goto check_result; + + case VIRTIO_CRYPTO_AKCIPHER_ENCRYPT: + case VIRTIO_CRYPTO_AKCIPHER_DECRYPT: + case VIRTIO_CRYPTO_AKCIPHER_SIGN: + case VIRTIO_CRYPTO_AKCIPHER_VERIFY: + op_info->algtype = request->flags = CRYPTODEV_BACKEND_ALG_ASYM; + ret = virtio_crypto_handle_asym_req(vcrypto, + &req.u.akcipher_req, op_info, + out_iov, out_num); + +check_result: + /* Serious errors, need to reset virtio crypto device */ + if (ret == -EFAULT) { + return -1; + } else if (ret == -VIRTIO_CRYPTO_NOTSUPP) { + virtio_crypto_req_complete(request, -VIRTIO_CRYPTO_NOTSUPP); + } else { + ret = cryptodev_backend_crypto_operation(vcrypto->cryptodev, + request, queue_index, + virtio_crypto_req_complete, + request); + if (ret < 0) { + virtio_crypto_req_complete(request, ret); + } + } + break; + + case VIRTIO_CRYPTO_HASH: + case VIRTIO_CRYPTO_MAC: + case VIRTIO_CRYPTO_AEAD_ENCRYPT: + case VIRTIO_CRYPTO_AEAD_DECRYPT: + default: + error_report("virtio-crypto unsupported dataq opcode: %u", + opcode); + virtio_crypto_req_complete(request, -VIRTIO_CRYPTO_NOTSUPP); + } + + return 0; +} + +static void virtio_crypto_handle_dataq(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev); + VirtIOCryptoReq *req; + + while ((req = virtio_crypto_get_request(vcrypto, vq))) { + if (virtio_crypto_handle_request(req) < 0) { + virtqueue_detach_element(req->vq, &req->elem, 0); + virtio_crypto_free_request(req); + break; + } + } +} + +static void virtio_crypto_dataq_bh(void *opaque) +{ + VirtIOCryptoQueue *q = opaque; + VirtIOCrypto *vcrypto = q->vcrypto; + VirtIODevice *vdev = VIRTIO_DEVICE(vcrypto); + + /* This happens when device was stopped but BH wasn't. */ + if (!vdev->vm_running) { + return; + } + + /* Just in case the driver is not ready on more */ + if (unlikely(!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK))) { + return; + } + + for (;;) { + virtio_crypto_handle_dataq(vdev, q->dataq); + virtio_queue_set_notification(q->dataq, 1); + + /* Are we done or did the guest add more buffers? */ + if (virtio_queue_empty(q->dataq)) { + break; + } + + virtio_queue_set_notification(q->dataq, 0); + } +} + +static void +virtio_crypto_handle_dataq_bh(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev); + VirtIOCryptoQueue *q = + &vcrypto->vqs[virtio_crypto_vq2q(virtio_get_queue_index(vq))]; + + /* This happens when device was stopped but VCPU wasn't. */ + if (!vdev->vm_running) { + return; + } + virtio_queue_set_notification(vq, 0); + qemu_bh_schedule(q->dataq_bh); +} + +static uint64_t virtio_crypto_get_features(VirtIODevice *vdev, + uint64_t features, + Error **errp) +{ + return features; +} + +static void virtio_crypto_reset(VirtIODevice *vdev) +{ + VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev); + /* multiqueue is disabled by default */ + vcrypto->curr_queues = 1; + if (!cryptodev_backend_is_ready(vcrypto->cryptodev)) { + vcrypto->status &= ~VIRTIO_CRYPTO_S_HW_READY; + } else { + vcrypto->status |= VIRTIO_CRYPTO_S_HW_READY; + } +} + +static void virtio_crypto_init_config(VirtIODevice *vdev) +{ + VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev); + + vcrypto->conf.crypto_services = + vcrypto->conf.cryptodev->conf.crypto_services; + vcrypto->conf.cipher_algo_l = + vcrypto->conf.cryptodev->conf.cipher_algo_l; + vcrypto->conf.cipher_algo_h = + vcrypto->conf.cryptodev->conf.cipher_algo_h; + vcrypto->conf.hash_algo = vcrypto->conf.cryptodev->conf.hash_algo; + vcrypto->conf.mac_algo_l = vcrypto->conf.cryptodev->conf.mac_algo_l; + vcrypto->conf.mac_algo_h = vcrypto->conf.cryptodev->conf.mac_algo_h; + vcrypto->conf.aead_algo = vcrypto->conf.cryptodev->conf.aead_algo; + vcrypto->conf.akcipher_algo = vcrypto->conf.cryptodev->conf.akcipher_algo; + vcrypto->conf.max_cipher_key_len = + vcrypto->conf.cryptodev->conf.max_cipher_key_len; + vcrypto->conf.max_auth_key_len = + vcrypto->conf.cryptodev->conf.max_auth_key_len; + vcrypto->conf.max_size = vcrypto->conf.cryptodev->conf.max_size; +} + +static void virtio_crypto_device_realize(DeviceState *dev, Error **errp) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(dev); + int i; + + vcrypto->cryptodev = vcrypto->conf.cryptodev; + if (vcrypto->cryptodev == NULL) { + error_setg(errp, "'cryptodev' parameter expects a valid object"); + return; + } else if (cryptodev_backend_is_used(vcrypto->cryptodev)) { + error_setg(errp, "can't use already used cryptodev backend: %s", + object_get_canonical_path_component(OBJECT(vcrypto->conf.cryptodev))); + return; + } + + vcrypto->max_queues = MAX(vcrypto->cryptodev->conf.peers.queues, 1); + if (vcrypto->max_queues + 1 > VIRTIO_QUEUE_MAX) { + error_setg(errp, "Invalid number of queues (= %" PRIu32 "), " + "must be a positive integer less than %d.", + vcrypto->max_queues, VIRTIO_QUEUE_MAX); + return; + } + + virtio_init(vdev, VIRTIO_ID_CRYPTO, vcrypto->config_size); + vcrypto->curr_queues = 1; + vcrypto->vqs = g_new0(VirtIOCryptoQueue, vcrypto->max_queues); + for (i = 0; i < vcrypto->max_queues; i++) { + vcrypto->vqs[i].dataq = + virtio_add_queue(vdev, 1024, virtio_crypto_handle_dataq_bh); + vcrypto->vqs[i].dataq_bh = + qemu_bh_new(virtio_crypto_dataq_bh, &vcrypto->vqs[i]); + vcrypto->vqs[i].vcrypto = vcrypto; + } + + vcrypto->ctrl_vq = virtio_add_queue(vdev, 1024, virtio_crypto_handle_ctrl); + if (!cryptodev_backend_is_ready(vcrypto->cryptodev)) { + vcrypto->status &= ~VIRTIO_CRYPTO_S_HW_READY; + } else { + vcrypto->status |= VIRTIO_CRYPTO_S_HW_READY; + } + + virtio_crypto_init_config(vdev); + cryptodev_backend_set_used(vcrypto->cryptodev, true); +} + +static void virtio_crypto_device_unrealize(DeviceState *dev) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(dev); + VirtIOCryptoQueue *q; + int i, max_queues; + + max_queues = vcrypto->multiqueue ? vcrypto->max_queues : 1; + for (i = 0; i < max_queues; i++) { + virtio_delete_queue(vcrypto->vqs[i].dataq); + q = &vcrypto->vqs[i]; + qemu_bh_delete(q->dataq_bh); + } + + g_free(vcrypto->vqs); + virtio_delete_queue(vcrypto->ctrl_vq); + + virtio_cleanup(vdev); + cryptodev_backend_set_used(vcrypto->cryptodev, false); +} + +static const VMStateDescription vmstate_virtio_crypto = { + .name = "virtio-crypto", + .unmigratable = 1, + .minimum_version_id = VIRTIO_CRYPTO_VM_VERSION, + .version_id = VIRTIO_CRYPTO_VM_VERSION, + .fields = (VMStateField[]) { + VMSTATE_VIRTIO_DEVICE, + VMSTATE_END_OF_LIST() + }, +}; + +static Property virtio_crypto_properties[] = { + DEFINE_PROP_LINK("cryptodev", VirtIOCrypto, conf.cryptodev, + TYPE_CRYPTODEV_BACKEND, CryptoDevBackend *), + DEFINE_PROP_END_OF_LIST(), +}; + +static void virtio_crypto_get_config(VirtIODevice *vdev, uint8_t *config) +{ + VirtIOCrypto *c = VIRTIO_CRYPTO(vdev); + struct virtio_crypto_config crypto_cfg = {}; + + /* + * Virtio-crypto device conforms to VIRTIO 1.0 which is always LE, + * so we can use LE accessors directly. + */ + stl_le_p(&crypto_cfg.status, c->status); + stl_le_p(&crypto_cfg.max_dataqueues, c->max_queues); + stl_le_p(&crypto_cfg.crypto_services, c->conf.crypto_services); + stl_le_p(&crypto_cfg.cipher_algo_l, c->conf.cipher_algo_l); + stl_le_p(&crypto_cfg.cipher_algo_h, c->conf.cipher_algo_h); + stl_le_p(&crypto_cfg.hash_algo, c->conf.hash_algo); + stl_le_p(&crypto_cfg.mac_algo_l, c->conf.mac_algo_l); + stl_le_p(&crypto_cfg.mac_algo_h, c->conf.mac_algo_h); + stl_le_p(&crypto_cfg.aead_algo, c->conf.aead_algo); + stl_le_p(&crypto_cfg.max_cipher_key_len, c->conf.max_cipher_key_len); + stl_le_p(&crypto_cfg.max_auth_key_len, c->conf.max_auth_key_len); + stq_le_p(&crypto_cfg.max_size, c->conf.max_size); + stl_le_p(&crypto_cfg.akcipher_algo, c->conf.akcipher_algo); + + memcpy(config, &crypto_cfg, c->config_size); +} + +static bool virtio_crypto_started(VirtIOCrypto *c, uint8_t status) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(c); + return (status & VIRTIO_CONFIG_S_DRIVER_OK) && + (c->status & VIRTIO_CRYPTO_S_HW_READY) && vdev->vm_running; +} + +static void virtio_crypto_vhost_status(VirtIOCrypto *c, uint8_t status) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(c); + int queues = c->multiqueue ? c->max_queues : 1; + CryptoDevBackend *b = c->cryptodev; + CryptoDevBackendClient *cc = b->conf.peers.ccs[0]; + + if (!cryptodev_get_vhost(cc, b, 0)) { + return; + } + + if ((virtio_crypto_started(c, status)) == !!c->vhost_started) { + return; + } + + if (!c->vhost_started) { + int r; + + c->vhost_started = 1; + r = cryptodev_vhost_start(vdev, queues); + if (r < 0) { + error_report("unable to start vhost crypto: %d: " + "falling back on userspace virtio", -r); + c->vhost_started = 0; + } + } else { + cryptodev_vhost_stop(vdev, queues); + c->vhost_started = 0; + } +} + +static void virtio_crypto_set_status(VirtIODevice *vdev, uint8_t status) +{ + VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev); + + virtio_crypto_vhost_status(vcrypto, status); +} + +static void virtio_crypto_guest_notifier_mask(VirtIODevice *vdev, int idx, + bool mask) +{ + VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev); + int queue = virtio_crypto_vq2q(idx); + + assert(vcrypto->vhost_started); + + cryptodev_vhost_virtqueue_mask(vdev, queue, idx, mask); +} + +static bool virtio_crypto_guest_notifier_pending(VirtIODevice *vdev, int idx) +{ + VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev); + int queue = virtio_crypto_vq2q(idx); + + assert(vcrypto->vhost_started); + + return cryptodev_vhost_virtqueue_pending(vdev, queue, idx); +} + +static struct vhost_dev *virtio_crypto_get_vhost(VirtIODevice *vdev) +{ + VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev); + CryptoDevBackend *b = vcrypto->cryptodev; + CryptoDevBackendClient *cc = b->conf.peers.ccs[0]; + CryptoDevBackendVhost *vhost_crypto = cryptodev_get_vhost(cc, b, 0); + return &vhost_crypto->dev; +} + +static void virtio_crypto_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + + device_class_set_props(dc, virtio_crypto_properties); + dc->vmsd = &vmstate_virtio_crypto; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + vdc->realize = virtio_crypto_device_realize; + vdc->unrealize = virtio_crypto_device_unrealize; + vdc->get_config = virtio_crypto_get_config; + vdc->get_features = virtio_crypto_get_features; + vdc->reset = virtio_crypto_reset; + vdc->set_status = virtio_crypto_set_status; + vdc->guest_notifier_mask = virtio_crypto_guest_notifier_mask; + vdc->guest_notifier_pending = virtio_crypto_guest_notifier_pending; + vdc->get_vhost = virtio_crypto_get_vhost; +} + +static void virtio_crypto_instance_init(Object *obj) +{ + VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(obj); + + /* + * The default config_size is sizeof(struct virtio_crypto_config). + * Can be overriden with virtio_crypto_set_config_size. + */ + vcrypto->config_size = sizeof(struct virtio_crypto_config); +} + +static const TypeInfo virtio_crypto_info = { + .name = TYPE_VIRTIO_CRYPTO, + .parent = TYPE_VIRTIO_DEVICE, + .instance_size = sizeof(VirtIOCrypto), + .instance_init = virtio_crypto_instance_init, + .class_init = virtio_crypto_class_init, +}; + +static void virtio_register_types(void) +{ + type_register_static(&virtio_crypto_info); +} + +type_init(virtio_register_types) diff --git a/hw/virtio/virtio-input-host-pci.c b/hw/virtio/virtio-input-host-pci.c new file mode 100644 index 00000000..cf8a9cf9 --- /dev/null +++ b/hw/virtio/virtio-input-host-pci.c @@ -0,0 +1,47 @@ +/* + * Virtio input host PCI Bindings + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. See the COPYING file in the + * top-level directory. + */ + +#include "qemu/osdep.h" + +#include "hw/virtio/virtio-pci.h" +#include "hw/virtio/virtio-input.h" +#include "qemu/module.h" +#include "qom/object.h" + +typedef struct VirtIOInputHostPCI VirtIOInputHostPCI; + +#define TYPE_VIRTIO_INPUT_HOST_PCI "virtio-input-host-pci" +DECLARE_INSTANCE_CHECKER(VirtIOInputHostPCI, VIRTIO_INPUT_HOST_PCI, + TYPE_VIRTIO_INPUT_HOST_PCI) + +struct VirtIOInputHostPCI { + VirtIOPCIProxy parent_obj; + VirtIOInputHost vdev; +}; + +static void virtio_host_initfn(Object *obj) +{ + VirtIOInputHostPCI *dev = VIRTIO_INPUT_HOST_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VIRTIO_INPUT_HOST); +} + +static const VirtioPCIDeviceTypeInfo virtio_input_host_pci_info = { + .generic_name = TYPE_VIRTIO_INPUT_HOST_PCI, + .parent = TYPE_VIRTIO_INPUT_PCI, + .instance_size = sizeof(VirtIOInputHostPCI), + .instance_init = virtio_host_initfn, +}; + +static void virtio_input_host_pci_register(void) +{ + virtio_pci_types_register(&virtio_input_host_pci_info); +} + +type_init(virtio_input_host_pci_register) diff --git a/hw/virtio/virtio-input-pci.c b/hw/virtio/virtio-input-pci.c new file mode 100644 index 00000000..a9d09923 --- /dev/null +++ b/hw/virtio/virtio-input-pci.c @@ -0,0 +1,155 @@ +/* + * Virtio input PCI Bindings + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. See the COPYING file in the + * top-level directory. + */ + +#include "qemu/osdep.h" + +#include "hw/virtio/virtio-pci.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/virtio-input.h" +#include "qemu/module.h" +#include "qom/object.h" + + +/* + * virtio-input-pci: This extends VirtioPCIProxy. + */ +OBJECT_DECLARE_SIMPLE_TYPE(VirtIOInputPCI, VIRTIO_INPUT_PCI) + +struct VirtIOInputPCI { + VirtIOPCIProxy parent_obj; + VirtIOInput vdev; +}; + +#define TYPE_VIRTIO_INPUT_HID_PCI "virtio-input-hid-pci" +#define TYPE_VIRTIO_KEYBOARD_PCI "virtio-keyboard-pci" +#define TYPE_VIRTIO_MOUSE_PCI "virtio-mouse-pci" +#define TYPE_VIRTIO_TABLET_PCI "virtio-tablet-pci" +OBJECT_DECLARE_SIMPLE_TYPE(VirtIOInputHIDPCI, VIRTIO_INPUT_HID_PCI) + +struct VirtIOInputHIDPCI { + VirtIOPCIProxy parent_obj; + VirtIOInputHID vdev; +}; + +static Property virtio_input_pci_properties[] = { + DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 2), + DEFINE_PROP_END_OF_LIST(), +}; + +static void virtio_input_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VirtIOInputPCI *vinput = VIRTIO_INPUT_PCI(vpci_dev); + DeviceState *vdev = DEVICE(&vinput->vdev); + + virtio_pci_force_virtio_1(vpci_dev); + qdev_realize(vdev, BUS(&vpci_dev->bus), errp); +} + +static void virtio_input_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + + device_class_set_props(dc, virtio_input_pci_properties); + k->realize = virtio_input_pci_realize; + set_bit(DEVICE_CATEGORY_INPUT, dc->categories); + + pcidev_k->class_id = PCI_CLASS_INPUT_OTHER; +} + +static void virtio_input_hid_kbd_pci_class_init(ObjectClass *klass, void *data) +{ + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + + pcidev_k->class_id = PCI_CLASS_INPUT_KEYBOARD; +} + +static void virtio_input_hid_mouse_pci_class_init(ObjectClass *klass, + void *data) +{ + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + + pcidev_k->class_id = PCI_CLASS_INPUT_MOUSE; +} + +static void virtio_keyboard_initfn(Object *obj) +{ + VirtIOInputHIDPCI *dev = VIRTIO_INPUT_HID_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VIRTIO_KEYBOARD); +} + +static void virtio_mouse_initfn(Object *obj) +{ + VirtIOInputHIDPCI *dev = VIRTIO_INPUT_HID_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VIRTIO_MOUSE); +} + +static void virtio_tablet_initfn(Object *obj) +{ + VirtIOInputHIDPCI *dev = VIRTIO_INPUT_HID_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VIRTIO_TABLET); +} + +static const TypeInfo virtio_input_pci_info = { + .name = TYPE_VIRTIO_INPUT_PCI, + .parent = TYPE_VIRTIO_PCI, + .instance_size = sizeof(VirtIOInputPCI), + .class_init = virtio_input_pci_class_init, + .abstract = true, +}; + +static const TypeInfo virtio_input_hid_pci_info = { + .name = TYPE_VIRTIO_INPUT_HID_PCI, + .parent = TYPE_VIRTIO_INPUT_PCI, + .instance_size = sizeof(VirtIOInputHIDPCI), + .abstract = true, +}; + +static const VirtioPCIDeviceTypeInfo virtio_keyboard_pci_info = { + .generic_name = TYPE_VIRTIO_KEYBOARD_PCI, + .parent = TYPE_VIRTIO_INPUT_HID_PCI, + .class_init = virtio_input_hid_kbd_pci_class_init, + .instance_size = sizeof(VirtIOInputHIDPCI), + .instance_init = virtio_keyboard_initfn, +}; + +static const VirtioPCIDeviceTypeInfo virtio_mouse_pci_info = { + .generic_name = TYPE_VIRTIO_MOUSE_PCI, + .parent = TYPE_VIRTIO_INPUT_HID_PCI, + .class_init = virtio_input_hid_mouse_pci_class_init, + .instance_size = sizeof(VirtIOInputHIDPCI), + .instance_init = virtio_mouse_initfn, +}; + +static const VirtioPCIDeviceTypeInfo virtio_tablet_pci_info = { + .generic_name = TYPE_VIRTIO_TABLET_PCI, + .parent = TYPE_VIRTIO_INPUT_HID_PCI, + .instance_size = sizeof(VirtIOInputHIDPCI), + .instance_init = virtio_tablet_initfn, +}; + +static void virtio_pci_input_register(void) +{ + /* Base types: */ + type_register_static(&virtio_input_pci_info); + type_register_static(&virtio_input_hid_pci_info); + + /* Implementations: */ + virtio_pci_types_register(&virtio_keyboard_pci_info); + virtio_pci_types_register(&virtio_mouse_pci_info); + virtio_pci_types_register(&virtio_tablet_pci_info); +} + +type_init(virtio_pci_input_register) diff --git a/hw/virtio/virtio-iommu-pci.c b/hw/virtio/virtio-iommu-pci.c new file mode 100644 index 00000000..7ef2f9dc --- /dev/null +++ b/hw/virtio/virtio-iommu-pci.c @@ -0,0 +1,112 @@ +/* + * Virtio IOMMU PCI Bindings + * + * Copyright (c) 2019 Red Hat, Inc. + * Written by Eric Auger + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 or + * (at your option) any later version. + */ + +#include "qemu/osdep.h" + +#include "hw/virtio/virtio-pci.h" +#include "hw/virtio/virtio-iommu.h" +#include "hw/qdev-properties.h" +#include "hw/qdev-properties-system.h" +#include "qapi/error.h" +#include "hw/boards.h" +#include "hw/pci/pci_bus.h" +#include "qom/object.h" + +typedef struct VirtIOIOMMUPCI VirtIOIOMMUPCI; + +/* + * virtio-iommu-pci: This extends VirtioPCIProxy. + * + */ +DECLARE_INSTANCE_CHECKER(VirtIOIOMMUPCI, VIRTIO_IOMMU_PCI, + TYPE_VIRTIO_IOMMU_PCI) + +struct VirtIOIOMMUPCI { + VirtIOPCIProxy parent_obj; + VirtIOIOMMU vdev; +}; + +static Property virtio_iommu_pci_properties[] = { + DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0), + DEFINE_PROP_ARRAY("reserved-regions", VirtIOIOMMUPCI, + vdev.nb_reserved_regions, vdev.reserved_regions, + qdev_prop_reserved_region, ReservedRegion), + DEFINE_PROP_END_OF_LIST(), +}; + +static void virtio_iommu_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VirtIOIOMMUPCI *dev = VIRTIO_IOMMU_PCI(vpci_dev); + PCIBus *pbus = pci_get_bus(&vpci_dev->pci_dev); + DeviceState *vdev = DEVICE(&dev->vdev); + VirtIOIOMMU *s = VIRTIO_IOMMU(vdev); + + if (!qdev_get_machine_hotplug_handler(DEVICE(vpci_dev))) { + error_setg(errp, "Check your machine implements a hotplug handler " + "for the virtio-iommu-pci device"); + return; + } + for (int i = 0; i < s->nb_reserved_regions; i++) { + if (s->reserved_regions[i].type != VIRTIO_IOMMU_RESV_MEM_T_RESERVED && + s->reserved_regions[i].type != VIRTIO_IOMMU_RESV_MEM_T_MSI) { + error_setg(errp, "reserved region %d has an invalid type", i); + error_append_hint(errp, "Valid values are 0 and 1\n"); + return; + } + } + if (!pci_bus_is_root(pbus)) { + error_setg(errp, "virtio-iommu-pci must be plugged on the root bus"); + return; + } + + object_property_set_link(OBJECT(dev), "primary-bus", + OBJECT(pbus), &error_abort); + + virtio_pci_force_virtio_1(vpci_dev); + qdev_realize(vdev, BUS(&vpci_dev->bus), errp); +} + +static void virtio_iommu_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + k->realize = virtio_iommu_pci_realize; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + device_class_set_props(dc, virtio_iommu_pci_properties); + pcidev_k->revision = VIRTIO_PCI_ABI_VERSION; + pcidev_k->class_id = PCI_CLASS_OTHERS; + dc->hotpluggable = false; +} + +static void virtio_iommu_pci_instance_init(Object *obj) +{ + VirtIOIOMMUPCI *dev = VIRTIO_IOMMU_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VIRTIO_IOMMU); +} + +static const VirtioPCIDeviceTypeInfo virtio_iommu_pci_info = { + .generic_name = TYPE_VIRTIO_IOMMU_PCI, + .instance_size = sizeof(VirtIOIOMMUPCI), + .instance_init = virtio_iommu_pci_instance_init, + .class_init = virtio_iommu_pci_class_init, +}; + +static void virtio_iommu_pci_register(void) +{ + virtio_pci_types_register(&virtio_iommu_pci_info); +} + +type_init(virtio_iommu_pci_register) + + diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c new file mode 100644 index 00000000..62e07ec2 --- /dev/null +++ b/hw/virtio/virtio-iommu.c @@ -0,0 +1,1425 @@ +/* + * virtio-iommu device + * + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2 or later, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see <http://www.gnu.org/licenses/>. + * + */ + +#include "qemu/osdep.h" +#include "qemu/log.h" +#include "qemu/iov.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/virtio.h" +#include "sysemu/kvm.h" +#include "sysemu/reset.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "trace.h" + +#include "standard-headers/linux/virtio_ids.h" + +#include "hw/virtio/virtio-bus.h" +#include "hw/virtio/virtio-access.h" +#include "hw/virtio/virtio-iommu.h" +#include "hw/pci/pci_bus.h" +#include "hw/pci/pci.h" + +/* Max size */ +#define VIOMMU_DEFAULT_QUEUE_SIZE 256 +#define VIOMMU_PROBE_SIZE 512 + +typedef struct VirtIOIOMMUDomain { + uint32_t id; + bool bypass; + GTree *mappings; + QLIST_HEAD(, VirtIOIOMMUEndpoint) endpoint_list; +} VirtIOIOMMUDomain; + +typedef struct VirtIOIOMMUEndpoint { + uint32_t id; + VirtIOIOMMUDomain *domain; + IOMMUMemoryRegion *iommu_mr; + QLIST_ENTRY(VirtIOIOMMUEndpoint) next; +} VirtIOIOMMUEndpoint; + +typedef struct VirtIOIOMMUInterval { + uint64_t low; + uint64_t high; +} VirtIOIOMMUInterval; + +typedef struct VirtIOIOMMUMapping { + uint64_t phys_addr; + uint32_t flags; +} VirtIOIOMMUMapping; + +static inline uint16_t virtio_iommu_get_bdf(IOMMUDevice *dev) +{ + return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn); +} + +static bool virtio_iommu_device_bypassed(IOMMUDevice *sdev) +{ + uint32_t sid; + bool bypassed; + VirtIOIOMMU *s = sdev->viommu; + VirtIOIOMMUEndpoint *ep; + + sid = virtio_iommu_get_bdf(sdev); + + qemu_rec_mutex_lock(&s->mutex); + /* need to check bypass before system reset */ + if (!s->endpoints) { + bypassed = s->config.bypass; + goto unlock; + } + + ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid)); + if (!ep || !ep->domain) { + bypassed = s->config.bypass; + } else { + bypassed = ep->domain->bypass; + } + +unlock: + qemu_rec_mutex_unlock(&s->mutex); + return bypassed; +} + +/* Return whether the device is using IOMMU translation. */ +static bool virtio_iommu_switch_address_space(IOMMUDevice *sdev) +{ + bool use_remapping; + + assert(sdev); + + use_remapping = !virtio_iommu_device_bypassed(sdev); + + trace_virtio_iommu_switch_address_space(pci_bus_num(sdev->bus), + PCI_SLOT(sdev->devfn), + PCI_FUNC(sdev->devfn), + use_remapping); + + /* Turn off first then on the other */ + if (use_remapping) { + memory_region_set_enabled(&sdev->bypass_mr, false); + memory_region_set_enabled(MEMORY_REGION(&sdev->iommu_mr), true); + } else { + memory_region_set_enabled(MEMORY_REGION(&sdev->iommu_mr), false); + memory_region_set_enabled(&sdev->bypass_mr, true); + } + + return use_remapping; +} + +static void virtio_iommu_switch_address_space_all(VirtIOIOMMU *s) +{ + GHashTableIter iter; + IOMMUPciBus *iommu_pci_bus; + int i; + + g_hash_table_iter_init(&iter, s->as_by_busptr); + while (g_hash_table_iter_next(&iter, NULL, (void **)&iommu_pci_bus)) { + for (i = 0; i < PCI_DEVFN_MAX; i++) { + if (!iommu_pci_bus->pbdev[i]) { + continue; + } + virtio_iommu_switch_address_space(iommu_pci_bus->pbdev[i]); + } + } +} + +/** + * The bus number is used for lookup when SID based operations occur. + * In that case we lazily populate the IOMMUPciBus array from the bus hash + * table. At the time the IOMMUPciBus is created (iommu_find_add_as), the bus + * numbers may not be always initialized yet. + */ +static IOMMUPciBus *iommu_find_iommu_pcibus(VirtIOIOMMU *s, uint8_t bus_num) +{ + IOMMUPciBus *iommu_pci_bus = s->iommu_pcibus_by_bus_num[bus_num]; + + if (!iommu_pci_bus) { + GHashTableIter iter; + + g_hash_table_iter_init(&iter, s->as_by_busptr); + while (g_hash_table_iter_next(&iter, NULL, (void **)&iommu_pci_bus)) { + if (pci_bus_num(iommu_pci_bus->bus) == bus_num) { + s->iommu_pcibus_by_bus_num[bus_num] = iommu_pci_bus; + return iommu_pci_bus; + } + } + return NULL; + } + return iommu_pci_bus; +} + +static IOMMUMemoryRegion *virtio_iommu_mr(VirtIOIOMMU *s, uint32_t sid) +{ + uint8_t bus_n, devfn; + IOMMUPciBus *iommu_pci_bus; + IOMMUDevice *dev; + + bus_n = PCI_BUS_NUM(sid); + iommu_pci_bus = iommu_find_iommu_pcibus(s, bus_n); + if (iommu_pci_bus) { + devfn = sid & (PCI_DEVFN_MAX - 1); + dev = iommu_pci_bus->pbdev[devfn]; + if (dev) { + return &dev->iommu_mr; + } + } + return NULL; +} + +static gint interval_cmp(gconstpointer a, gconstpointer b, gpointer user_data) +{ + VirtIOIOMMUInterval *inta = (VirtIOIOMMUInterval *)a; + VirtIOIOMMUInterval *intb = (VirtIOIOMMUInterval *)b; + + if (inta->high < intb->low) { + return -1; + } else if (intb->high < inta->low) { + return 1; + } else { + return 0; + } +} + +static void virtio_iommu_notify_map_unmap(IOMMUMemoryRegion *mr, + IOMMUTLBEvent *event, + hwaddr virt_start, hwaddr virt_end) +{ + uint64_t delta = virt_end - virt_start; + + event->entry.iova = virt_start; + event->entry.addr_mask = delta; + + if (delta == UINT64_MAX) { + memory_region_notify_iommu(mr, 0, *event); + } + + while (virt_start != virt_end + 1) { + uint64_t mask = dma_aligned_pow2_mask(virt_start, virt_end, 64); + + event->entry.addr_mask = mask; + event->entry.iova = virt_start; + memory_region_notify_iommu(mr, 0, *event); + virt_start += mask + 1; + if (event->entry.perm != IOMMU_NONE) { + event->entry.translated_addr += mask + 1; + } + } +} + +static void virtio_iommu_notify_map(IOMMUMemoryRegion *mr, hwaddr virt_start, + hwaddr virt_end, hwaddr paddr, + uint32_t flags) +{ + IOMMUTLBEvent event; + IOMMUAccessFlags perm = IOMMU_ACCESS_FLAG(flags & VIRTIO_IOMMU_MAP_F_READ, + flags & VIRTIO_IOMMU_MAP_F_WRITE); + + if (!(mr->iommu_notify_flags & IOMMU_NOTIFIER_MAP) || + (flags & VIRTIO_IOMMU_MAP_F_MMIO) || !perm) { + return; + } + + trace_virtio_iommu_notify_map(mr->parent_obj.name, virt_start, virt_end, + paddr, perm); + + event.type = IOMMU_NOTIFIER_MAP; + event.entry.target_as = &address_space_memory; + event.entry.perm = perm; + event.entry.translated_addr = paddr; + + virtio_iommu_notify_map_unmap(mr, &event, virt_start, virt_end); +} + +static void virtio_iommu_notify_unmap(IOMMUMemoryRegion *mr, hwaddr virt_start, + hwaddr virt_end) +{ + IOMMUTLBEvent event; + + if (!(mr->iommu_notify_flags & IOMMU_NOTIFIER_UNMAP)) { + return; + } + + trace_virtio_iommu_notify_unmap(mr->parent_obj.name, virt_start, virt_end); + + event.type = IOMMU_NOTIFIER_UNMAP; + event.entry.target_as = &address_space_memory; + event.entry.perm = IOMMU_NONE; + event.entry.translated_addr = 0; + + virtio_iommu_notify_map_unmap(mr, &event, virt_start, virt_end); +} + +static gboolean virtio_iommu_notify_unmap_cb(gpointer key, gpointer value, + gpointer data) +{ + VirtIOIOMMUInterval *interval = (VirtIOIOMMUInterval *) key; + IOMMUMemoryRegion *mr = (IOMMUMemoryRegion *) data; + + virtio_iommu_notify_unmap(mr, interval->low, interval->high); + + return false; +} + +static gboolean virtio_iommu_notify_map_cb(gpointer key, gpointer value, + gpointer data) +{ + VirtIOIOMMUMapping *mapping = (VirtIOIOMMUMapping *) value; + VirtIOIOMMUInterval *interval = (VirtIOIOMMUInterval *) key; + IOMMUMemoryRegion *mr = (IOMMUMemoryRegion *) data; + + virtio_iommu_notify_map(mr, interval->low, interval->high, + mapping->phys_addr, mapping->flags); + + return false; +} + +static void virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep) +{ + VirtIOIOMMUDomain *domain = ep->domain; + IOMMUDevice *sdev = container_of(ep->iommu_mr, IOMMUDevice, iommu_mr); + + if (!ep->domain) { + return; + } + g_tree_foreach(domain->mappings, virtio_iommu_notify_unmap_cb, + ep->iommu_mr); + QLIST_REMOVE(ep, next); + ep->domain = NULL; + virtio_iommu_switch_address_space(sdev); +} + +static VirtIOIOMMUEndpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s, + uint32_t ep_id) +{ + VirtIOIOMMUEndpoint *ep; + IOMMUMemoryRegion *mr; + + ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(ep_id)); + if (ep) { + return ep; + } + mr = virtio_iommu_mr(s, ep_id); + if (!mr) { + return NULL; + } + ep = g_malloc0(sizeof(*ep)); + ep->id = ep_id; + ep->iommu_mr = mr; + trace_virtio_iommu_get_endpoint(ep_id); + g_tree_insert(s->endpoints, GUINT_TO_POINTER(ep_id), ep); + return ep; +} + +static void virtio_iommu_put_endpoint(gpointer data) +{ + VirtIOIOMMUEndpoint *ep = (VirtIOIOMMUEndpoint *)data; + + if (ep->domain) { + virtio_iommu_detach_endpoint_from_domain(ep); + } + + trace_virtio_iommu_put_endpoint(ep->id); + g_free(ep); +} + +static VirtIOIOMMUDomain *virtio_iommu_get_domain(VirtIOIOMMU *s, + uint32_t domain_id, + bool bypass) +{ + VirtIOIOMMUDomain *domain; + + domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id)); + if (domain) { + if (domain->bypass != bypass) { + return NULL; + } + return domain; + } + domain = g_malloc0(sizeof(*domain)); + domain->id = domain_id; + domain->mappings = g_tree_new_full((GCompareDataFunc)interval_cmp, + NULL, (GDestroyNotify)g_free, + (GDestroyNotify)g_free); + domain->bypass = bypass; + g_tree_insert(s->domains, GUINT_TO_POINTER(domain_id), domain); + QLIST_INIT(&domain->endpoint_list); + trace_virtio_iommu_get_domain(domain_id); + return domain; +} + +static void virtio_iommu_put_domain(gpointer data) +{ + VirtIOIOMMUDomain *domain = (VirtIOIOMMUDomain *)data; + VirtIOIOMMUEndpoint *iter, *tmp; + + QLIST_FOREACH_SAFE(iter, &domain->endpoint_list, next, tmp) { + virtio_iommu_detach_endpoint_from_domain(iter); + } + g_tree_destroy(domain->mappings); + trace_virtio_iommu_put_domain(domain->id); + g_free(domain); +} + +static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque, + int devfn) +{ + VirtIOIOMMU *s = opaque; + IOMMUPciBus *sbus = g_hash_table_lookup(s->as_by_busptr, bus); + static uint32_t mr_index; + IOMMUDevice *sdev; + + if (!sbus) { + sbus = g_malloc0(sizeof(IOMMUPciBus) + + sizeof(IOMMUDevice *) * PCI_DEVFN_MAX); + sbus->bus = bus; + g_hash_table_insert(s->as_by_busptr, bus, sbus); + } + + sdev = sbus->pbdev[devfn]; + if (!sdev) { + char *name = g_strdup_printf("%s-%d-%d", + TYPE_VIRTIO_IOMMU_MEMORY_REGION, + mr_index++, devfn); + sdev = sbus->pbdev[devfn] = g_new0(IOMMUDevice, 1); + + sdev->viommu = s; + sdev->bus = bus; + sdev->devfn = devfn; + + trace_virtio_iommu_init_iommu_mr(name); + + memory_region_init(&sdev->root, OBJECT(s), name, UINT64_MAX); + address_space_init(&sdev->as, &sdev->root, TYPE_VIRTIO_IOMMU); + + /* + * Build the IOMMU disabled container with aliases to the + * shared MRs. Note that aliasing to a shared memory region + * could help the memory API to detect same FlatViews so we + * can have devices to share the same FlatView when in bypass + * mode. (either by not configuring virtio-iommu driver or with + * "iommu=pt"). It will greatly reduce the total number of + * FlatViews of the system hence VM runs faster. + */ + memory_region_init_alias(&sdev->bypass_mr, OBJECT(s), + "system", get_system_memory(), 0, + memory_region_size(get_system_memory())); + + memory_region_init_iommu(&sdev->iommu_mr, sizeof(sdev->iommu_mr), + TYPE_VIRTIO_IOMMU_MEMORY_REGION, + OBJECT(s), name, + UINT64_MAX); + + /* + * Hook both the containers under the root container, we + * switch between iommu & bypass MRs by enable/disable + * corresponding sub-containers + */ + memory_region_add_subregion_overlap(&sdev->root, 0, + MEMORY_REGION(&sdev->iommu_mr), + 0); + memory_region_add_subregion_overlap(&sdev->root, 0, + &sdev->bypass_mr, 0); + + virtio_iommu_switch_address_space(sdev); + g_free(name); + } + return &sdev->as; +} + +static int virtio_iommu_attach(VirtIOIOMMU *s, + struct virtio_iommu_req_attach *req) +{ + uint32_t domain_id = le32_to_cpu(req->domain); + uint32_t ep_id = le32_to_cpu(req->endpoint); + uint32_t flags = le32_to_cpu(req->flags); + VirtIOIOMMUDomain *domain; + VirtIOIOMMUEndpoint *ep; + IOMMUDevice *sdev; + + trace_virtio_iommu_attach(domain_id, ep_id); + + if (flags & ~VIRTIO_IOMMU_ATTACH_F_BYPASS) { + return VIRTIO_IOMMU_S_INVAL; + } + + ep = virtio_iommu_get_endpoint(s, ep_id); + if (!ep) { + return VIRTIO_IOMMU_S_NOENT; + } + + if (ep->domain) { + VirtIOIOMMUDomain *previous_domain = ep->domain; + /* + * the device is already attached to a domain, + * detach it first + */ + virtio_iommu_detach_endpoint_from_domain(ep); + if (QLIST_EMPTY(&previous_domain->endpoint_list)) { + g_tree_remove(s->domains, GUINT_TO_POINTER(previous_domain->id)); + } + } + + domain = virtio_iommu_get_domain(s, domain_id, + flags & VIRTIO_IOMMU_ATTACH_F_BYPASS); + if (!domain) { + /* Incompatible bypass flag */ + return VIRTIO_IOMMU_S_INVAL; + } + QLIST_INSERT_HEAD(&domain->endpoint_list, ep, next); + + ep->domain = domain; + sdev = container_of(ep->iommu_mr, IOMMUDevice, iommu_mr); + virtio_iommu_switch_address_space(sdev); + + /* Replay domain mappings on the associated memory region */ + g_tree_foreach(domain->mappings, virtio_iommu_notify_map_cb, + ep->iommu_mr); + + return VIRTIO_IOMMU_S_OK; +} + +static int virtio_iommu_detach(VirtIOIOMMU *s, + struct virtio_iommu_req_detach *req) +{ + uint32_t domain_id = le32_to_cpu(req->domain); + uint32_t ep_id = le32_to_cpu(req->endpoint); + VirtIOIOMMUDomain *domain; + VirtIOIOMMUEndpoint *ep; + + trace_virtio_iommu_detach(domain_id, ep_id); + + ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(ep_id)); + if (!ep) { + return VIRTIO_IOMMU_S_NOENT; + } + + domain = ep->domain; + + if (!domain || domain->id != domain_id) { + return VIRTIO_IOMMU_S_INVAL; + } + + virtio_iommu_detach_endpoint_from_domain(ep); + + if (QLIST_EMPTY(&domain->endpoint_list)) { + g_tree_remove(s->domains, GUINT_TO_POINTER(domain->id)); + } + return VIRTIO_IOMMU_S_OK; +} + +static int virtio_iommu_map(VirtIOIOMMU *s, + struct virtio_iommu_req_map *req) +{ + uint32_t domain_id = le32_to_cpu(req->domain); + uint64_t phys_start = le64_to_cpu(req->phys_start); + uint64_t virt_start = le64_to_cpu(req->virt_start); + uint64_t virt_end = le64_to_cpu(req->virt_end); + uint32_t flags = le32_to_cpu(req->flags); + VirtIOIOMMUDomain *domain; + VirtIOIOMMUInterval *interval; + VirtIOIOMMUMapping *mapping; + VirtIOIOMMUEndpoint *ep; + + if (flags & ~VIRTIO_IOMMU_MAP_F_MASK) { + return VIRTIO_IOMMU_S_INVAL; + } + + domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id)); + if (!domain) { + return VIRTIO_IOMMU_S_NOENT; + } + + if (domain->bypass) { + return VIRTIO_IOMMU_S_INVAL; + } + + interval = g_malloc0(sizeof(*interval)); + + interval->low = virt_start; + interval->high = virt_end; + + mapping = g_tree_lookup(domain->mappings, (gpointer)interval); + if (mapping) { + g_free(interval); + return VIRTIO_IOMMU_S_INVAL; + } + + trace_virtio_iommu_map(domain_id, virt_start, virt_end, phys_start, flags); + + mapping = g_malloc0(sizeof(*mapping)); + mapping->phys_addr = phys_start; + mapping->flags = flags; + + g_tree_insert(domain->mappings, interval, mapping); + + QLIST_FOREACH(ep, &domain->endpoint_list, next) { + virtio_iommu_notify_map(ep->iommu_mr, virt_start, virt_end, phys_start, + flags); + } + + return VIRTIO_IOMMU_S_OK; +} + +static int virtio_iommu_unmap(VirtIOIOMMU *s, + struct virtio_iommu_req_unmap *req) +{ + uint32_t domain_id = le32_to_cpu(req->domain); + uint64_t virt_start = le64_to_cpu(req->virt_start); + uint64_t virt_end = le64_to_cpu(req->virt_end); + VirtIOIOMMUMapping *iter_val; + VirtIOIOMMUInterval interval, *iter_key; + VirtIOIOMMUDomain *domain; + VirtIOIOMMUEndpoint *ep; + int ret = VIRTIO_IOMMU_S_OK; + + trace_virtio_iommu_unmap(domain_id, virt_start, virt_end); + + domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id)); + if (!domain) { + return VIRTIO_IOMMU_S_NOENT; + } + + if (domain->bypass) { + return VIRTIO_IOMMU_S_INVAL; + } + + interval.low = virt_start; + interval.high = virt_end; + + while (g_tree_lookup_extended(domain->mappings, &interval, + (void **)&iter_key, (void**)&iter_val)) { + uint64_t current_low = iter_key->low; + uint64_t current_high = iter_key->high; + + if (interval.low <= current_low && interval.high >= current_high) { + QLIST_FOREACH(ep, &domain->endpoint_list, next) { + virtio_iommu_notify_unmap(ep->iommu_mr, current_low, + current_high); + } + g_tree_remove(domain->mappings, iter_key); + trace_virtio_iommu_unmap_done(domain_id, current_low, current_high); + } else { + ret = VIRTIO_IOMMU_S_RANGE; + break; + } + } + return ret; +} + +static ssize_t virtio_iommu_fill_resv_mem_prop(VirtIOIOMMU *s, uint32_t ep, + uint8_t *buf, size_t free) +{ + struct virtio_iommu_probe_resv_mem prop = {}; + size_t size = sizeof(prop), length = size - sizeof(prop.head), total; + int i; + + total = size * s->nb_reserved_regions; + + if (total > free) { + return -ENOSPC; + } + + for (i = 0; i < s->nb_reserved_regions; i++) { + unsigned subtype = s->reserved_regions[i].type; + + assert(subtype == VIRTIO_IOMMU_RESV_MEM_T_RESERVED || + subtype == VIRTIO_IOMMU_RESV_MEM_T_MSI); + prop.head.type = cpu_to_le16(VIRTIO_IOMMU_PROBE_T_RESV_MEM); + prop.head.length = cpu_to_le16(length); + prop.subtype = subtype; + prop.start = cpu_to_le64(s->reserved_regions[i].low); + prop.end = cpu_to_le64(s->reserved_regions[i].high); + + memcpy(buf, &prop, size); + + trace_virtio_iommu_fill_resv_property(ep, prop.subtype, + prop.start, prop.end); + buf += size; + } + return total; +} + +/** + * virtio_iommu_probe - Fill the probe request buffer with + * the properties the device is able to return + */ +static int virtio_iommu_probe(VirtIOIOMMU *s, + struct virtio_iommu_req_probe *req, + uint8_t *buf) +{ + uint32_t ep_id = le32_to_cpu(req->endpoint); + size_t free = VIOMMU_PROBE_SIZE; + ssize_t count; + + if (!virtio_iommu_mr(s, ep_id)) { + return VIRTIO_IOMMU_S_NOENT; + } + + count = virtio_iommu_fill_resv_mem_prop(s, ep_id, buf, free); + if (count < 0) { + return VIRTIO_IOMMU_S_INVAL; + } + buf += count; + free -= count; + + return VIRTIO_IOMMU_S_OK; +} + +static int virtio_iommu_iov_to_req(struct iovec *iov, + unsigned int iov_cnt, + void *req, size_t payload_sz) +{ + size_t sz = iov_to_buf(iov, iov_cnt, 0, req, payload_sz); + + if (unlikely(sz != payload_sz)) { + return VIRTIO_IOMMU_S_INVAL; + } + return 0; +} + +#define virtio_iommu_handle_req(__req) \ +static int virtio_iommu_handle_ ## __req(VirtIOIOMMU *s, \ + struct iovec *iov, \ + unsigned int iov_cnt) \ +{ \ + struct virtio_iommu_req_ ## __req req; \ + int ret = virtio_iommu_iov_to_req(iov, iov_cnt, &req, \ + sizeof(req) - sizeof(struct virtio_iommu_req_tail));\ + \ + return ret ? ret : virtio_iommu_ ## __req(s, &req); \ +} + +virtio_iommu_handle_req(attach) +virtio_iommu_handle_req(detach) +virtio_iommu_handle_req(map) +virtio_iommu_handle_req(unmap) + +static int virtio_iommu_handle_probe(VirtIOIOMMU *s, + struct iovec *iov, + unsigned int iov_cnt, + uint8_t *buf) +{ + struct virtio_iommu_req_probe req; + int ret = virtio_iommu_iov_to_req(iov, iov_cnt, &req, sizeof(req)); + + return ret ? ret : virtio_iommu_probe(s, &req, buf); +} + +static void virtio_iommu_handle_command(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIOIOMMU *s = VIRTIO_IOMMU(vdev); + struct virtio_iommu_req_head head; + struct virtio_iommu_req_tail tail = {}; + size_t output_size = sizeof(tail), sz; + VirtQueueElement *elem; + unsigned int iov_cnt; + struct iovec *iov; + void *buf = NULL; + + for (;;) { + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); + if (!elem) { + return; + } + + if (iov_size(elem->in_sg, elem->in_num) < sizeof(tail) || + iov_size(elem->out_sg, elem->out_num) < sizeof(head)) { + virtio_error(vdev, "virtio-iommu bad head/tail size"); + virtqueue_detach_element(vq, elem, 0); + g_free(elem); + break; + } + + iov_cnt = elem->out_num; + iov = elem->out_sg; + sz = iov_to_buf(iov, iov_cnt, 0, &head, sizeof(head)); + if (unlikely(sz != sizeof(head))) { + tail.status = VIRTIO_IOMMU_S_DEVERR; + goto out; + } + qemu_rec_mutex_lock(&s->mutex); + switch (head.type) { + case VIRTIO_IOMMU_T_ATTACH: + tail.status = virtio_iommu_handle_attach(s, iov, iov_cnt); + break; + case VIRTIO_IOMMU_T_DETACH: + tail.status = virtio_iommu_handle_detach(s, iov, iov_cnt); + break; + case VIRTIO_IOMMU_T_MAP: + tail.status = virtio_iommu_handle_map(s, iov, iov_cnt); + break; + case VIRTIO_IOMMU_T_UNMAP: + tail.status = virtio_iommu_handle_unmap(s, iov, iov_cnt); + break; + case VIRTIO_IOMMU_T_PROBE: + { + struct virtio_iommu_req_tail *ptail; + + output_size = s->config.probe_size + sizeof(tail); + buf = g_malloc0(output_size); + + ptail = (struct virtio_iommu_req_tail *) + (buf + s->config.probe_size); + ptail->status = virtio_iommu_handle_probe(s, iov, iov_cnt, buf); + break; + } + default: + tail.status = VIRTIO_IOMMU_S_UNSUPP; + } + qemu_rec_mutex_unlock(&s->mutex); + +out: + sz = iov_from_buf(elem->in_sg, elem->in_num, 0, + buf ? buf : &tail, output_size); + assert(sz == output_size); + + virtqueue_push(vq, elem, sz); + virtio_notify(vdev, vq); + g_free(elem); + g_free(buf); + buf = NULL; + } +} + +static void virtio_iommu_report_fault(VirtIOIOMMU *viommu, uint8_t reason, + int flags, uint32_t endpoint, + uint64_t address) +{ + VirtIODevice *vdev = &viommu->parent_obj; + VirtQueue *vq = viommu->event_vq; + struct virtio_iommu_fault fault; + VirtQueueElement *elem; + size_t sz; + + memset(&fault, 0, sizeof(fault)); + fault.reason = reason; + fault.flags = cpu_to_le32(flags); + fault.endpoint = cpu_to_le32(endpoint); + fault.address = cpu_to_le64(address); + + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); + + if (!elem) { + error_report_once( + "no buffer available in event queue to report event"); + return; + } + + if (iov_size(elem->in_sg, elem->in_num) < sizeof(fault)) { + virtio_error(vdev, "error buffer of wrong size"); + virtqueue_detach_element(vq, elem, 0); + g_free(elem); + return; + } + + sz = iov_from_buf(elem->in_sg, elem->in_num, 0, + &fault, sizeof(fault)); + assert(sz == sizeof(fault)); + + trace_virtio_iommu_report_fault(reason, flags, endpoint, address); + virtqueue_push(vq, elem, sz); + virtio_notify(vdev, vq); + g_free(elem); + +} + +static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr, + IOMMUAccessFlags flag, + int iommu_idx) +{ + IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr); + VirtIOIOMMUInterval interval, *mapping_key; + VirtIOIOMMUMapping *mapping_value; + VirtIOIOMMU *s = sdev->viommu; + bool read_fault, write_fault; + VirtIOIOMMUEndpoint *ep; + uint32_t sid, flags; + bool bypass_allowed; + bool found; + int i; + + interval.low = addr; + interval.high = addr + 1; + + IOMMUTLBEntry entry = { + .target_as = &address_space_memory, + .iova = addr, + .translated_addr = addr, + .addr_mask = (1 << ctz32(s->config.page_size_mask)) - 1, + .perm = IOMMU_NONE, + }; + + bypass_allowed = s->config.bypass; + + sid = virtio_iommu_get_bdf(sdev); + + trace_virtio_iommu_translate(mr->parent_obj.name, sid, addr, flag); + qemu_rec_mutex_lock(&s->mutex); + + ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid)); + + if (bypass_allowed) + assert(ep && ep->domain && !ep->domain->bypass); + + if (!ep) { + if (!bypass_allowed) { + error_report_once("%s sid=%d is not known!!", __func__, sid); + virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_UNKNOWN, + VIRTIO_IOMMU_FAULT_F_ADDRESS, + sid, addr); + } else { + entry.perm = flag; + } + goto unlock; + } + + for (i = 0; i < s->nb_reserved_regions; i++) { + ReservedRegion *reg = &s->reserved_regions[i]; + + if (addr >= reg->low && addr <= reg->high) { + switch (reg->type) { + case VIRTIO_IOMMU_RESV_MEM_T_MSI: + entry.perm = flag; + break; + case VIRTIO_IOMMU_RESV_MEM_T_RESERVED: + default: + virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING, + VIRTIO_IOMMU_FAULT_F_ADDRESS, + sid, addr); + break; + } + goto unlock; + } + } + + if (!ep->domain) { + if (!bypass_allowed) { + error_report_once("%s %02x:%02x.%01x not attached to any domain", + __func__, PCI_BUS_NUM(sid), + PCI_SLOT(sid), PCI_FUNC(sid)); + virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_DOMAIN, + VIRTIO_IOMMU_FAULT_F_ADDRESS, + sid, addr); + } else { + entry.perm = flag; + } + goto unlock; + } else if (ep->domain->bypass) { + entry.perm = flag; + goto unlock; + } + + found = g_tree_lookup_extended(ep->domain->mappings, (gpointer)(&interval), + (void **)&mapping_key, + (void **)&mapping_value); + if (!found) { + error_report_once("%s no mapping for 0x%"PRIx64" for sid=%d", + __func__, addr, sid); + virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING, + VIRTIO_IOMMU_FAULT_F_ADDRESS, + sid, addr); + goto unlock; + } + + read_fault = (flag & IOMMU_RO) && + !(mapping_value->flags & VIRTIO_IOMMU_MAP_F_READ); + write_fault = (flag & IOMMU_WO) && + !(mapping_value->flags & VIRTIO_IOMMU_MAP_F_WRITE); + + flags = read_fault ? VIRTIO_IOMMU_FAULT_F_READ : 0; + flags |= write_fault ? VIRTIO_IOMMU_FAULT_F_WRITE : 0; + if (flags) { + error_report_once("%s permission error on 0x%"PRIx64"(%d): allowed=%d", + __func__, addr, flag, mapping_value->flags); + flags |= VIRTIO_IOMMU_FAULT_F_ADDRESS; + virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING, + flags | VIRTIO_IOMMU_FAULT_F_ADDRESS, + sid, addr); + goto unlock; + } + entry.translated_addr = addr - mapping_key->low + mapping_value->phys_addr; + entry.perm = flag; + trace_virtio_iommu_translate_out(addr, entry.translated_addr, sid); + +unlock: + qemu_rec_mutex_unlock(&s->mutex); + return entry; +} + +static void virtio_iommu_get_config(VirtIODevice *vdev, uint8_t *config_data) +{ + VirtIOIOMMU *dev = VIRTIO_IOMMU(vdev); + struct virtio_iommu_config *dev_config = &dev->config; + struct virtio_iommu_config *out_config = (void *)config_data; + + out_config->page_size_mask = cpu_to_le64(dev_config->page_size_mask); + out_config->input_range.start = cpu_to_le64(dev_config->input_range.start); + out_config->input_range.end = cpu_to_le64(dev_config->input_range.end); + out_config->domain_range.start = cpu_to_le32(dev_config->domain_range.start); + out_config->domain_range.end = cpu_to_le32(dev_config->domain_range.end); + out_config->probe_size = cpu_to_le32(dev_config->probe_size); + out_config->bypass = dev_config->bypass; + + trace_virtio_iommu_get_config(dev_config->page_size_mask, + dev_config->input_range.start, + dev_config->input_range.end, + dev_config->domain_range.start, + dev_config->domain_range.end, + dev_config->probe_size, + dev_config->bypass); +} + +static void virtio_iommu_set_config(VirtIODevice *vdev, + const uint8_t *config_data) +{ + VirtIOIOMMU *dev = VIRTIO_IOMMU(vdev); + struct virtio_iommu_config *dev_config = &dev->config; + const struct virtio_iommu_config *in_config = (void *)config_data; + + if (in_config->bypass != dev_config->bypass) { + if (!virtio_vdev_has_feature(vdev, VIRTIO_IOMMU_F_BYPASS_CONFIG)) { + virtio_error(vdev, "cannot set config.bypass"); + return; + } else if (in_config->bypass != 0 && in_config->bypass != 1) { + virtio_error(vdev, "invalid config.bypass value '%u'", + in_config->bypass); + return; + } + dev_config->bypass = in_config->bypass; + virtio_iommu_switch_address_space_all(dev); + } + + trace_virtio_iommu_set_config(in_config->bypass); +} + +static uint64_t virtio_iommu_get_features(VirtIODevice *vdev, uint64_t f, + Error **errp) +{ + VirtIOIOMMU *dev = VIRTIO_IOMMU(vdev); + + f |= dev->features; + trace_virtio_iommu_get_features(f); + return f; +} + +static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data) +{ + guint ua = GPOINTER_TO_UINT(a); + guint ub = GPOINTER_TO_UINT(b); + return (ua > ub) - (ua < ub); +} + +static gboolean virtio_iommu_remap(gpointer key, gpointer value, gpointer data) +{ + VirtIOIOMMUMapping *mapping = (VirtIOIOMMUMapping *) value; + VirtIOIOMMUInterval *interval = (VirtIOIOMMUInterval *) key; + IOMMUMemoryRegion *mr = (IOMMUMemoryRegion *) data; + + trace_virtio_iommu_remap(mr->parent_obj.name, interval->low, interval->high, + mapping->phys_addr); + virtio_iommu_notify_map(mr, interval->low, interval->high, + mapping->phys_addr, mapping->flags); + return false; +} + +static void virtio_iommu_replay(IOMMUMemoryRegion *mr, IOMMUNotifier *n) +{ + IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr); + VirtIOIOMMU *s = sdev->viommu; + uint32_t sid; + VirtIOIOMMUEndpoint *ep; + + sid = virtio_iommu_get_bdf(sdev); + + qemu_rec_mutex_lock(&s->mutex); + + if (!s->endpoints) { + goto unlock; + } + + ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid)); + if (!ep || !ep->domain) { + goto unlock; + } + + g_tree_foreach(ep->domain->mappings, virtio_iommu_remap, mr); + +unlock: + qemu_rec_mutex_unlock(&s->mutex); +} + +static int virtio_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu_mr, + IOMMUNotifierFlag old, + IOMMUNotifierFlag new, + Error **errp) +{ + if (new & IOMMU_NOTIFIER_DEVIOTLB_UNMAP) { + error_setg(errp, "Virtio-iommu does not support dev-iotlb yet"); + return -EINVAL; + } + + if (old == IOMMU_NOTIFIER_NONE) { + trace_virtio_iommu_notify_flag_add(iommu_mr->parent_obj.name); + } else if (new == IOMMU_NOTIFIER_NONE) { + trace_virtio_iommu_notify_flag_del(iommu_mr->parent_obj.name); + } + return 0; +} + +/* + * The default mask (TARGET_PAGE_MASK) is the smallest supported guest granule, + * for example 0xfffffffffffff000. When an assigned device has page size + * restrictions due to the hardware IOMMU configuration, apply this restriction + * to the mask. + */ +static int virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr, + uint64_t new_mask, + Error **errp) +{ + IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr); + VirtIOIOMMU *s = sdev->viommu; + uint64_t cur_mask = s->config.page_size_mask; + + trace_virtio_iommu_set_page_size_mask(mr->parent_obj.name, cur_mask, + new_mask); + + if ((cur_mask & new_mask) == 0) { + error_setg(errp, "virtio-iommu page mask 0x%"PRIx64 + " is incompatible with mask 0x%"PRIx64, cur_mask, new_mask); + return -1; + } + + /* + * After the machine is finalized, we can't change the mask anymore. If by + * chance the hotplugged device supports the same granule, we can still + * accept it. Having a different masks is possible but the guest will use + * sub-optimal block sizes, so warn about it. + */ + if (phase_check(PHASE_MACHINE_READY)) { + int new_granule = ctz64(new_mask); + int cur_granule = ctz64(cur_mask); + + if (new_granule != cur_granule) { + error_setg(errp, "virtio-iommu page mask 0x%"PRIx64 + " is incompatible with mask 0x%"PRIx64, cur_mask, + new_mask); + return -1; + } else if (new_mask != cur_mask) { + warn_report("virtio-iommu page mask 0x%"PRIx64 + " does not match 0x%"PRIx64, cur_mask, new_mask); + } + return 0; + } + + s->config.page_size_mask &= new_mask; + return 0; +} + +static void virtio_iommu_system_reset(void *opaque) +{ + VirtIOIOMMU *s = opaque; + + trace_virtio_iommu_system_reset(); + + /* + * config.bypass is sticky across device reset, but should be restored on + * system reset + */ + s->config.bypass = s->boot_bypass; + virtio_iommu_switch_address_space_all(s); + +} + +static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VirtIOIOMMU *s = VIRTIO_IOMMU(dev); + + virtio_init(vdev, VIRTIO_ID_IOMMU, sizeof(struct virtio_iommu_config)); + + memset(s->iommu_pcibus_by_bus_num, 0, sizeof(s->iommu_pcibus_by_bus_num)); + + s->req_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE, + virtio_iommu_handle_command); + s->event_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE, NULL); + + /* + * config.bypass is needed to get initial address space early, such as + * in vfio realize + */ + s->config.bypass = s->boot_bypass; + s->config.page_size_mask = TARGET_PAGE_MASK; + s->config.input_range.end = UINT64_MAX; + s->config.domain_range.end = UINT32_MAX; + s->config.probe_size = VIOMMU_PROBE_SIZE; + + virtio_add_feature(&s->features, VIRTIO_RING_F_EVENT_IDX); + virtio_add_feature(&s->features, VIRTIO_RING_F_INDIRECT_DESC); + virtio_add_feature(&s->features, VIRTIO_F_VERSION_1); + virtio_add_feature(&s->features, VIRTIO_IOMMU_F_INPUT_RANGE); + virtio_add_feature(&s->features, VIRTIO_IOMMU_F_DOMAIN_RANGE); + virtio_add_feature(&s->features, VIRTIO_IOMMU_F_MAP_UNMAP); + virtio_add_feature(&s->features, VIRTIO_IOMMU_F_MMIO); + virtio_add_feature(&s->features, VIRTIO_IOMMU_F_PROBE); + virtio_add_feature(&s->features, VIRTIO_IOMMU_F_BYPASS_CONFIG); + + qemu_rec_mutex_init(&s->mutex); + + s->as_by_busptr = g_hash_table_new_full(NULL, NULL, NULL, g_free); + + if (s->primary_bus) { + pci_setup_iommu(s->primary_bus, virtio_iommu_find_add_as, s); + } else { + error_setg(errp, "VIRTIO-IOMMU is not attached to any PCI bus!"); + } + + qemu_register_reset(virtio_iommu_system_reset, s); +} + +static void virtio_iommu_device_unrealize(DeviceState *dev) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VirtIOIOMMU *s = VIRTIO_IOMMU(dev); + + qemu_unregister_reset(virtio_iommu_system_reset, s); + + g_hash_table_destroy(s->as_by_busptr); + if (s->domains) { + g_tree_destroy(s->domains); + } + if (s->endpoints) { + g_tree_destroy(s->endpoints); + } + + qemu_rec_mutex_destroy(&s->mutex); + + virtio_delete_queue(s->req_vq); + virtio_delete_queue(s->event_vq); + virtio_cleanup(vdev); +} + +static void virtio_iommu_device_reset(VirtIODevice *vdev) +{ + VirtIOIOMMU *s = VIRTIO_IOMMU(vdev); + + trace_virtio_iommu_device_reset(); + + if (s->domains) { + g_tree_destroy(s->domains); + } + if (s->endpoints) { + g_tree_destroy(s->endpoints); + } + s->domains = g_tree_new_full((GCompareDataFunc)int_cmp, + NULL, NULL, virtio_iommu_put_domain); + s->endpoints = g_tree_new_full((GCompareDataFunc)int_cmp, + NULL, NULL, virtio_iommu_put_endpoint); +} + +static void virtio_iommu_set_status(VirtIODevice *vdev, uint8_t status) +{ + trace_virtio_iommu_device_status(status); +} + +static void virtio_iommu_instance_init(Object *obj) +{ +} + +#define VMSTATE_INTERVAL \ +{ \ + .name = "interval", \ + .version_id = 1, \ + .minimum_version_id = 1, \ + .fields = (VMStateField[]) { \ + VMSTATE_UINT64(low, VirtIOIOMMUInterval), \ + VMSTATE_UINT64(high, VirtIOIOMMUInterval), \ + VMSTATE_END_OF_LIST() \ + } \ +} + +#define VMSTATE_MAPPING \ +{ \ + .name = "mapping", \ + .version_id = 1, \ + .minimum_version_id = 1, \ + .fields = (VMStateField[]) { \ + VMSTATE_UINT64(phys_addr, VirtIOIOMMUMapping),\ + VMSTATE_UINT32(flags, VirtIOIOMMUMapping), \ + VMSTATE_END_OF_LIST() \ + }, \ +} + +static const VMStateDescription vmstate_interval_mapping[2] = { + VMSTATE_MAPPING, /* value */ + VMSTATE_INTERVAL /* key */ +}; + +static int domain_preload(void *opaque) +{ + VirtIOIOMMUDomain *domain = opaque; + + domain->mappings = g_tree_new_full((GCompareDataFunc)interval_cmp, + NULL, g_free, g_free); + return 0; +} + +static const VMStateDescription vmstate_endpoint = { + .name = "endpoint", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(id, VirtIOIOMMUEndpoint), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_domain = { + .name = "domain", + .version_id = 2, + .minimum_version_id = 2, + .pre_load = domain_preload, + .fields = (VMStateField[]) { + VMSTATE_UINT32(id, VirtIOIOMMUDomain), + VMSTATE_GTREE_V(mappings, VirtIOIOMMUDomain, 1, + vmstate_interval_mapping, + VirtIOIOMMUInterval, VirtIOIOMMUMapping), + VMSTATE_QLIST_V(endpoint_list, VirtIOIOMMUDomain, 1, + vmstate_endpoint, VirtIOIOMMUEndpoint, next), + VMSTATE_BOOL_V(bypass, VirtIOIOMMUDomain, 2), + VMSTATE_END_OF_LIST() + } +}; + +static gboolean reconstruct_endpoints(gpointer key, gpointer value, + gpointer data) +{ + VirtIOIOMMU *s = (VirtIOIOMMU *)data; + VirtIOIOMMUDomain *d = (VirtIOIOMMUDomain *)value; + VirtIOIOMMUEndpoint *iter; + IOMMUMemoryRegion *mr; + + QLIST_FOREACH(iter, &d->endpoint_list, next) { + mr = virtio_iommu_mr(s, iter->id); + assert(mr); + + iter->domain = d; + iter->iommu_mr = mr; + g_tree_insert(s->endpoints, GUINT_TO_POINTER(iter->id), iter); + } + return false; /* continue the domain traversal */ +} + +static int iommu_post_load(void *opaque, int version_id) +{ + VirtIOIOMMU *s = opaque; + + g_tree_foreach(s->domains, reconstruct_endpoints, s); + + /* + * Memory regions are dynamically turned on/off depending on + * 'config.bypass' and attached domain type if there is. After + * migration, we need to make sure the memory regions are + * still correct. + */ + virtio_iommu_switch_address_space_all(s); + return 0; +} + +static const VMStateDescription vmstate_virtio_iommu_device = { + .name = "virtio-iommu-device", + .minimum_version_id = 2, + .version_id = 2, + .post_load = iommu_post_load, + .fields = (VMStateField[]) { + VMSTATE_GTREE_DIRECT_KEY_V(domains, VirtIOIOMMU, 2, + &vmstate_domain, VirtIOIOMMUDomain), + VMSTATE_UINT8_V(config.bypass, VirtIOIOMMU, 2), + VMSTATE_END_OF_LIST() + }, +}; + +static const VMStateDescription vmstate_virtio_iommu = { + .name = "virtio-iommu", + .minimum_version_id = 2, + .priority = MIG_PRI_IOMMU, + .version_id = 2, + .fields = (VMStateField[]) { + VMSTATE_VIRTIO_DEVICE, + VMSTATE_END_OF_LIST() + }, +}; + +static Property virtio_iommu_properties[] = { + DEFINE_PROP_LINK("primary-bus", VirtIOIOMMU, primary_bus, "PCI", PCIBus *), + DEFINE_PROP_BOOL("boot-bypass", VirtIOIOMMU, boot_bypass, true), + DEFINE_PROP_END_OF_LIST(), +}; + +static void virtio_iommu_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + + device_class_set_props(dc, virtio_iommu_properties); + dc->vmsd = &vmstate_virtio_iommu; + + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + vdc->realize = virtio_iommu_device_realize; + vdc->unrealize = virtio_iommu_device_unrealize; + vdc->reset = virtio_iommu_device_reset; + vdc->get_config = virtio_iommu_get_config; + vdc->set_config = virtio_iommu_set_config; + vdc->get_features = virtio_iommu_get_features; + vdc->set_status = virtio_iommu_set_status; + vdc->vmsd = &vmstate_virtio_iommu_device; +} + +static void virtio_iommu_memory_region_class_init(ObjectClass *klass, + void *data) +{ + IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass); + + imrc->translate = virtio_iommu_translate; + imrc->replay = virtio_iommu_replay; + imrc->notify_flag_changed = virtio_iommu_notify_flag_changed; + imrc->iommu_set_page_size_mask = virtio_iommu_set_page_size_mask; +} + +static const TypeInfo virtio_iommu_info = { + .name = TYPE_VIRTIO_IOMMU, + .parent = TYPE_VIRTIO_DEVICE, + .instance_size = sizeof(VirtIOIOMMU), + .instance_init = virtio_iommu_instance_init, + .class_init = virtio_iommu_class_init, +}; + +static const TypeInfo virtio_iommu_memory_region_info = { + .parent = TYPE_IOMMU_MEMORY_REGION, + .name = TYPE_VIRTIO_IOMMU_MEMORY_REGION, + .class_init = virtio_iommu_memory_region_class_init, +}; + +static void virtio_register_types(void) +{ + type_register_static(&virtio_iommu_info); + type_register_static(&virtio_iommu_memory_region_info); +} + +type_init(virtio_register_types) diff --git a/hw/virtio/virtio-mem-pci.c b/hw/virtio/virtio-mem-pci.c new file mode 100644 index 00000000..5c5c1e3a --- /dev/null +++ b/hw/virtio/virtio-mem-pci.c @@ -0,0 +1,161 @@ +/* + * Virtio MEM PCI device + * + * Copyright (C) 2020 Red Hat, Inc. + * + * Authors: + * David Hildenbrand <david@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "virtio-mem-pci.h" +#include "hw/mem/memory-device.h" +#include "qapi/error.h" +#include "qapi/qapi-events-machine.h" +#include "qapi/qapi-events-misc.h" + +static void virtio_mem_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VirtIOMEMPCI *mem_pci = VIRTIO_MEM_PCI(vpci_dev); + DeviceState *vdev = DEVICE(&mem_pci->vdev); + + virtio_pci_force_virtio_1(vpci_dev); + qdev_realize(vdev, BUS(&vpci_dev->bus), errp); +} + +static void virtio_mem_pci_set_addr(MemoryDeviceState *md, uint64_t addr, + Error **errp) +{ + object_property_set_uint(OBJECT(md), VIRTIO_MEM_ADDR_PROP, addr, errp); +} + +static uint64_t virtio_mem_pci_get_addr(const MemoryDeviceState *md) +{ + return object_property_get_uint(OBJECT(md), VIRTIO_MEM_ADDR_PROP, + &error_abort); +} + +static MemoryRegion *virtio_mem_pci_get_memory_region(MemoryDeviceState *md, + Error **errp) +{ + VirtIOMEMPCI *pci_mem = VIRTIO_MEM_PCI(md); + VirtIOMEM *vmem = VIRTIO_MEM(&pci_mem->vdev); + VirtIOMEMClass *vmc = VIRTIO_MEM_GET_CLASS(vmem); + + return vmc->get_memory_region(vmem, errp); +} + +static uint64_t virtio_mem_pci_get_plugged_size(const MemoryDeviceState *md, + Error **errp) +{ + return object_property_get_uint(OBJECT(md), VIRTIO_MEM_SIZE_PROP, + errp); +} + +static void virtio_mem_pci_fill_device_info(const MemoryDeviceState *md, + MemoryDeviceInfo *info) +{ + VirtioMEMDeviceInfo *vi = g_new0(VirtioMEMDeviceInfo, 1); + VirtIOMEMPCI *pci_mem = VIRTIO_MEM_PCI(md); + VirtIOMEM *vmem = VIRTIO_MEM(&pci_mem->vdev); + VirtIOMEMClass *vpc = VIRTIO_MEM_GET_CLASS(vmem); + DeviceState *dev = DEVICE(md); + + if (dev->id) { + vi->has_id = true; + vi->id = g_strdup(dev->id); + } + + /* let the real device handle everything else */ + vpc->fill_device_info(vmem, vi); + + info->u.virtio_mem.data = vi; + info->type = MEMORY_DEVICE_INFO_KIND_VIRTIO_MEM; +} + +static uint64_t virtio_mem_pci_get_min_alignment(const MemoryDeviceState *md) +{ + return object_property_get_uint(OBJECT(md), VIRTIO_MEM_BLOCK_SIZE_PROP, + &error_abort); +} + +static void virtio_mem_pci_size_change_notify(Notifier *notifier, void *data) +{ + VirtIOMEMPCI *pci_mem = container_of(notifier, VirtIOMEMPCI, + size_change_notifier); + DeviceState *dev = DEVICE(pci_mem); + char *qom_path = object_get_canonical_path(OBJECT(dev)); + const uint64_t * const size_p = data; + + qapi_event_send_memory_device_size_change(!!dev->id, dev->id, *size_p, + qom_path); + g_free(qom_path); +} + +static void virtio_mem_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(klass); + + k->realize = virtio_mem_pci_realize; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + pcidev_k->revision = VIRTIO_PCI_ABI_VERSION; + pcidev_k->class_id = PCI_CLASS_OTHERS; + + mdc->get_addr = virtio_mem_pci_get_addr; + mdc->set_addr = virtio_mem_pci_set_addr; + mdc->get_plugged_size = virtio_mem_pci_get_plugged_size; + mdc->get_memory_region = virtio_mem_pci_get_memory_region; + mdc->fill_device_info = virtio_mem_pci_fill_device_info; + mdc->get_min_alignment = virtio_mem_pci_get_min_alignment; +} + +static void virtio_mem_pci_instance_init(Object *obj) +{ + VirtIOMEMPCI *dev = VIRTIO_MEM_PCI(obj); + VirtIOMEMClass *vmc; + VirtIOMEM *vmem; + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VIRTIO_MEM); + + dev->size_change_notifier.notify = virtio_mem_pci_size_change_notify; + vmem = VIRTIO_MEM(&dev->vdev); + vmc = VIRTIO_MEM_GET_CLASS(vmem); + /* + * We never remove the notifier again, as we expect both devices to + * disappear at the same time. + */ + vmc->add_size_change_notifier(vmem, &dev->size_change_notifier); + + object_property_add_alias(obj, VIRTIO_MEM_BLOCK_SIZE_PROP, + OBJECT(&dev->vdev), VIRTIO_MEM_BLOCK_SIZE_PROP); + object_property_add_alias(obj, VIRTIO_MEM_SIZE_PROP, OBJECT(&dev->vdev), + VIRTIO_MEM_SIZE_PROP); + object_property_add_alias(obj, VIRTIO_MEM_REQUESTED_SIZE_PROP, + OBJECT(&dev->vdev), + VIRTIO_MEM_REQUESTED_SIZE_PROP); +} + +static const VirtioPCIDeviceTypeInfo virtio_mem_pci_info = { + .base_name = TYPE_VIRTIO_MEM_PCI, + .generic_name = "virtio-mem-pci", + .instance_size = sizeof(VirtIOMEMPCI), + .instance_init = virtio_mem_pci_instance_init, + .class_init = virtio_mem_pci_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_MEMORY_DEVICE }, + { } + }, +}; + +static void virtio_mem_pci_register_types(void) +{ + virtio_pci_types_register(&virtio_mem_pci_info); +} +type_init(virtio_mem_pci_register_types) diff --git a/hw/virtio/virtio-mem-pci.h b/hw/virtio/virtio-mem-pci.h new file mode 100644 index 00000000..e636e1a4 --- /dev/null +++ b/hw/virtio/virtio-mem-pci.h @@ -0,0 +1,35 @@ +/* + * Virtio MEM PCI device + * + * Copyright (C) 2020 Red Hat, Inc. + * + * Authors: + * David Hildenbrand <david@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. + */ + +#ifndef QEMU_VIRTIO_MEM_PCI_H +#define QEMU_VIRTIO_MEM_PCI_H + +#include "hw/virtio/virtio-pci.h" +#include "hw/virtio/virtio-mem.h" +#include "qom/object.h" + +typedef struct VirtIOMEMPCI VirtIOMEMPCI; + +/* + * virtio-mem-pci: This extends VirtioPCIProxy. + */ +#define TYPE_VIRTIO_MEM_PCI "virtio-mem-pci-base" +DECLARE_INSTANCE_CHECKER(VirtIOMEMPCI, VIRTIO_MEM_PCI, + TYPE_VIRTIO_MEM_PCI) + +struct VirtIOMEMPCI { + VirtIOPCIProxy parent_obj; + VirtIOMEM vdev; + Notifier size_change_notifier; +}; + +#endif /* QEMU_VIRTIO_MEM_PCI_H */ diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c new file mode 100644 index 00000000..ed170def --- /dev/null +++ b/hw/virtio/virtio-mem.c @@ -0,0 +1,1386 @@ +/* + * Virtio MEM device + * + * Copyright (C) 2020 Red Hat, Inc. + * + * Authors: + * David Hildenbrand <david@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/iov.h" +#include "qemu/cutils.h" +#include "qemu/error-report.h" +#include "qemu/units.h" +#include "sysemu/numa.h" +#include "sysemu/sysemu.h" +#include "sysemu/reset.h" +#include "hw/virtio/virtio.h" +#include "hw/virtio/virtio-bus.h" +#include "hw/virtio/virtio-access.h" +#include "hw/virtio/virtio-mem.h" +#include "qapi/error.h" +#include "qapi/visitor.h" +#include "exec/ram_addr.h" +#include "migration/misc.h" +#include "hw/boards.h" +#include "hw/qdev-properties.h" +#include CONFIG_DEVICES +#include "trace.h" + +/* + * We only had legacy x86 guests that did not support + * VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE. Other targets don't have legacy guests. + */ +#if defined(TARGET_X86_64) || defined(TARGET_I386) +#define VIRTIO_MEM_HAS_LEGACY_GUESTS +#endif + +/* + * Let's not allow blocks smaller than 1 MiB, for example, to keep the tracking + * bitmap small. + */ +#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)(1 * MiB)) + +static uint32_t virtio_mem_default_thp_size(void) +{ + uint32_t default_thp_size = VIRTIO_MEM_MIN_BLOCK_SIZE; + +#if defined(__x86_64__) || defined(__arm__) || defined(__powerpc64__) + default_thp_size = 2 * MiB; +#elif defined(__aarch64__) + if (qemu_real_host_page_size() == 4 * KiB) { + default_thp_size = 2 * MiB; + } else if (qemu_real_host_page_size() == 16 * KiB) { + default_thp_size = 32 * MiB; + } else if (qemu_real_host_page_size() == 64 * KiB) { + default_thp_size = 512 * MiB; + } +#endif + + return default_thp_size; +} + +/* + * We want to have a reasonable default block size such that + * 1. We avoid splitting THPs when unplugging memory, which degrades + * performance. + * 2. We avoid placing THPs for plugged blocks that also cover unplugged + * blocks. + * + * The actual THP size might differ between Linux kernels, so we try to probe + * it. In the future (if we ever run into issues regarding 2.), we might want + * to disable THP in case we fail to properly probe the THP size, or if the + * block size is configured smaller than the THP size. + */ +static uint32_t thp_size; + +#define HPAGE_PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" +static uint32_t virtio_mem_thp_size(void) +{ + gchar *content = NULL; + const char *endptr; + uint64_t tmp; + + if (thp_size) { + return thp_size; + } + + /* + * Try to probe the actual THP size, fallback to (sane but eventually + * incorrect) default sizes. + */ + if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) && + !qemu_strtou64(content, &endptr, 0, &tmp) && + (!endptr || *endptr == '\n')) { + /* Sanity-check the value and fallback to something reasonable. */ + if (!tmp || !is_power_of_2(tmp)) { + warn_report("Read unsupported THP size: %" PRIx64, tmp); + } else { + thp_size = tmp; + } + } + + if (!thp_size) { + thp_size = virtio_mem_default_thp_size(); + warn_report("Could not detect THP size, falling back to %" PRIx64 + " MiB.", thp_size / MiB); + } + + g_free(content); + return thp_size; +} + +static uint64_t virtio_mem_default_block_size(RAMBlock *rb) +{ + const uint64_t page_size = qemu_ram_pagesize(rb); + + /* We can have hugetlbfs with a page size smaller than the THP size. */ + if (page_size == qemu_real_host_page_size()) { + return MAX(page_size, virtio_mem_thp_size()); + } + return MAX(page_size, VIRTIO_MEM_MIN_BLOCK_SIZE); +} + +#if defined(VIRTIO_MEM_HAS_LEGACY_GUESTS) +static bool virtio_mem_has_shared_zeropage(RAMBlock *rb) +{ + /* + * We only have a guaranteed shared zeropage on ordinary MAP_PRIVATE + * anonymous RAM. In any other case, reading unplugged *can* populate a + * fresh page, consuming actual memory. + */ + return !qemu_ram_is_shared(rb) && rb->fd < 0 && + qemu_ram_pagesize(rb) == qemu_real_host_page_size(); +} +#endif /* VIRTIO_MEM_HAS_LEGACY_GUESTS */ + +/* + * Size the usable region bigger than the requested size if possible. Esp. + * Linux guests will only add (aligned) memory blocks in case they fully + * fit into the usable region, but plug+online only a subset of the pages. + * The memory block size corresponds mostly to the section size. + * + * This allows e.g., to add 20MB with a section size of 128MB on x86_64, and + * a section size of 512MB on arm64 (as long as the start address is properly + * aligned, similar to ordinary DIMMs). + * + * We can change this at any time and maybe even make it configurable if + * necessary (as the section size can change). But it's more likely that the + * section size will rather get smaller and not bigger over time. + */ +#if defined(TARGET_X86_64) || defined(TARGET_I386) +#define VIRTIO_MEM_USABLE_EXTENT (2 * (128 * MiB)) +#elif defined(TARGET_ARM) +#define VIRTIO_MEM_USABLE_EXTENT (2 * (512 * MiB)) +#else +#error VIRTIO_MEM_USABLE_EXTENT not defined +#endif + +static bool virtio_mem_is_busy(void) +{ + /* + * Postcopy cannot handle concurrent discards and we don't want to migrate + * pages on-demand with stale content when plugging new blocks. + * + * For precopy, we don't want unplugged blocks in our migration stream, and + * when plugging new blocks, the page content might differ between source + * and destination (observable by the guest when not initializing pages + * after plugging them) until we're running on the destination (as we didn't + * migrate these blocks when they were unplugged). + */ + return migration_in_incoming_postcopy() || !migration_is_idle(); +} + +typedef int (*virtio_mem_range_cb)(const VirtIOMEM *vmem, void *arg, + uint64_t offset, uint64_t size); + +static int virtio_mem_for_each_unplugged_range(const VirtIOMEM *vmem, void *arg, + virtio_mem_range_cb cb) +{ + unsigned long first_zero_bit, last_zero_bit; + uint64_t offset, size; + int ret = 0; + + first_zero_bit = find_first_zero_bit(vmem->bitmap, vmem->bitmap_size); + while (first_zero_bit < vmem->bitmap_size) { + offset = first_zero_bit * vmem->block_size; + last_zero_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, + first_zero_bit + 1) - 1; + size = (last_zero_bit - first_zero_bit + 1) * vmem->block_size; + + ret = cb(vmem, arg, offset, size); + if (ret) { + break; + } + first_zero_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, + last_zero_bit + 2); + } + return ret; +} + +/* + * Adjust the memory section to cover the intersection with the given range. + * + * Returns false if the intersection is empty, otherwise returns true. + */ +static bool virito_mem_intersect_memory_section(MemoryRegionSection *s, + uint64_t offset, uint64_t size) +{ + uint64_t start = MAX(s->offset_within_region, offset); + uint64_t end = MIN(s->offset_within_region + int128_get64(s->size), + offset + size); + + if (end <= start) { + return false; + } + + s->offset_within_address_space += start - s->offset_within_region; + s->offset_within_region = start; + s->size = int128_make64(end - start); + return true; +} + +typedef int (*virtio_mem_section_cb)(MemoryRegionSection *s, void *arg); + +static int virtio_mem_for_each_plugged_section(const VirtIOMEM *vmem, + MemoryRegionSection *s, + void *arg, + virtio_mem_section_cb cb) +{ + unsigned long first_bit, last_bit; + uint64_t offset, size; + int ret = 0; + + first_bit = s->offset_within_region / vmem->bitmap_size; + first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, first_bit); + while (first_bit < vmem->bitmap_size) { + MemoryRegionSection tmp = *s; + + offset = first_bit * vmem->block_size; + last_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, + first_bit + 1) - 1; + size = (last_bit - first_bit + 1) * vmem->block_size; + + if (!virito_mem_intersect_memory_section(&tmp, offset, size)) { + break; + } + ret = cb(&tmp, arg); + if (ret) { + break; + } + first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, + last_bit + 2); + } + return ret; +} + +static int virtio_mem_for_each_unplugged_section(const VirtIOMEM *vmem, + MemoryRegionSection *s, + void *arg, + virtio_mem_section_cb cb) +{ + unsigned long first_bit, last_bit; + uint64_t offset, size; + int ret = 0; + + first_bit = s->offset_within_region / vmem->bitmap_size; + first_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, first_bit); + while (first_bit < vmem->bitmap_size) { + MemoryRegionSection tmp = *s; + + offset = first_bit * vmem->block_size; + last_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, + first_bit + 1) - 1; + size = (last_bit - first_bit + 1) * vmem->block_size; + + if (!virito_mem_intersect_memory_section(&tmp, offset, size)) { + break; + } + ret = cb(&tmp, arg); + if (ret) { + break; + } + first_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, + last_bit + 2); + } + return ret; +} + +static int virtio_mem_notify_populate_cb(MemoryRegionSection *s, void *arg) +{ + RamDiscardListener *rdl = arg; + + return rdl->notify_populate(rdl, s); +} + +static int virtio_mem_notify_discard_cb(MemoryRegionSection *s, void *arg) +{ + RamDiscardListener *rdl = arg; + + rdl->notify_discard(rdl, s); + return 0; +} + +static void virtio_mem_notify_unplug(VirtIOMEM *vmem, uint64_t offset, + uint64_t size) +{ + RamDiscardListener *rdl; + + QLIST_FOREACH(rdl, &vmem->rdl_list, next) { + MemoryRegionSection tmp = *rdl->section; + + if (!virito_mem_intersect_memory_section(&tmp, offset, size)) { + continue; + } + rdl->notify_discard(rdl, &tmp); + } +} + +static int virtio_mem_notify_plug(VirtIOMEM *vmem, uint64_t offset, + uint64_t size) +{ + RamDiscardListener *rdl, *rdl2; + int ret = 0; + + QLIST_FOREACH(rdl, &vmem->rdl_list, next) { + MemoryRegionSection tmp = *rdl->section; + + if (!virito_mem_intersect_memory_section(&tmp, offset, size)) { + continue; + } + ret = rdl->notify_populate(rdl, &tmp); + if (ret) { + break; + } + } + + if (ret) { + /* Notify all already-notified listeners. */ + QLIST_FOREACH(rdl2, &vmem->rdl_list, next) { + MemoryRegionSection tmp = *rdl->section; + + if (rdl2 == rdl) { + break; + } + if (!virito_mem_intersect_memory_section(&tmp, offset, size)) { + continue; + } + rdl2->notify_discard(rdl2, &tmp); + } + } + return ret; +} + +static void virtio_mem_notify_unplug_all(VirtIOMEM *vmem) +{ + RamDiscardListener *rdl; + + if (!vmem->size) { + return; + } + + QLIST_FOREACH(rdl, &vmem->rdl_list, next) { + if (rdl->double_discard_supported) { + rdl->notify_discard(rdl, rdl->section); + } else { + virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl, + virtio_mem_notify_discard_cb); + } + } +} + +static bool virtio_mem_test_bitmap(const VirtIOMEM *vmem, uint64_t start_gpa, + uint64_t size, bool plugged) +{ + const unsigned long first_bit = (start_gpa - vmem->addr) / vmem->block_size; + const unsigned long last_bit = first_bit + (size / vmem->block_size) - 1; + unsigned long found_bit; + + /* We fake a shorter bitmap to avoid searching too far. */ + if (plugged) { + found_bit = find_next_zero_bit(vmem->bitmap, last_bit + 1, first_bit); + } else { + found_bit = find_next_bit(vmem->bitmap, last_bit + 1, first_bit); + } + return found_bit > last_bit; +} + +static void virtio_mem_set_bitmap(VirtIOMEM *vmem, uint64_t start_gpa, + uint64_t size, bool plugged) +{ + const unsigned long bit = (start_gpa - vmem->addr) / vmem->block_size; + const unsigned long nbits = size / vmem->block_size; + + if (plugged) { + bitmap_set(vmem->bitmap, bit, nbits); + } else { + bitmap_clear(vmem->bitmap, bit, nbits); + } +} + +static void virtio_mem_send_response(VirtIOMEM *vmem, VirtQueueElement *elem, + struct virtio_mem_resp *resp) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(vmem); + VirtQueue *vq = vmem->vq; + + trace_virtio_mem_send_response(le16_to_cpu(resp->type)); + iov_from_buf(elem->in_sg, elem->in_num, 0, resp, sizeof(*resp)); + + virtqueue_push(vq, elem, sizeof(*resp)); + virtio_notify(vdev, vq); +} + +static void virtio_mem_send_response_simple(VirtIOMEM *vmem, + VirtQueueElement *elem, + uint16_t type) +{ + struct virtio_mem_resp resp = { + .type = cpu_to_le16(type), + }; + + virtio_mem_send_response(vmem, elem, &resp); +} + +static bool virtio_mem_valid_range(const VirtIOMEM *vmem, uint64_t gpa, + uint64_t size) +{ + if (!QEMU_IS_ALIGNED(gpa, vmem->block_size)) { + return false; + } + if (gpa + size < gpa || !size) { + return false; + } + if (gpa < vmem->addr || gpa >= vmem->addr + vmem->usable_region_size) { + return false; + } + if (gpa + size > vmem->addr + vmem->usable_region_size) { + return false; + } + return true; +} + +static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa, + uint64_t size, bool plug) +{ + const uint64_t offset = start_gpa - vmem->addr; + RAMBlock *rb = vmem->memdev->mr.ram_block; + + if (virtio_mem_is_busy()) { + return -EBUSY; + } + + if (!plug) { + if (ram_block_discard_range(rb, offset, size)) { + return -EBUSY; + } + virtio_mem_notify_unplug(vmem, offset, size); + } else { + int ret = 0; + + if (vmem->prealloc) { + void *area = memory_region_get_ram_ptr(&vmem->memdev->mr) + offset; + int fd = memory_region_get_fd(&vmem->memdev->mr); + Error *local_err = NULL; + + qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err); + if (local_err) { + static bool warned; + + /* + * Warn only once, we don't want to fill the log with these + * warnings. + */ + if (!warned) { + warn_report_err(local_err); + warned = true; + } else { + error_free(local_err); + } + ret = -EBUSY; + } + } + if (!ret) { + ret = virtio_mem_notify_plug(vmem, offset, size); + } + + if (ret) { + /* Could be preallocation or a notifier populated memory. */ + ram_block_discard_range(vmem->memdev->mr.ram_block, offset, size); + return -EBUSY; + } + } + virtio_mem_set_bitmap(vmem, start_gpa, size, plug); + return 0; +} + +static int virtio_mem_state_change_request(VirtIOMEM *vmem, uint64_t gpa, + uint16_t nb_blocks, bool plug) +{ + const uint64_t size = nb_blocks * vmem->block_size; + int ret; + + if (!virtio_mem_valid_range(vmem, gpa, size)) { + return VIRTIO_MEM_RESP_ERROR; + } + + if (plug && (vmem->size + size > vmem->requested_size)) { + return VIRTIO_MEM_RESP_NACK; + } + + /* test if really all blocks are in the opposite state */ + if (!virtio_mem_test_bitmap(vmem, gpa, size, !plug)) { + return VIRTIO_MEM_RESP_ERROR; + } + + ret = virtio_mem_set_block_state(vmem, gpa, size, plug); + if (ret) { + return VIRTIO_MEM_RESP_BUSY; + } + if (plug) { + vmem->size += size; + } else { + vmem->size -= size; + } + notifier_list_notify(&vmem->size_change_notifiers, &vmem->size); + return VIRTIO_MEM_RESP_ACK; +} + +static void virtio_mem_plug_request(VirtIOMEM *vmem, VirtQueueElement *elem, + struct virtio_mem_req *req) +{ + const uint64_t gpa = le64_to_cpu(req->u.plug.addr); + const uint16_t nb_blocks = le16_to_cpu(req->u.plug.nb_blocks); + uint16_t type; + + trace_virtio_mem_plug_request(gpa, nb_blocks); + type = virtio_mem_state_change_request(vmem, gpa, nb_blocks, true); + virtio_mem_send_response_simple(vmem, elem, type); +} + +static void virtio_mem_unplug_request(VirtIOMEM *vmem, VirtQueueElement *elem, + struct virtio_mem_req *req) +{ + const uint64_t gpa = le64_to_cpu(req->u.unplug.addr); + const uint16_t nb_blocks = le16_to_cpu(req->u.unplug.nb_blocks); + uint16_t type; + + trace_virtio_mem_unplug_request(gpa, nb_blocks); + type = virtio_mem_state_change_request(vmem, gpa, nb_blocks, false); + virtio_mem_send_response_simple(vmem, elem, type); +} + +static void virtio_mem_resize_usable_region(VirtIOMEM *vmem, + uint64_t requested_size, + bool can_shrink) +{ + uint64_t newsize = MIN(memory_region_size(&vmem->memdev->mr), + requested_size + VIRTIO_MEM_USABLE_EXTENT); + + /* The usable region size always has to be multiples of the block size. */ + newsize = QEMU_ALIGN_UP(newsize, vmem->block_size); + + if (!requested_size) { + newsize = 0; + } + + if (newsize < vmem->usable_region_size && !can_shrink) { + return; + } + + trace_virtio_mem_resized_usable_region(vmem->usable_region_size, newsize); + vmem->usable_region_size = newsize; +} + +static int virtio_mem_unplug_all(VirtIOMEM *vmem) +{ + RAMBlock *rb = vmem->memdev->mr.ram_block; + + if (virtio_mem_is_busy()) { + return -EBUSY; + } + + if (ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb))) { + return -EBUSY; + } + virtio_mem_notify_unplug_all(vmem); + + bitmap_clear(vmem->bitmap, 0, vmem->bitmap_size); + if (vmem->size) { + vmem->size = 0; + notifier_list_notify(&vmem->size_change_notifiers, &vmem->size); + } + trace_virtio_mem_unplugged_all(); + virtio_mem_resize_usable_region(vmem, vmem->requested_size, true); + return 0; +} + +static void virtio_mem_unplug_all_request(VirtIOMEM *vmem, + VirtQueueElement *elem) +{ + trace_virtio_mem_unplug_all_request(); + if (virtio_mem_unplug_all(vmem)) { + virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_BUSY); + } else { + virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_ACK); + } +} + +static void virtio_mem_state_request(VirtIOMEM *vmem, VirtQueueElement *elem, + struct virtio_mem_req *req) +{ + const uint16_t nb_blocks = le16_to_cpu(req->u.state.nb_blocks); + const uint64_t gpa = le64_to_cpu(req->u.state.addr); + const uint64_t size = nb_blocks * vmem->block_size; + struct virtio_mem_resp resp = { + .type = cpu_to_le16(VIRTIO_MEM_RESP_ACK), + }; + + trace_virtio_mem_state_request(gpa, nb_blocks); + if (!virtio_mem_valid_range(vmem, gpa, size)) { + virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_ERROR); + return; + } + + if (virtio_mem_test_bitmap(vmem, gpa, size, true)) { + resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_PLUGGED); + } else if (virtio_mem_test_bitmap(vmem, gpa, size, false)) { + resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_UNPLUGGED); + } else { + resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_MIXED); + } + trace_virtio_mem_state_response(le16_to_cpu(resp.u.state.state)); + virtio_mem_send_response(vmem, elem, &resp); +} + +static void virtio_mem_handle_request(VirtIODevice *vdev, VirtQueue *vq) +{ + const int len = sizeof(struct virtio_mem_req); + VirtIOMEM *vmem = VIRTIO_MEM(vdev); + VirtQueueElement *elem; + struct virtio_mem_req req; + uint16_t type; + + while (true) { + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); + if (!elem) { + return; + } + + if (iov_to_buf(elem->out_sg, elem->out_num, 0, &req, len) < len) { + virtio_error(vdev, "virtio-mem protocol violation: invalid request" + " size: %d", len); + virtqueue_detach_element(vq, elem, 0); + g_free(elem); + return; + } + + if (iov_size(elem->in_sg, elem->in_num) < + sizeof(struct virtio_mem_resp)) { + virtio_error(vdev, "virtio-mem protocol violation: not enough space" + " for response: %zu", + iov_size(elem->in_sg, elem->in_num)); + virtqueue_detach_element(vq, elem, 0); + g_free(elem); + return; + } + + type = le16_to_cpu(req.type); + switch (type) { + case VIRTIO_MEM_REQ_PLUG: + virtio_mem_plug_request(vmem, elem, &req); + break; + case VIRTIO_MEM_REQ_UNPLUG: + virtio_mem_unplug_request(vmem, elem, &req); + break; + case VIRTIO_MEM_REQ_UNPLUG_ALL: + virtio_mem_unplug_all_request(vmem, elem); + break; + case VIRTIO_MEM_REQ_STATE: + virtio_mem_state_request(vmem, elem, &req); + break; + default: + virtio_error(vdev, "virtio-mem protocol violation: unknown request" + " type: %d", type); + virtqueue_detach_element(vq, elem, 0); + g_free(elem); + return; + } + + g_free(elem); + } +} + +static void virtio_mem_get_config(VirtIODevice *vdev, uint8_t *config_data) +{ + VirtIOMEM *vmem = VIRTIO_MEM(vdev); + struct virtio_mem_config *config = (void *) config_data; + + config->block_size = cpu_to_le64(vmem->block_size); + config->node_id = cpu_to_le16(vmem->node); + config->requested_size = cpu_to_le64(vmem->requested_size); + config->plugged_size = cpu_to_le64(vmem->size); + config->addr = cpu_to_le64(vmem->addr); + config->region_size = cpu_to_le64(memory_region_size(&vmem->memdev->mr)); + config->usable_region_size = cpu_to_le64(vmem->usable_region_size); +} + +static uint64_t virtio_mem_get_features(VirtIODevice *vdev, uint64_t features, + Error **errp) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + VirtIOMEM *vmem = VIRTIO_MEM(vdev); + + if (ms->numa_state) { +#if defined(CONFIG_ACPI) + virtio_add_feature(&features, VIRTIO_MEM_F_ACPI_PXM); +#endif + } + assert(vmem->unplugged_inaccessible != ON_OFF_AUTO_AUTO); + if (vmem->unplugged_inaccessible == ON_OFF_AUTO_ON) { + virtio_add_feature(&features, VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE); + } + return features; +} + +static int virtio_mem_validate_features(VirtIODevice *vdev) +{ + if (virtio_host_has_feature(vdev, VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE) && + !virtio_vdev_has_feature(vdev, VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE)) { + return -EFAULT; + } + return 0; +} + +static void virtio_mem_system_reset(void *opaque) +{ + VirtIOMEM *vmem = VIRTIO_MEM(opaque); + + /* + * During usual resets, we will unplug all memory and shrink the usable + * region size. This is, however, not possible in all scenarios. Then, + * the guest has to deal with this manually (VIRTIO_MEM_REQ_UNPLUG_ALL). + */ + virtio_mem_unplug_all(vmem); +} + +static void virtio_mem_device_realize(DeviceState *dev, Error **errp) +{ + MachineState *ms = MACHINE(qdev_get_machine()); + int nb_numa_nodes = ms->numa_state ? ms->numa_state->num_nodes : 0; + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VirtIOMEM *vmem = VIRTIO_MEM(dev); + uint64_t page_size; + RAMBlock *rb; + int ret; + + if (!vmem->memdev) { + error_setg(errp, "'%s' property is not set", VIRTIO_MEM_MEMDEV_PROP); + return; + } else if (host_memory_backend_is_mapped(vmem->memdev)) { + error_setg(errp, "'%s' property specifies a busy memdev: %s", + VIRTIO_MEM_MEMDEV_PROP, + object_get_canonical_path_component(OBJECT(vmem->memdev))); + return; + } else if (!memory_region_is_ram(&vmem->memdev->mr) || + memory_region_is_rom(&vmem->memdev->mr) || + !vmem->memdev->mr.ram_block) { + error_setg(errp, "'%s' property specifies an unsupported memdev", + VIRTIO_MEM_MEMDEV_PROP); + return; + } + + if ((nb_numa_nodes && vmem->node >= nb_numa_nodes) || + (!nb_numa_nodes && vmem->node)) { + error_setg(errp, "'%s' property has value '%" PRIu32 "', which exceeds" + "the number of numa nodes: %d", VIRTIO_MEM_NODE_PROP, + vmem->node, nb_numa_nodes ? nb_numa_nodes : 1); + return; + } + + if (enable_mlock) { + error_setg(errp, "Incompatible with mlock"); + return; + } + + rb = vmem->memdev->mr.ram_block; + page_size = qemu_ram_pagesize(rb); + +#if defined(VIRTIO_MEM_HAS_LEGACY_GUESTS) + switch (vmem->unplugged_inaccessible) { + case ON_OFF_AUTO_AUTO: + if (virtio_mem_has_shared_zeropage(rb)) { + vmem->unplugged_inaccessible = ON_OFF_AUTO_OFF; + } else { + vmem->unplugged_inaccessible = ON_OFF_AUTO_ON; + } + break; + case ON_OFF_AUTO_OFF: + if (!virtio_mem_has_shared_zeropage(rb)) { + warn_report("'%s' property set to 'off' with a memdev that does" + " not support the shared zeropage.", + VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP); + } + break; + default: + break; + } +#else /* VIRTIO_MEM_HAS_LEGACY_GUESTS */ + vmem->unplugged_inaccessible = ON_OFF_AUTO_ON; +#endif /* VIRTIO_MEM_HAS_LEGACY_GUESTS */ + + /* + * If the block size wasn't configured by the user, use a sane default. This + * allows using hugetlbfs backends of any page size without manual + * intervention. + */ + if (!vmem->block_size) { + vmem->block_size = virtio_mem_default_block_size(rb); + } + + if (vmem->block_size < page_size) { + error_setg(errp, "'%s' property has to be at least the page size (0x%" + PRIx64 ")", VIRTIO_MEM_BLOCK_SIZE_PROP, page_size); + return; + } else if (vmem->block_size < virtio_mem_default_block_size(rb)) { + warn_report("'%s' property is smaller than the default block size (%" + PRIx64 " MiB)", VIRTIO_MEM_BLOCK_SIZE_PROP, + virtio_mem_default_block_size(rb) / MiB); + } + if (!QEMU_IS_ALIGNED(vmem->requested_size, vmem->block_size)) { + error_setg(errp, "'%s' property has to be multiples of '%s' (0x%" PRIx64 + ")", VIRTIO_MEM_REQUESTED_SIZE_PROP, + VIRTIO_MEM_BLOCK_SIZE_PROP, vmem->block_size); + return; + } else if (!QEMU_IS_ALIGNED(vmem->addr, vmem->block_size)) { + error_setg(errp, "'%s' property has to be multiples of '%s' (0x%" PRIx64 + ")", VIRTIO_MEM_ADDR_PROP, VIRTIO_MEM_BLOCK_SIZE_PROP, + vmem->block_size); + return; + } else if (!QEMU_IS_ALIGNED(memory_region_size(&vmem->memdev->mr), + vmem->block_size)) { + error_setg(errp, "'%s' property memdev size has to be multiples of" + "'%s' (0x%" PRIx64 ")", VIRTIO_MEM_MEMDEV_PROP, + VIRTIO_MEM_BLOCK_SIZE_PROP, vmem->block_size); + return; + } + + if (ram_block_coordinated_discard_require(true)) { + error_setg(errp, "Discarding RAM is disabled"); + return; + } + + ret = ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb)); + if (ret) { + error_setg_errno(errp, -ret, "Unexpected error discarding RAM"); + ram_block_coordinated_discard_require(false); + return; + } + + virtio_mem_resize_usable_region(vmem, vmem->requested_size, true); + + vmem->bitmap_size = memory_region_size(&vmem->memdev->mr) / + vmem->block_size; + vmem->bitmap = bitmap_new(vmem->bitmap_size); + + virtio_init(vdev, VIRTIO_ID_MEM, sizeof(struct virtio_mem_config)); + vmem->vq = virtio_add_queue(vdev, 128, virtio_mem_handle_request); + + host_memory_backend_set_mapped(vmem->memdev, true); + vmstate_register_ram(&vmem->memdev->mr, DEVICE(vmem)); + qemu_register_reset(virtio_mem_system_reset, vmem); + + /* + * Set ourselves as RamDiscardManager before the plug handler maps the + * memory region and exposes it via an address space. + */ + memory_region_set_ram_discard_manager(&vmem->memdev->mr, + RAM_DISCARD_MANAGER(vmem)); +} + +static void virtio_mem_device_unrealize(DeviceState *dev) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VirtIOMEM *vmem = VIRTIO_MEM(dev); + + /* + * The unplug handler unmapped the memory region, it cannot be + * found via an address space anymore. Unset ourselves. + */ + memory_region_set_ram_discard_manager(&vmem->memdev->mr, NULL); + qemu_unregister_reset(virtio_mem_system_reset, vmem); + vmstate_unregister_ram(&vmem->memdev->mr, DEVICE(vmem)); + host_memory_backend_set_mapped(vmem->memdev, false); + virtio_del_queue(vdev, 0); + virtio_cleanup(vdev); + g_free(vmem->bitmap); + ram_block_coordinated_discard_require(false); +} + +static int virtio_mem_discard_range_cb(const VirtIOMEM *vmem, void *arg, + uint64_t offset, uint64_t size) +{ + RAMBlock *rb = vmem->memdev->mr.ram_block; + + return ram_block_discard_range(rb, offset, size) ? -EINVAL : 0; +} + +static int virtio_mem_restore_unplugged(VirtIOMEM *vmem) +{ + /* Make sure all memory is really discarded after migration. */ + return virtio_mem_for_each_unplugged_range(vmem, NULL, + virtio_mem_discard_range_cb); +} + +static int virtio_mem_post_load(void *opaque, int version_id) +{ + VirtIOMEM *vmem = VIRTIO_MEM(opaque); + RamDiscardListener *rdl; + int ret; + + /* + * We started out with all memory discarded and our memory region is mapped + * into an address space. Replay, now that we updated the bitmap. + */ + QLIST_FOREACH(rdl, &vmem->rdl_list, next) { + ret = virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl, + virtio_mem_notify_populate_cb); + if (ret) { + return ret; + } + } + + if (migration_in_incoming_postcopy()) { + return 0; + } + + return virtio_mem_restore_unplugged(vmem); +} + +typedef struct VirtIOMEMMigSanityChecks { + VirtIOMEM *parent; + uint64_t addr; + uint64_t region_size; + uint64_t block_size; + uint32_t node; +} VirtIOMEMMigSanityChecks; + +static int virtio_mem_mig_sanity_checks_pre_save(void *opaque) +{ + VirtIOMEMMigSanityChecks *tmp = opaque; + VirtIOMEM *vmem = tmp->parent; + + tmp->addr = vmem->addr; + tmp->region_size = memory_region_size(&vmem->memdev->mr); + tmp->block_size = vmem->block_size; + tmp->node = vmem->node; + return 0; +} + +static int virtio_mem_mig_sanity_checks_post_load(void *opaque, int version_id) +{ + VirtIOMEMMigSanityChecks *tmp = opaque; + VirtIOMEM *vmem = tmp->parent; + const uint64_t new_region_size = memory_region_size(&vmem->memdev->mr); + + if (tmp->addr != vmem->addr) { + error_report("Property '%s' changed from 0x%" PRIx64 " to 0x%" PRIx64, + VIRTIO_MEM_ADDR_PROP, tmp->addr, vmem->addr); + return -EINVAL; + } + /* + * Note: Preparation for resizeable memory regions. The maximum size + * of the memory region must not change during migration. + */ + if (tmp->region_size != new_region_size) { + error_report("Property '%s' size changed from 0x%" PRIx64 " to 0x%" + PRIx64, VIRTIO_MEM_MEMDEV_PROP, tmp->region_size, + new_region_size); + return -EINVAL; + } + if (tmp->block_size != vmem->block_size) { + error_report("Property '%s' changed from 0x%" PRIx64 " to 0x%" PRIx64, + VIRTIO_MEM_BLOCK_SIZE_PROP, tmp->block_size, + vmem->block_size); + return -EINVAL; + } + if (tmp->node != vmem->node) { + error_report("Property '%s' changed from %" PRIu32 " to %" PRIu32, + VIRTIO_MEM_NODE_PROP, tmp->node, vmem->node); + return -EINVAL; + } + return 0; +} + +static const VMStateDescription vmstate_virtio_mem_sanity_checks = { + .name = "virtio-mem-device/sanity-checks", + .pre_save = virtio_mem_mig_sanity_checks_pre_save, + .post_load = virtio_mem_mig_sanity_checks_post_load, + .fields = (VMStateField[]) { + VMSTATE_UINT64(addr, VirtIOMEMMigSanityChecks), + VMSTATE_UINT64(region_size, VirtIOMEMMigSanityChecks), + VMSTATE_UINT64(block_size, VirtIOMEMMigSanityChecks), + VMSTATE_UINT32(node, VirtIOMEMMigSanityChecks), + VMSTATE_END_OF_LIST(), + }, +}; + +static const VMStateDescription vmstate_virtio_mem_device = { + .name = "virtio-mem-device", + .minimum_version_id = 1, + .version_id = 1, + .priority = MIG_PRI_VIRTIO_MEM, + .post_load = virtio_mem_post_load, + .fields = (VMStateField[]) { + VMSTATE_WITH_TMP(VirtIOMEM, VirtIOMEMMigSanityChecks, + vmstate_virtio_mem_sanity_checks), + VMSTATE_UINT64(usable_region_size, VirtIOMEM), + VMSTATE_UINT64(size, VirtIOMEM), + VMSTATE_UINT64(requested_size, VirtIOMEM), + VMSTATE_BITMAP(bitmap, VirtIOMEM, 0, bitmap_size), + VMSTATE_END_OF_LIST() + }, +}; + +static const VMStateDescription vmstate_virtio_mem = { + .name = "virtio-mem", + .minimum_version_id = 1, + .version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_VIRTIO_DEVICE, + VMSTATE_END_OF_LIST() + }, +}; + +static void virtio_mem_fill_device_info(const VirtIOMEM *vmem, + VirtioMEMDeviceInfo *vi) +{ + vi->memaddr = vmem->addr; + vi->node = vmem->node; + vi->requested_size = vmem->requested_size; + vi->size = vmem->size; + vi->max_size = memory_region_size(&vmem->memdev->mr); + vi->block_size = vmem->block_size; + vi->memdev = object_get_canonical_path(OBJECT(vmem->memdev)); +} + +static MemoryRegion *virtio_mem_get_memory_region(VirtIOMEM *vmem, Error **errp) +{ + if (!vmem->memdev) { + error_setg(errp, "'%s' property must be set", VIRTIO_MEM_MEMDEV_PROP); + return NULL; + } + + return &vmem->memdev->mr; +} + +static void virtio_mem_add_size_change_notifier(VirtIOMEM *vmem, + Notifier *notifier) +{ + notifier_list_add(&vmem->size_change_notifiers, notifier); +} + +static void virtio_mem_remove_size_change_notifier(VirtIOMEM *vmem, + Notifier *notifier) +{ + notifier_remove(notifier); +} + +static void virtio_mem_get_size(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + const VirtIOMEM *vmem = VIRTIO_MEM(obj); + uint64_t value = vmem->size; + + visit_type_size(v, name, &value, errp); +} + +static void virtio_mem_get_requested_size(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + const VirtIOMEM *vmem = VIRTIO_MEM(obj); + uint64_t value = vmem->requested_size; + + visit_type_size(v, name, &value, errp); +} + +static void virtio_mem_set_requested_size(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + VirtIOMEM *vmem = VIRTIO_MEM(obj); + Error *err = NULL; + uint64_t value; + + visit_type_size(v, name, &value, &err); + if (err) { + error_propagate(errp, err); + return; + } + + /* + * The block size and memory backend are not fixed until the device was + * realized. realize() will verify these properties then. + */ + if (DEVICE(obj)->realized) { + if (!QEMU_IS_ALIGNED(value, vmem->block_size)) { + error_setg(errp, "'%s' has to be multiples of '%s' (0x%" PRIx64 + ")", name, VIRTIO_MEM_BLOCK_SIZE_PROP, + vmem->block_size); + return; + } else if (value > memory_region_size(&vmem->memdev->mr)) { + error_setg(errp, "'%s' cannot exceed the memory backend size" + "(0x%" PRIx64 ")", name, + memory_region_size(&vmem->memdev->mr)); + return; + } + + if (value != vmem->requested_size) { + virtio_mem_resize_usable_region(vmem, value, false); + vmem->requested_size = value; + } + /* + * Trigger a config update so the guest gets notified. We trigger + * even if the size didn't change (especially helpful for debugging). + */ + virtio_notify_config(VIRTIO_DEVICE(vmem)); + } else { + vmem->requested_size = value; + } +} + +static void virtio_mem_get_block_size(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + const VirtIOMEM *vmem = VIRTIO_MEM(obj); + uint64_t value = vmem->block_size; + + /* + * If not configured by the user (and we're not realized yet), use the + * default block size we would use with the current memory backend. + */ + if (!value) { + if (vmem->memdev && memory_region_is_ram(&vmem->memdev->mr)) { + value = virtio_mem_default_block_size(vmem->memdev->mr.ram_block); + } else { + value = virtio_mem_thp_size(); + } + } + + visit_type_size(v, name, &value, errp); +} + +static void virtio_mem_set_block_size(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + VirtIOMEM *vmem = VIRTIO_MEM(obj); + Error *err = NULL; + uint64_t value; + + if (DEVICE(obj)->realized) { + error_setg(errp, "'%s' cannot be changed", name); + return; + } + + visit_type_size(v, name, &value, &err); + if (err) { + error_propagate(errp, err); + return; + } + + if (value < VIRTIO_MEM_MIN_BLOCK_SIZE) { + error_setg(errp, "'%s' property has to be at least 0x%" PRIx32, name, + VIRTIO_MEM_MIN_BLOCK_SIZE); + return; + } else if (!is_power_of_2(value)) { + error_setg(errp, "'%s' property has to be a power of two", name); + return; + } + vmem->block_size = value; +} + +static void virtio_mem_instance_init(Object *obj) +{ + VirtIOMEM *vmem = VIRTIO_MEM(obj); + + notifier_list_init(&vmem->size_change_notifiers); + QLIST_INIT(&vmem->rdl_list); + + object_property_add(obj, VIRTIO_MEM_SIZE_PROP, "size", virtio_mem_get_size, + NULL, NULL, NULL); + object_property_add(obj, VIRTIO_MEM_REQUESTED_SIZE_PROP, "size", + virtio_mem_get_requested_size, + virtio_mem_set_requested_size, NULL, NULL); + object_property_add(obj, VIRTIO_MEM_BLOCK_SIZE_PROP, "size", + virtio_mem_get_block_size, virtio_mem_set_block_size, + NULL, NULL); +} + +static Property virtio_mem_properties[] = { + DEFINE_PROP_UINT64(VIRTIO_MEM_ADDR_PROP, VirtIOMEM, addr, 0), + DEFINE_PROP_UINT32(VIRTIO_MEM_NODE_PROP, VirtIOMEM, node, 0), + DEFINE_PROP_BOOL(VIRTIO_MEM_PREALLOC_PROP, VirtIOMEM, prealloc, false), + DEFINE_PROP_LINK(VIRTIO_MEM_MEMDEV_PROP, VirtIOMEM, memdev, + TYPE_MEMORY_BACKEND, HostMemoryBackend *), +#if defined(VIRTIO_MEM_HAS_LEGACY_GUESTS) + DEFINE_PROP_ON_OFF_AUTO(VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP, VirtIOMEM, + unplugged_inaccessible, ON_OFF_AUTO_AUTO), +#endif + DEFINE_PROP_END_OF_LIST(), +}; + +static uint64_t virtio_mem_rdm_get_min_granularity(const RamDiscardManager *rdm, + const MemoryRegion *mr) +{ + const VirtIOMEM *vmem = VIRTIO_MEM(rdm); + + g_assert(mr == &vmem->memdev->mr); + return vmem->block_size; +} + +static bool virtio_mem_rdm_is_populated(const RamDiscardManager *rdm, + const MemoryRegionSection *s) +{ + const VirtIOMEM *vmem = VIRTIO_MEM(rdm); + uint64_t start_gpa = vmem->addr + s->offset_within_region; + uint64_t end_gpa = start_gpa + int128_get64(s->size); + + g_assert(s->mr == &vmem->memdev->mr); + + start_gpa = QEMU_ALIGN_DOWN(start_gpa, vmem->block_size); + end_gpa = QEMU_ALIGN_UP(end_gpa, vmem->block_size); + + if (!virtio_mem_valid_range(vmem, start_gpa, end_gpa - start_gpa)) { + return false; + } + + return virtio_mem_test_bitmap(vmem, start_gpa, end_gpa - start_gpa, true); +} + +struct VirtIOMEMReplayData { + void *fn; + void *opaque; +}; + +static int virtio_mem_rdm_replay_populated_cb(MemoryRegionSection *s, void *arg) +{ + struct VirtIOMEMReplayData *data = arg; + + return ((ReplayRamPopulate)data->fn)(s, data->opaque); +} + +static int virtio_mem_rdm_replay_populated(const RamDiscardManager *rdm, + MemoryRegionSection *s, + ReplayRamPopulate replay_fn, + void *opaque) +{ + const VirtIOMEM *vmem = VIRTIO_MEM(rdm); + struct VirtIOMEMReplayData data = { + .fn = replay_fn, + .opaque = opaque, + }; + + g_assert(s->mr == &vmem->memdev->mr); + return virtio_mem_for_each_plugged_section(vmem, s, &data, + virtio_mem_rdm_replay_populated_cb); +} + +static int virtio_mem_rdm_replay_discarded_cb(MemoryRegionSection *s, + void *arg) +{ + struct VirtIOMEMReplayData *data = arg; + + ((ReplayRamDiscard)data->fn)(s, data->opaque); + return 0; +} + +static void virtio_mem_rdm_replay_discarded(const RamDiscardManager *rdm, + MemoryRegionSection *s, + ReplayRamDiscard replay_fn, + void *opaque) +{ + const VirtIOMEM *vmem = VIRTIO_MEM(rdm); + struct VirtIOMEMReplayData data = { + .fn = replay_fn, + .opaque = opaque, + }; + + g_assert(s->mr == &vmem->memdev->mr); + virtio_mem_for_each_unplugged_section(vmem, s, &data, + virtio_mem_rdm_replay_discarded_cb); +} + +static void virtio_mem_rdm_register_listener(RamDiscardManager *rdm, + RamDiscardListener *rdl, + MemoryRegionSection *s) +{ + VirtIOMEM *vmem = VIRTIO_MEM(rdm); + int ret; + + g_assert(s->mr == &vmem->memdev->mr); + rdl->section = memory_region_section_new_copy(s); + + QLIST_INSERT_HEAD(&vmem->rdl_list, rdl, next); + ret = virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl, + virtio_mem_notify_populate_cb); + if (ret) { + error_report("%s: Replaying plugged ranges failed: %s", __func__, + strerror(-ret)); + } +} + +static void virtio_mem_rdm_unregister_listener(RamDiscardManager *rdm, + RamDiscardListener *rdl) +{ + VirtIOMEM *vmem = VIRTIO_MEM(rdm); + + g_assert(rdl->section->mr == &vmem->memdev->mr); + if (vmem->size) { + if (rdl->double_discard_supported) { + rdl->notify_discard(rdl, rdl->section); + } else { + virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl, + virtio_mem_notify_discard_cb); + } + } + + memory_region_section_free_copy(rdl->section); + rdl->section = NULL; + QLIST_REMOVE(rdl, next); +} + +static void virtio_mem_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + VirtIOMEMClass *vmc = VIRTIO_MEM_CLASS(klass); + RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_CLASS(klass); + + device_class_set_props(dc, virtio_mem_properties); + dc->vmsd = &vmstate_virtio_mem; + + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + vdc->realize = virtio_mem_device_realize; + vdc->unrealize = virtio_mem_device_unrealize; + vdc->get_config = virtio_mem_get_config; + vdc->get_features = virtio_mem_get_features; + vdc->validate_features = virtio_mem_validate_features; + vdc->vmsd = &vmstate_virtio_mem_device; + + vmc->fill_device_info = virtio_mem_fill_device_info; + vmc->get_memory_region = virtio_mem_get_memory_region; + vmc->add_size_change_notifier = virtio_mem_add_size_change_notifier; + vmc->remove_size_change_notifier = virtio_mem_remove_size_change_notifier; + + rdmc->get_min_granularity = virtio_mem_rdm_get_min_granularity; + rdmc->is_populated = virtio_mem_rdm_is_populated; + rdmc->replay_populated = virtio_mem_rdm_replay_populated; + rdmc->replay_discarded = virtio_mem_rdm_replay_discarded; + rdmc->register_listener = virtio_mem_rdm_register_listener; + rdmc->unregister_listener = virtio_mem_rdm_unregister_listener; +} + +static const TypeInfo virtio_mem_info = { + .name = TYPE_VIRTIO_MEM, + .parent = TYPE_VIRTIO_DEVICE, + .instance_size = sizeof(VirtIOMEM), + .instance_init = virtio_mem_instance_init, + .class_init = virtio_mem_class_init, + .class_size = sizeof(VirtIOMEMClass), + .interfaces = (InterfaceInfo[]) { + { TYPE_RAM_DISCARD_MANAGER }, + { } + }, +}; + +static void virtio_register_types(void) +{ + type_register_static(&virtio_mem_info); +} + +type_init(virtio_register_types) diff --git a/hw/virtio/virtio-mmio.c b/hw/virtio/virtio-mmio.c new file mode 100644 index 00000000..d240efef --- /dev/null +++ b/hw/virtio/virtio-mmio.c @@ -0,0 +1,862 @@ +/* + * Virtio MMIO bindings + * + * Copyright (c) 2011 Linaro Limited + * + * Author: + * Peter Maydell <peter.maydell@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "standard-headers/linux/virtio_mmio.h" +#include "hw/irq.h" +#include "hw/qdev-properties.h" +#include "hw/sysbus.h" +#include "hw/virtio/virtio.h" +#include "migration/qemu-file-types.h" +#include "qemu/host-utils.h" +#include "qemu/module.h" +#include "sysemu/kvm.h" +#include "sysemu/replay.h" +#include "hw/virtio/virtio-mmio.h" +#include "qemu/error-report.h" +#include "qemu/log.h" +#include "trace.h" + +static bool virtio_mmio_ioeventfd_enabled(DeviceState *d) +{ + VirtIOMMIOProxy *proxy = VIRTIO_MMIO(d); + + return (proxy->flags & VIRTIO_IOMMIO_FLAG_USE_IOEVENTFD) != 0; +} + +static int virtio_mmio_ioeventfd_assign(DeviceState *d, + EventNotifier *notifier, + int n, bool assign) +{ + VirtIOMMIOProxy *proxy = VIRTIO_MMIO(d); + + if (assign) { + memory_region_add_eventfd(&proxy->iomem, VIRTIO_MMIO_QUEUE_NOTIFY, 4, + true, n, notifier); + } else { + memory_region_del_eventfd(&proxy->iomem, VIRTIO_MMIO_QUEUE_NOTIFY, 4, + true, n, notifier); + } + return 0; +} + +static void virtio_mmio_start_ioeventfd(VirtIOMMIOProxy *proxy) +{ + virtio_bus_start_ioeventfd(&proxy->bus); +} + +static void virtio_mmio_stop_ioeventfd(VirtIOMMIOProxy *proxy) +{ + virtio_bus_stop_ioeventfd(&proxy->bus); +} + +static void virtio_mmio_soft_reset(VirtIOMMIOProxy *proxy) +{ + int i; + + virtio_bus_reset(&proxy->bus); + + if (!proxy->legacy) { + for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { + proxy->vqs[i].enabled = 0; + } + } +} + +static uint64_t virtio_mmio_read(void *opaque, hwaddr offset, unsigned size) +{ + VirtIOMMIOProxy *proxy = (VirtIOMMIOProxy *)opaque; + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + + trace_virtio_mmio_read(offset); + + if (!vdev) { + /* If no backend is present, we treat most registers as + * read-as-zero, except for the magic number, version and + * vendor ID. This is not strictly sanctioned by the virtio + * spec, but it allows us to provide transports with no backend + * plugged in which don't confuse Linux's virtio code: the + * probe won't complain about the bad magic number, but the + * device ID of zero means no backend will claim it. + */ + switch (offset) { + case VIRTIO_MMIO_MAGIC_VALUE: + return VIRT_MAGIC; + case VIRTIO_MMIO_VERSION: + if (proxy->legacy) { + return VIRT_VERSION_LEGACY; + } else { + return VIRT_VERSION; + } + case VIRTIO_MMIO_VENDOR_ID: + return VIRT_VENDOR; + default: + return 0; + } + } + + if (offset >= VIRTIO_MMIO_CONFIG) { + offset -= VIRTIO_MMIO_CONFIG; + if (proxy->legacy) { + switch (size) { + case 1: + return virtio_config_readb(vdev, offset); + case 2: + return virtio_config_readw(vdev, offset); + case 4: + return virtio_config_readl(vdev, offset); + default: + abort(); + } + } else { + switch (size) { + case 1: + return virtio_config_modern_readb(vdev, offset); + case 2: + return virtio_config_modern_readw(vdev, offset); + case 4: + return virtio_config_modern_readl(vdev, offset); + default: + abort(); + } + } + } + if (size != 4) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: wrong size access to register!\n", + __func__); + return 0; + } + switch (offset) { + case VIRTIO_MMIO_MAGIC_VALUE: + return VIRT_MAGIC; + case VIRTIO_MMIO_VERSION: + if (proxy->legacy) { + return VIRT_VERSION_LEGACY; + } else { + return VIRT_VERSION; + } + case VIRTIO_MMIO_DEVICE_ID: + return vdev->device_id; + case VIRTIO_MMIO_VENDOR_ID: + return VIRT_VENDOR; + case VIRTIO_MMIO_DEVICE_FEATURES: + if (proxy->legacy) { + if (proxy->host_features_sel) { + return 0; + } else { + return vdev->host_features; + } + } else { + VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev); + return (vdev->host_features & ~vdc->legacy_features) + >> (32 * proxy->host_features_sel); + } + case VIRTIO_MMIO_QUEUE_NUM_MAX: + if (!virtio_queue_get_num(vdev, vdev->queue_sel)) { + return 0; + } + return VIRTQUEUE_MAX_SIZE; + case VIRTIO_MMIO_QUEUE_PFN: + if (!proxy->legacy) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: read from legacy register (0x%" + HWADDR_PRIx ") in non-legacy mode\n", + __func__, offset); + return 0; + } + return virtio_queue_get_addr(vdev, vdev->queue_sel) + >> proxy->guest_page_shift; + case VIRTIO_MMIO_QUEUE_READY: + if (proxy->legacy) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: read from non-legacy register (0x%" + HWADDR_PRIx ") in legacy mode\n", + __func__, offset); + return 0; + } + return proxy->vqs[vdev->queue_sel].enabled; + case VIRTIO_MMIO_INTERRUPT_STATUS: + return qatomic_read(&vdev->isr); + case VIRTIO_MMIO_STATUS: + return vdev->status; + case VIRTIO_MMIO_CONFIG_GENERATION: + if (proxy->legacy) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: read from non-legacy register (0x%" + HWADDR_PRIx ") in legacy mode\n", + __func__, offset); + return 0; + } + return vdev->generation; + case VIRTIO_MMIO_SHM_LEN_LOW: + case VIRTIO_MMIO_SHM_LEN_HIGH: + /* + * VIRTIO_MMIO_SHM_SEL is unimplemented + * according to the linux driver, if region length is -1 + * the shared memory doesn't exist + */ + return -1; + case VIRTIO_MMIO_DEVICE_FEATURES_SEL: + case VIRTIO_MMIO_DRIVER_FEATURES: + case VIRTIO_MMIO_DRIVER_FEATURES_SEL: + case VIRTIO_MMIO_GUEST_PAGE_SIZE: + case VIRTIO_MMIO_QUEUE_SEL: + case VIRTIO_MMIO_QUEUE_NUM: + case VIRTIO_MMIO_QUEUE_ALIGN: + case VIRTIO_MMIO_QUEUE_NOTIFY: + case VIRTIO_MMIO_INTERRUPT_ACK: + case VIRTIO_MMIO_QUEUE_DESC_LOW: + case VIRTIO_MMIO_QUEUE_DESC_HIGH: + case VIRTIO_MMIO_QUEUE_AVAIL_LOW: + case VIRTIO_MMIO_QUEUE_AVAIL_HIGH: + case VIRTIO_MMIO_QUEUE_USED_LOW: + case VIRTIO_MMIO_QUEUE_USED_HIGH: + qemu_log_mask(LOG_GUEST_ERROR, + "%s: read of write-only register (0x%" HWADDR_PRIx ")\n", + __func__, offset); + return 0; + default: + qemu_log_mask(LOG_GUEST_ERROR, + "%s: bad register offset (0x%" HWADDR_PRIx ")\n", + __func__, offset); + return 0; + } + return 0; +} + +static void virtio_mmio_write(void *opaque, hwaddr offset, uint64_t value, + unsigned size) +{ + VirtIOMMIOProxy *proxy = (VirtIOMMIOProxy *)opaque; + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + + trace_virtio_mmio_write_offset(offset, value); + + if (!vdev) { + /* If no backend is present, we just make all registers + * write-ignored. This allows us to provide transports with + * no backend plugged in. + */ + return; + } + + if (offset >= VIRTIO_MMIO_CONFIG) { + offset -= VIRTIO_MMIO_CONFIG; + if (proxy->legacy) { + switch (size) { + case 1: + virtio_config_writeb(vdev, offset, value); + break; + case 2: + virtio_config_writew(vdev, offset, value); + break; + case 4: + virtio_config_writel(vdev, offset, value); + break; + default: + abort(); + } + return; + } else { + switch (size) { + case 1: + virtio_config_modern_writeb(vdev, offset, value); + break; + case 2: + virtio_config_modern_writew(vdev, offset, value); + break; + case 4: + virtio_config_modern_writel(vdev, offset, value); + break; + default: + abort(); + } + return; + } + } + if (size != 4) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: wrong size access to register!\n", + __func__); + return; + } + switch (offset) { + case VIRTIO_MMIO_DEVICE_FEATURES_SEL: + if (value) { + proxy->host_features_sel = 1; + } else { + proxy->host_features_sel = 0; + } + break; + case VIRTIO_MMIO_DRIVER_FEATURES: + if (proxy->legacy) { + if (proxy->guest_features_sel) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: attempt to write guest features with " + "guest_features_sel > 0 in legacy mode\n", + __func__); + } else { + virtio_set_features(vdev, value); + } + } else { + proxy->guest_features[proxy->guest_features_sel] = value; + } + break; + case VIRTIO_MMIO_DRIVER_FEATURES_SEL: + if (value) { + proxy->guest_features_sel = 1; + } else { + proxy->guest_features_sel = 0; + } + break; + case VIRTIO_MMIO_GUEST_PAGE_SIZE: + if (!proxy->legacy) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: write to legacy register (0x%" + HWADDR_PRIx ") in non-legacy mode\n", + __func__, offset); + return; + } + proxy->guest_page_shift = ctz32(value); + if (proxy->guest_page_shift > 31) { + proxy->guest_page_shift = 0; + } + trace_virtio_mmio_guest_page(value, proxy->guest_page_shift); + break; + case VIRTIO_MMIO_QUEUE_SEL: + if (value < VIRTIO_QUEUE_MAX) { + vdev->queue_sel = value; + } + break; + case VIRTIO_MMIO_QUEUE_NUM: + trace_virtio_mmio_queue_write(value, VIRTQUEUE_MAX_SIZE); + virtio_queue_set_num(vdev, vdev->queue_sel, value); + + if (proxy->legacy) { + virtio_queue_update_rings(vdev, vdev->queue_sel); + } else { + proxy->vqs[vdev->queue_sel].num = value; + } + break; + case VIRTIO_MMIO_QUEUE_ALIGN: + if (!proxy->legacy) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: write to legacy register (0x%" + HWADDR_PRIx ") in non-legacy mode\n", + __func__, offset); + return; + } + virtio_queue_set_align(vdev, vdev->queue_sel, value); + break; + case VIRTIO_MMIO_QUEUE_PFN: + if (!proxy->legacy) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: write to legacy register (0x%" + HWADDR_PRIx ") in non-legacy mode\n", + __func__, offset); + return; + } + if (value == 0) { + virtio_mmio_soft_reset(proxy); + } else { + virtio_queue_set_addr(vdev, vdev->queue_sel, + value << proxy->guest_page_shift); + } + break; + case VIRTIO_MMIO_QUEUE_READY: + if (proxy->legacy) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: write to non-legacy register (0x%" + HWADDR_PRIx ") in legacy mode\n", + __func__, offset); + return; + } + if (value) { + virtio_queue_set_num(vdev, vdev->queue_sel, + proxy->vqs[vdev->queue_sel].num); + virtio_queue_set_rings(vdev, vdev->queue_sel, + ((uint64_t)proxy->vqs[vdev->queue_sel].desc[1]) << 32 | + proxy->vqs[vdev->queue_sel].desc[0], + ((uint64_t)proxy->vqs[vdev->queue_sel].avail[1]) << 32 | + proxy->vqs[vdev->queue_sel].avail[0], + ((uint64_t)proxy->vqs[vdev->queue_sel].used[1]) << 32 | + proxy->vqs[vdev->queue_sel].used[0]); + proxy->vqs[vdev->queue_sel].enabled = 1; + } else { + proxy->vqs[vdev->queue_sel].enabled = 0; + } + break; + case VIRTIO_MMIO_QUEUE_NOTIFY: + if (value < VIRTIO_QUEUE_MAX) { + virtio_queue_notify(vdev, value); + } + break; + case VIRTIO_MMIO_INTERRUPT_ACK: + qatomic_and(&vdev->isr, ~value); + virtio_update_irq(vdev); + break; + case VIRTIO_MMIO_STATUS: + if (!(value & VIRTIO_CONFIG_S_DRIVER_OK)) { + virtio_mmio_stop_ioeventfd(proxy); + } + + if (!proxy->legacy && (value & VIRTIO_CONFIG_S_FEATURES_OK)) { + virtio_set_features(vdev, + ((uint64_t)proxy->guest_features[1]) << 32 | + proxy->guest_features[0]); + } + + virtio_set_status(vdev, value & 0xff); + + if (value & VIRTIO_CONFIG_S_DRIVER_OK) { + virtio_mmio_start_ioeventfd(proxy); + } + + if (vdev->status == 0) { + virtio_mmio_soft_reset(proxy); + } + break; + case VIRTIO_MMIO_QUEUE_DESC_LOW: + if (proxy->legacy) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: write to non-legacy register (0x%" + HWADDR_PRIx ") in legacy mode\n", + __func__, offset); + return; + } + proxy->vqs[vdev->queue_sel].desc[0] = value; + break; + case VIRTIO_MMIO_QUEUE_DESC_HIGH: + if (proxy->legacy) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: write to non-legacy register (0x%" + HWADDR_PRIx ") in legacy mode\n", + __func__, offset); + return; + } + proxy->vqs[vdev->queue_sel].desc[1] = value; + break; + case VIRTIO_MMIO_QUEUE_AVAIL_LOW: + if (proxy->legacy) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: write to non-legacy register (0x%" + HWADDR_PRIx ") in legacy mode\n", + __func__, offset); + return; + } + proxy->vqs[vdev->queue_sel].avail[0] = value; + break; + case VIRTIO_MMIO_QUEUE_AVAIL_HIGH: + if (proxy->legacy) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: write to non-legacy register (0x%" + HWADDR_PRIx ") in legacy mode\n", + __func__, offset); + return; + } + proxy->vqs[vdev->queue_sel].avail[1] = value; + break; + case VIRTIO_MMIO_QUEUE_USED_LOW: + if (proxy->legacy) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: write to non-legacy register (0x%" + HWADDR_PRIx ") in legacy mode\n", + __func__, offset); + return; + } + proxy->vqs[vdev->queue_sel].used[0] = value; + break; + case VIRTIO_MMIO_QUEUE_USED_HIGH: + if (proxy->legacy) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: write to non-legacy register (0x%" + HWADDR_PRIx ") in legacy mode\n", + __func__, offset); + return; + } + proxy->vqs[vdev->queue_sel].used[1] = value; + break; + case VIRTIO_MMIO_MAGIC_VALUE: + case VIRTIO_MMIO_VERSION: + case VIRTIO_MMIO_DEVICE_ID: + case VIRTIO_MMIO_VENDOR_ID: + case VIRTIO_MMIO_DEVICE_FEATURES: + case VIRTIO_MMIO_QUEUE_NUM_MAX: + case VIRTIO_MMIO_INTERRUPT_STATUS: + case VIRTIO_MMIO_CONFIG_GENERATION: + qemu_log_mask(LOG_GUEST_ERROR, + "%s: write to read-only register (0x%" HWADDR_PRIx ")\n", + __func__, offset); + break; + + default: + qemu_log_mask(LOG_GUEST_ERROR, + "%s: bad register offset (0x%" HWADDR_PRIx ")\n", + __func__, offset); + } +} + +static const MemoryRegionOps virtio_legacy_mem_ops = { + .read = virtio_mmio_read, + .write = virtio_mmio_write, + .endianness = DEVICE_NATIVE_ENDIAN, +}; + +static const MemoryRegionOps virtio_mem_ops = { + .read = virtio_mmio_read, + .write = virtio_mmio_write, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static void virtio_mmio_update_irq(DeviceState *opaque, uint16_t vector) +{ + VirtIOMMIOProxy *proxy = VIRTIO_MMIO(opaque); + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + int level; + + if (!vdev) { + return; + } + level = (qatomic_read(&vdev->isr) != 0); + trace_virtio_mmio_setting_irq(level); + qemu_set_irq(proxy->irq, level); +} + +static int virtio_mmio_load_config(DeviceState *opaque, QEMUFile *f) +{ + VirtIOMMIOProxy *proxy = VIRTIO_MMIO(opaque); + + proxy->host_features_sel = qemu_get_be32(f); + proxy->guest_features_sel = qemu_get_be32(f); + proxy->guest_page_shift = qemu_get_be32(f); + return 0; +} + +static void virtio_mmio_save_config(DeviceState *opaque, QEMUFile *f) +{ + VirtIOMMIOProxy *proxy = VIRTIO_MMIO(opaque); + + qemu_put_be32(f, proxy->host_features_sel); + qemu_put_be32(f, proxy->guest_features_sel); + qemu_put_be32(f, proxy->guest_page_shift); +} + +static const VMStateDescription vmstate_virtio_mmio_queue_state = { + .name = "virtio_mmio/queue_state", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT16(num, VirtIOMMIOQueue), + VMSTATE_BOOL(enabled, VirtIOMMIOQueue), + VMSTATE_UINT32_ARRAY(desc, VirtIOMMIOQueue, 2), + VMSTATE_UINT32_ARRAY(avail, VirtIOMMIOQueue, 2), + VMSTATE_UINT32_ARRAY(used, VirtIOMMIOQueue, 2), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_virtio_mmio_state_sub = { + .name = "virtio_mmio/state", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32_ARRAY(guest_features, VirtIOMMIOProxy, 2), + VMSTATE_STRUCT_ARRAY(vqs, VirtIOMMIOProxy, VIRTIO_QUEUE_MAX, 0, + vmstate_virtio_mmio_queue_state, + VirtIOMMIOQueue), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_virtio_mmio = { + .name = "virtio_mmio", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription * []) { + &vmstate_virtio_mmio_state_sub, + NULL + } +}; + +static void virtio_mmio_save_extra_state(DeviceState *opaque, QEMUFile *f) +{ + VirtIOMMIOProxy *proxy = VIRTIO_MMIO(opaque); + + vmstate_save_state(f, &vmstate_virtio_mmio, proxy, NULL); +} + +static int virtio_mmio_load_extra_state(DeviceState *opaque, QEMUFile *f) +{ + VirtIOMMIOProxy *proxy = VIRTIO_MMIO(opaque); + + return vmstate_load_state(f, &vmstate_virtio_mmio, proxy, 1); +} + +static bool virtio_mmio_has_extra_state(DeviceState *opaque) +{ + VirtIOMMIOProxy *proxy = VIRTIO_MMIO(opaque); + + return !proxy->legacy; +} + +static void virtio_mmio_reset(DeviceState *d) +{ + VirtIOMMIOProxy *proxy = VIRTIO_MMIO(d); + int i; + + virtio_mmio_soft_reset(proxy); + + proxy->host_features_sel = 0; + proxy->guest_features_sel = 0; + proxy->guest_page_shift = 0; + + if (!proxy->legacy) { + proxy->guest_features[0] = proxy->guest_features[1] = 0; + + for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { + proxy->vqs[i].num = 0; + proxy->vqs[i].desc[0] = proxy->vqs[i].desc[1] = 0; + proxy->vqs[i].avail[0] = proxy->vqs[i].avail[1] = 0; + proxy->vqs[i].used[0] = proxy->vqs[i].used[1] = 0; + } + } +} + +static int virtio_mmio_set_guest_notifier(DeviceState *d, int n, bool assign, + bool with_irqfd) +{ + VirtIOMMIOProxy *proxy = VIRTIO_MMIO(d); + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev); + VirtQueue *vq = virtio_get_queue(vdev, n); + EventNotifier *notifier = virtio_queue_get_guest_notifier(vq); + + if (assign) { + int r = event_notifier_init(notifier, 0); + if (r < 0) { + return r; + } + virtio_queue_set_guest_notifier_fd_handler(vq, true, with_irqfd); + } else { + virtio_queue_set_guest_notifier_fd_handler(vq, false, with_irqfd); + event_notifier_cleanup(notifier); + } + + if (vdc->guest_notifier_mask && vdev->use_guest_notifier_mask) { + vdc->guest_notifier_mask(vdev, n, !assign); + } + + return 0; +} + +static int virtio_mmio_set_guest_notifiers(DeviceState *d, int nvqs, + bool assign) +{ + VirtIOMMIOProxy *proxy = VIRTIO_MMIO(d); + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + /* TODO: need to check if kvm-arm supports irqfd */ + bool with_irqfd = false; + int r, n; + + nvqs = MIN(nvqs, VIRTIO_QUEUE_MAX); + + for (n = 0; n < nvqs; n++) { + if (!virtio_queue_get_num(vdev, n)) { + break; + } + + r = virtio_mmio_set_guest_notifier(d, n, assign, with_irqfd); + if (r < 0) { + goto assign_error; + } + } + + return 0; + +assign_error: + /* We get here on assignment failure. Recover by undoing for VQs 0 .. n. */ + assert(assign); + while (--n >= 0) { + virtio_mmio_set_guest_notifier(d, n, !assign, false); + } + return r; +} + +static void virtio_mmio_pre_plugged(DeviceState *d, Error **errp) +{ + VirtIOMMIOProxy *proxy = VIRTIO_MMIO(d); + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + + if (!proxy->legacy) { + virtio_add_feature(&vdev->host_features, VIRTIO_F_VERSION_1); + } +} + +/* virtio-mmio device */ + +static Property virtio_mmio_properties[] = { + DEFINE_PROP_BOOL("format_transport_address", VirtIOMMIOProxy, + format_transport_address, true), + DEFINE_PROP_BOOL("force-legacy", VirtIOMMIOProxy, legacy, true), + DEFINE_PROP_BIT("ioeventfd", VirtIOMMIOProxy, flags, + VIRTIO_IOMMIO_FLAG_USE_IOEVENTFD_BIT, true), + DEFINE_PROP_END_OF_LIST(), +}; + +static void virtio_mmio_realizefn(DeviceState *d, Error **errp) +{ + VirtIOMMIOProxy *proxy = VIRTIO_MMIO(d); + SysBusDevice *sbd = SYS_BUS_DEVICE(d); + + qbus_init(&proxy->bus, sizeof(proxy->bus), TYPE_VIRTIO_MMIO_BUS, d, NULL); + sysbus_init_irq(sbd, &proxy->irq); + + if (!kvm_eventfds_enabled()) { + proxy->flags &= ~VIRTIO_IOMMIO_FLAG_USE_IOEVENTFD; + } + + /* fd-based ioevents can't be synchronized in record/replay */ + if (replay_mode != REPLAY_MODE_NONE) { + proxy->flags &= ~VIRTIO_IOMMIO_FLAG_USE_IOEVENTFD; + } + + if (proxy->legacy) { + memory_region_init_io(&proxy->iomem, OBJECT(d), + &virtio_legacy_mem_ops, proxy, + TYPE_VIRTIO_MMIO, 0x200); + } else { + memory_region_init_io(&proxy->iomem, OBJECT(d), + &virtio_mem_ops, proxy, + TYPE_VIRTIO_MMIO, 0x200); + } + sysbus_init_mmio(sbd, &proxy->iomem); +} + +static void virtio_mmio_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->realize = virtio_mmio_realizefn; + dc->reset = virtio_mmio_reset; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + device_class_set_props(dc, virtio_mmio_properties); +} + +static const TypeInfo virtio_mmio_info = { + .name = TYPE_VIRTIO_MMIO, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(VirtIOMMIOProxy), + .class_init = virtio_mmio_class_init, +}; + +/* virtio-mmio-bus. */ + +static char *virtio_mmio_bus_get_dev_path(DeviceState *dev) +{ + BusState *virtio_mmio_bus; + VirtIOMMIOProxy *virtio_mmio_proxy; + char *proxy_path; + char *path; + MemoryRegionSection section; + + virtio_mmio_bus = qdev_get_parent_bus(dev); + virtio_mmio_proxy = VIRTIO_MMIO(virtio_mmio_bus->parent); + proxy_path = qdev_get_dev_path(DEVICE(virtio_mmio_proxy)); + + /* + * If @format_transport_address is false, then we just perform the same as + * virtio_bus_get_dev_path(): we delegate the address formatting for the + * device on the virtio-mmio bus to the bus that the virtio-mmio proxy + * (i.e., the device that implements the virtio-mmio bus) resides on. In + * this case the base address of the virtio-mmio transport will be + * invisible. + */ + if (!virtio_mmio_proxy->format_transport_address) { + return proxy_path; + } + + /* Otherwise, we append the base address of the transport. */ + section = memory_region_find(&virtio_mmio_proxy->iomem, 0, 0x200); + assert(section.mr); + + if (proxy_path) { + path = g_strdup_printf("%s/virtio-mmio@" TARGET_FMT_plx, proxy_path, + section.offset_within_address_space); + } else { + path = g_strdup_printf("virtio-mmio@" TARGET_FMT_plx, + section.offset_within_address_space); + } + memory_region_unref(section.mr); + + g_free(proxy_path); + return path; +} + +static void virtio_mmio_vmstate_change(DeviceState *d, bool running) +{ + VirtIOMMIOProxy *proxy = VIRTIO_MMIO(d); + + if (running) { + virtio_mmio_start_ioeventfd(proxy); + } else { + virtio_mmio_stop_ioeventfd(proxy); + } +} + +static void virtio_mmio_bus_class_init(ObjectClass *klass, void *data) +{ + BusClass *bus_class = BUS_CLASS(klass); + VirtioBusClass *k = VIRTIO_BUS_CLASS(klass); + + k->notify = virtio_mmio_update_irq; + k->save_config = virtio_mmio_save_config; + k->load_config = virtio_mmio_load_config; + k->save_extra_state = virtio_mmio_save_extra_state; + k->load_extra_state = virtio_mmio_load_extra_state; + k->has_extra_state = virtio_mmio_has_extra_state; + k->set_guest_notifiers = virtio_mmio_set_guest_notifiers; + k->ioeventfd_enabled = virtio_mmio_ioeventfd_enabled; + k->ioeventfd_assign = virtio_mmio_ioeventfd_assign; + k->pre_plugged = virtio_mmio_pre_plugged; + k->vmstate_change = virtio_mmio_vmstate_change; + k->has_variable_vring_alignment = true; + bus_class->max_dev = 1; + bus_class->get_dev_path = virtio_mmio_bus_get_dev_path; +} + +static const TypeInfo virtio_mmio_bus_info = { + .name = TYPE_VIRTIO_MMIO_BUS, + .parent = TYPE_VIRTIO_BUS, + .instance_size = sizeof(VirtioBusState), + .class_init = virtio_mmio_bus_class_init, +}; + +static void virtio_mmio_register_types(void) +{ + type_register_static(&virtio_mmio_bus_info); + type_register_static(&virtio_mmio_info); +} + +type_init(virtio_mmio_register_types) diff --git a/hw/virtio/virtio-net-pci.c b/hw/virtio/virtio-net-pci.c new file mode 100644 index 00000000..e03543a7 --- /dev/null +++ b/hw/virtio/virtio-net-pci.c @@ -0,0 +1,108 @@ +/* + * Virtio net PCI Bindings + * + * Copyright IBM, Corp. 2007 + * Copyright (c) 2009 CodeSourcery + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * Paul Brook <paul@codesourcery.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include "qemu/osdep.h" + +#include "hw/qdev-properties.h" +#include "hw/virtio/virtio-net.h" +#include "hw/virtio/virtio-pci.h" +#include "qapi/error.h" +#include "qemu/module.h" +#include "qom/object.h" + +typedef struct VirtIONetPCI VirtIONetPCI; + +/* + * virtio-net-pci: This extends VirtioPCIProxy. + */ +#define TYPE_VIRTIO_NET_PCI "virtio-net-pci-base" +DECLARE_INSTANCE_CHECKER(VirtIONetPCI, VIRTIO_NET_PCI, + TYPE_VIRTIO_NET_PCI) + +struct VirtIONetPCI { + VirtIOPCIProxy parent_obj; + VirtIONet vdev; +}; + +static Property virtio_net_properties[] = { + DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true), + DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, + DEV_NVECTORS_UNSPECIFIED), + DEFINE_PROP_END_OF_LIST(), +}; + +static void virtio_net_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + DeviceState *qdev = DEVICE(vpci_dev); + VirtIONetPCI *dev = VIRTIO_NET_PCI(vpci_dev); + DeviceState *vdev = DEVICE(&dev->vdev); + VirtIONet *net = VIRTIO_NET(vdev); + + if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) { + vpci_dev->nvectors = 2 * MAX(net->nic_conf.peers.queues, 1) + + 1 /* Config interrupt */ + + 1 /* Control vq */; + } + + virtio_net_set_netclient_name(&dev->vdev, qdev->id, + object_get_typename(OBJECT(qdev))); + qdev_realize(vdev, BUS(&vpci_dev->bus), errp); +} + +static void virtio_net_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); + VirtioPCIClass *vpciklass = VIRTIO_PCI_CLASS(klass); + + k->romfile = "efi-virtio.rom"; + k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; + k->device_id = PCI_DEVICE_ID_VIRTIO_NET; + k->revision = VIRTIO_PCI_ABI_VERSION; + k->class_id = PCI_CLASS_NETWORK_ETHERNET; + set_bit(DEVICE_CATEGORY_NETWORK, dc->categories); + device_class_set_props(dc, virtio_net_properties); + vpciklass->realize = virtio_net_pci_realize; +} + +static void virtio_net_pci_instance_init(Object *obj) +{ + VirtIONetPCI *dev = VIRTIO_NET_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VIRTIO_NET); + object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev), + "bootindex"); +} + +static const VirtioPCIDeviceTypeInfo virtio_net_pci_info = { + .base_name = TYPE_VIRTIO_NET_PCI, + .generic_name = "virtio-net-pci", + .transitional_name = "virtio-net-pci-transitional", + .non_transitional_name = "virtio-net-pci-non-transitional", + .instance_size = sizeof(VirtIONetPCI), + .instance_init = virtio_net_pci_instance_init, + .class_init = virtio_net_pci_class_init, +}; + +static void virtio_net_pci_register(void) +{ + virtio_pci_types_register(&virtio_net_pci_info); +} + +type_init(virtio_net_pci_register) diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c new file mode 100644 index 00000000..a1c9dfa7 --- /dev/null +++ b/hw/virtio/virtio-pci.c @@ -0,0 +1,2298 @@ +/* + * Virtio PCI Bindings + * + * Copyright IBM, Corp. 2007 + * Copyright (c) 2009 CodeSourcery + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * Paul Brook <paul@codesourcery.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include "qemu/osdep.h" + +#include "exec/memop.h" +#include "standard-headers/linux/virtio_pci.h" +#include "hw/boards.h" +#include "hw/virtio/virtio.h" +#include "migration/qemu-file-types.h" +#include "hw/pci/pci.h" +#include "hw/pci/pci_bus.h" +#include "hw/qdev-properties.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qemu/log.h" +#include "qemu/module.h" +#include "hw/pci/msi.h" +#include "hw/pci/msix.h" +#include "hw/loader.h" +#include "sysemu/kvm.h" +#include "hw/virtio/virtio-pci.h" +#include "qemu/range.h" +#include "hw/virtio/virtio-bus.h" +#include "qapi/visitor.h" +#include "sysemu/replay.h" +#include "trace.h" + +#define VIRTIO_PCI_REGION_SIZE(dev) VIRTIO_PCI_CONFIG_OFF(msix_present(dev)) + +#undef VIRTIO_PCI_CONFIG + +/* The remaining space is defined by each driver as the per-driver + * configuration space */ +#define VIRTIO_PCI_CONFIG_SIZE(dev) VIRTIO_PCI_CONFIG_OFF(msix_enabled(dev)) + +static void virtio_pci_bus_new(VirtioBusState *bus, size_t bus_size, + VirtIOPCIProxy *dev); +static void virtio_pci_reset(DeviceState *qdev); + +/* virtio device */ +/* DeviceState to VirtIOPCIProxy. For use off data-path. TODO: use QOM. */ +static inline VirtIOPCIProxy *to_virtio_pci_proxy(DeviceState *d) +{ + return container_of(d, VirtIOPCIProxy, pci_dev.qdev); +} + +/* DeviceState to VirtIOPCIProxy. Note: used on datapath, + * be careful and test performance if you change this. + */ +static inline VirtIOPCIProxy *to_virtio_pci_proxy_fast(DeviceState *d) +{ + return container_of(d, VirtIOPCIProxy, pci_dev.qdev); +} + +static void virtio_pci_notify(DeviceState *d, uint16_t vector) +{ + VirtIOPCIProxy *proxy = to_virtio_pci_proxy_fast(d); + + if (msix_enabled(&proxy->pci_dev)) { + if (vector != VIRTIO_NO_VECTOR) { + msix_notify(&proxy->pci_dev, vector); + } + } else { + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + pci_set_irq(&proxy->pci_dev, qatomic_read(&vdev->isr) & 1); + } +} + +static void virtio_pci_save_config(DeviceState *d, QEMUFile *f) +{ + VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + + pci_device_save(&proxy->pci_dev, f); + msix_save(&proxy->pci_dev, f); + if (msix_present(&proxy->pci_dev)) + qemu_put_be16(f, vdev->config_vector); +} + +static const VMStateDescription vmstate_virtio_pci_modern_queue_state = { + .name = "virtio_pci/modern_queue_state", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT16(num, VirtIOPCIQueue), + VMSTATE_UNUSED(1), /* enabled was stored as be16 */ + VMSTATE_BOOL(enabled, VirtIOPCIQueue), + VMSTATE_UINT32_ARRAY(desc, VirtIOPCIQueue, 2), + VMSTATE_UINT32_ARRAY(avail, VirtIOPCIQueue, 2), + VMSTATE_UINT32_ARRAY(used, VirtIOPCIQueue, 2), + VMSTATE_END_OF_LIST() + } +}; + +static bool virtio_pci_modern_state_needed(void *opaque) +{ + VirtIOPCIProxy *proxy = opaque; + + return virtio_pci_modern(proxy); +} + +static const VMStateDescription vmstate_virtio_pci_modern_state_sub = { + .name = "virtio_pci/modern_state", + .version_id = 1, + .minimum_version_id = 1, + .needed = &virtio_pci_modern_state_needed, + .fields = (VMStateField[]) { + VMSTATE_UINT32(dfselect, VirtIOPCIProxy), + VMSTATE_UINT32(gfselect, VirtIOPCIProxy), + VMSTATE_UINT32_ARRAY(guest_features, VirtIOPCIProxy, 2), + VMSTATE_STRUCT_ARRAY(vqs, VirtIOPCIProxy, VIRTIO_QUEUE_MAX, 0, + vmstate_virtio_pci_modern_queue_state, + VirtIOPCIQueue), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_virtio_pci = { + .name = "virtio_pci", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription*[]) { + &vmstate_virtio_pci_modern_state_sub, + NULL + } +}; + +static bool virtio_pci_has_extra_state(DeviceState *d) +{ + VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); + + return proxy->flags & VIRTIO_PCI_FLAG_MIGRATE_EXTRA; +} + +static void virtio_pci_save_extra_state(DeviceState *d, QEMUFile *f) +{ + VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); + + vmstate_save_state(f, &vmstate_virtio_pci, proxy, NULL); +} + +static int virtio_pci_load_extra_state(DeviceState *d, QEMUFile *f) +{ + VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); + + return vmstate_load_state(f, &vmstate_virtio_pci, proxy, 1); +} + +static void virtio_pci_save_queue(DeviceState *d, int n, QEMUFile *f) +{ + VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + + if (msix_present(&proxy->pci_dev)) + qemu_put_be16(f, virtio_queue_vector(vdev, n)); +} + +static int virtio_pci_load_config(DeviceState *d, QEMUFile *f) +{ + VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + uint16_t vector; + + int ret; + ret = pci_device_load(&proxy->pci_dev, f); + if (ret) { + return ret; + } + msix_unuse_all_vectors(&proxy->pci_dev); + msix_load(&proxy->pci_dev, f); + if (msix_present(&proxy->pci_dev)) { + qemu_get_be16s(f, &vector); + + if (vector != VIRTIO_NO_VECTOR && vector >= proxy->nvectors) { + return -EINVAL; + } + } else { + vector = VIRTIO_NO_VECTOR; + } + vdev->config_vector = vector; + if (vector != VIRTIO_NO_VECTOR) { + msix_vector_use(&proxy->pci_dev, vector); + } + return 0; +} + +static int virtio_pci_load_queue(DeviceState *d, int n, QEMUFile *f) +{ + VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + + uint16_t vector; + if (msix_present(&proxy->pci_dev)) { + qemu_get_be16s(f, &vector); + if (vector != VIRTIO_NO_VECTOR && vector >= proxy->nvectors) { + return -EINVAL; + } + } else { + vector = VIRTIO_NO_VECTOR; + } + virtio_queue_set_vector(vdev, n, vector); + if (vector != VIRTIO_NO_VECTOR) { + msix_vector_use(&proxy->pci_dev, vector); + } + + return 0; +} + +static bool virtio_pci_ioeventfd_enabled(DeviceState *d) +{ + VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); + + return (proxy->flags & VIRTIO_PCI_FLAG_USE_IOEVENTFD) != 0; +} + +#define QEMU_VIRTIO_PCI_QUEUE_MEM_MULT 0x1000 + +static inline int virtio_pci_queue_mem_mult(struct VirtIOPCIProxy *proxy) +{ + return (proxy->flags & VIRTIO_PCI_FLAG_PAGE_PER_VQ) ? + QEMU_VIRTIO_PCI_QUEUE_MEM_MULT : 4; +} + +static int virtio_pci_ioeventfd_assign(DeviceState *d, EventNotifier *notifier, + int n, bool assign) +{ + VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + VirtQueue *vq = virtio_get_queue(vdev, n); + bool legacy = virtio_pci_legacy(proxy); + bool modern = virtio_pci_modern(proxy); + bool fast_mmio = kvm_ioeventfd_any_length_enabled(); + bool modern_pio = proxy->flags & VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY; + MemoryRegion *modern_mr = &proxy->notify.mr; + MemoryRegion *modern_notify_mr = &proxy->notify_pio.mr; + MemoryRegion *legacy_mr = &proxy->bar; + hwaddr modern_addr = virtio_pci_queue_mem_mult(proxy) * + virtio_get_queue_index(vq); + hwaddr legacy_addr = VIRTIO_PCI_QUEUE_NOTIFY; + + if (assign) { + if (modern) { + if (fast_mmio) { + memory_region_add_eventfd(modern_mr, modern_addr, 0, + false, n, notifier); + } else { + memory_region_add_eventfd(modern_mr, modern_addr, 2, + false, n, notifier); + } + if (modern_pio) { + memory_region_add_eventfd(modern_notify_mr, 0, 2, + true, n, notifier); + } + } + if (legacy) { + memory_region_add_eventfd(legacy_mr, legacy_addr, 2, + true, n, notifier); + } + } else { + if (modern) { + if (fast_mmio) { + memory_region_del_eventfd(modern_mr, modern_addr, 0, + false, n, notifier); + } else { + memory_region_del_eventfd(modern_mr, modern_addr, 2, + false, n, notifier); + } + if (modern_pio) { + memory_region_del_eventfd(modern_notify_mr, 0, 2, + true, n, notifier); + } + } + if (legacy) { + memory_region_del_eventfd(legacy_mr, legacy_addr, 2, + true, n, notifier); + } + } + return 0; +} + +static void virtio_pci_start_ioeventfd(VirtIOPCIProxy *proxy) +{ + virtio_bus_start_ioeventfd(&proxy->bus); +} + +static void virtio_pci_stop_ioeventfd(VirtIOPCIProxy *proxy) +{ + virtio_bus_stop_ioeventfd(&proxy->bus); +} + +static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val) +{ + VirtIOPCIProxy *proxy = opaque; + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + uint16_t vector; + hwaddr pa; + + switch (addr) { + case VIRTIO_PCI_GUEST_FEATURES: + /* Guest does not negotiate properly? We have to assume nothing. */ + if (val & (1 << VIRTIO_F_BAD_FEATURE)) { + val = virtio_bus_get_vdev_bad_features(&proxy->bus); + } + virtio_set_features(vdev, val); + break; + case VIRTIO_PCI_QUEUE_PFN: + pa = (hwaddr)val << VIRTIO_PCI_QUEUE_ADDR_SHIFT; + if (pa == 0) { + virtio_pci_reset(DEVICE(proxy)); + } + else + virtio_queue_set_addr(vdev, vdev->queue_sel, pa); + break; + case VIRTIO_PCI_QUEUE_SEL: + if (val < VIRTIO_QUEUE_MAX) + vdev->queue_sel = val; + break; + case VIRTIO_PCI_QUEUE_NOTIFY: + if (val < VIRTIO_QUEUE_MAX) { + virtio_queue_notify(vdev, val); + } + break; + case VIRTIO_PCI_STATUS: + if (!(val & VIRTIO_CONFIG_S_DRIVER_OK)) { + virtio_pci_stop_ioeventfd(proxy); + } + + virtio_set_status(vdev, val & 0xFF); + + if (val & VIRTIO_CONFIG_S_DRIVER_OK) { + virtio_pci_start_ioeventfd(proxy); + } + + if (vdev->status == 0) { + virtio_pci_reset(DEVICE(proxy)); + } + + /* Linux before 2.6.34 drives the device without enabling + the PCI device bus master bit. Enable it automatically + for the guest. This is a PCI spec violation but so is + initiating DMA with bus master bit clear. */ + if (val == (VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER)) { + pci_default_write_config(&proxy->pci_dev, PCI_COMMAND, + proxy->pci_dev.config[PCI_COMMAND] | + PCI_COMMAND_MASTER, 1); + } + break; + case VIRTIO_MSI_CONFIG_VECTOR: + if (vdev->config_vector != VIRTIO_NO_VECTOR) { + msix_vector_unuse(&proxy->pci_dev, vdev->config_vector); + } + /* Make it possible for guest to discover an error took place. */ + if (val < proxy->nvectors) { + msix_vector_use(&proxy->pci_dev, val); + } else { + val = VIRTIO_NO_VECTOR; + } + vdev->config_vector = val; + break; + case VIRTIO_MSI_QUEUE_VECTOR: + vector = virtio_queue_vector(vdev, vdev->queue_sel); + if (vector != VIRTIO_NO_VECTOR) { + msix_vector_unuse(&proxy->pci_dev, vector); + } + /* Make it possible for guest to discover an error took place. */ + if (val < proxy->nvectors) { + msix_vector_use(&proxy->pci_dev, val); + } else { + val = VIRTIO_NO_VECTOR; + } + virtio_queue_set_vector(vdev, vdev->queue_sel, val); + break; + default: + qemu_log_mask(LOG_GUEST_ERROR, + "%s: unexpected address 0x%x value 0x%x\n", + __func__, addr, val); + break; + } +} + +static uint32_t virtio_ioport_read(VirtIOPCIProxy *proxy, uint32_t addr) +{ + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + uint32_t ret = 0xFFFFFFFF; + + switch (addr) { + case VIRTIO_PCI_HOST_FEATURES: + ret = vdev->host_features; + break; + case VIRTIO_PCI_GUEST_FEATURES: + ret = vdev->guest_features; + break; + case VIRTIO_PCI_QUEUE_PFN: + ret = virtio_queue_get_addr(vdev, vdev->queue_sel) + >> VIRTIO_PCI_QUEUE_ADDR_SHIFT; + break; + case VIRTIO_PCI_QUEUE_NUM: + ret = virtio_queue_get_num(vdev, vdev->queue_sel); + break; + case VIRTIO_PCI_QUEUE_SEL: + ret = vdev->queue_sel; + break; + case VIRTIO_PCI_STATUS: + ret = vdev->status; + break; + case VIRTIO_PCI_ISR: + /* reading from the ISR also clears it. */ + ret = qatomic_xchg(&vdev->isr, 0); + pci_irq_deassert(&proxy->pci_dev); + break; + case VIRTIO_MSI_CONFIG_VECTOR: + ret = vdev->config_vector; + break; + case VIRTIO_MSI_QUEUE_VECTOR: + ret = virtio_queue_vector(vdev, vdev->queue_sel); + break; + default: + break; + } + + return ret; +} + +static uint64_t virtio_pci_config_read(void *opaque, hwaddr addr, + unsigned size) +{ + VirtIOPCIProxy *proxy = opaque; + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + uint32_t config = VIRTIO_PCI_CONFIG_SIZE(&proxy->pci_dev); + uint64_t val = 0; + + if (vdev == NULL) { + return UINT64_MAX; + } + + if (addr < config) { + return virtio_ioport_read(proxy, addr); + } + addr -= config; + + switch (size) { + case 1: + val = virtio_config_readb(vdev, addr); + break; + case 2: + val = virtio_config_readw(vdev, addr); + if (virtio_is_big_endian(vdev)) { + val = bswap16(val); + } + break; + case 4: + val = virtio_config_readl(vdev, addr); + if (virtio_is_big_endian(vdev)) { + val = bswap32(val); + } + break; + } + return val; +} + +static void virtio_pci_config_write(void *opaque, hwaddr addr, + uint64_t val, unsigned size) +{ + VirtIOPCIProxy *proxy = opaque; + uint32_t config = VIRTIO_PCI_CONFIG_SIZE(&proxy->pci_dev); + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + + if (vdev == NULL) { + return; + } + + if (addr < config) { + virtio_ioport_write(proxy, addr, val); + return; + } + addr -= config; + /* + * Virtio-PCI is odd. Ioports are LE but config space is target native + * endian. + */ + switch (size) { + case 1: + virtio_config_writeb(vdev, addr, val); + break; + case 2: + if (virtio_is_big_endian(vdev)) { + val = bswap16(val); + } + virtio_config_writew(vdev, addr, val); + break; + case 4: + if (virtio_is_big_endian(vdev)) { + val = bswap32(val); + } + virtio_config_writel(vdev, addr, val); + break; + } +} + +static const MemoryRegionOps virtio_pci_config_ops = { + .read = virtio_pci_config_read, + .write = virtio_pci_config_write, + .impl = { + .min_access_size = 1, + .max_access_size = 4, + }, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static MemoryRegion *virtio_address_space_lookup(VirtIOPCIProxy *proxy, + hwaddr *off, int len) +{ + int i; + VirtIOPCIRegion *reg; + + for (i = 0; i < ARRAY_SIZE(proxy->regs); ++i) { + reg = &proxy->regs[i]; + if (*off >= reg->offset && + *off + len <= reg->offset + reg->size) { + *off -= reg->offset; + return ®->mr; + } + } + + return NULL; +} + +/* Below are generic functions to do memcpy from/to an address space, + * without byteswaps, with input validation. + * + * As regular address_space_* APIs all do some kind of byteswap at least for + * some host/target combinations, we are forced to explicitly convert to a + * known-endianness integer value. + * It doesn't really matter which endian format to go through, so the code + * below selects the endian that causes the least amount of work on the given + * host. + * + * Note: host pointer must be aligned. + */ +static +void virtio_address_space_write(VirtIOPCIProxy *proxy, hwaddr addr, + const uint8_t *buf, int len) +{ + uint64_t val; + MemoryRegion *mr; + + /* address_space_* APIs assume an aligned address. + * As address is under guest control, handle illegal values. + */ + addr &= ~(len - 1); + + mr = virtio_address_space_lookup(proxy, &addr, len); + if (!mr) { + return; + } + + /* Make sure caller aligned buf properly */ + assert(!(((uintptr_t)buf) & (len - 1))); + + switch (len) { + case 1: + val = pci_get_byte(buf); + break; + case 2: + val = pci_get_word(buf); + break; + case 4: + val = pci_get_long(buf); + break; + default: + /* As length is under guest control, handle illegal values. */ + return; + } + memory_region_dispatch_write(mr, addr, val, size_memop(len) | MO_LE, + MEMTXATTRS_UNSPECIFIED); +} + +static void +virtio_address_space_read(VirtIOPCIProxy *proxy, hwaddr addr, + uint8_t *buf, int len) +{ + uint64_t val; + MemoryRegion *mr; + + /* address_space_* APIs assume an aligned address. + * As address is under guest control, handle illegal values. + */ + addr &= ~(len - 1); + + mr = virtio_address_space_lookup(proxy, &addr, len); + if (!mr) { + return; + } + + /* Make sure caller aligned buf properly */ + assert(!(((uintptr_t)buf) & (len - 1))); + + memory_region_dispatch_read(mr, addr, &val, size_memop(len) | MO_LE, + MEMTXATTRS_UNSPECIFIED); + switch (len) { + case 1: + pci_set_byte(buf, val); + break; + case 2: + pci_set_word(buf, val); + break; + case 4: + pci_set_long(buf, val); + break; + default: + /* As length is under guest control, handle illegal values. */ + break; + } +} + +static void virtio_write_config(PCIDevice *pci_dev, uint32_t address, + uint32_t val, int len) +{ + VirtIOPCIProxy *proxy = VIRTIO_PCI(pci_dev); + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + struct virtio_pci_cfg_cap *cfg; + + pci_default_write_config(pci_dev, address, val, len); + + if (proxy->flags & VIRTIO_PCI_FLAG_INIT_FLR) { + pcie_cap_flr_write_config(pci_dev, address, val, len); + } + + if (range_covers_byte(address, len, PCI_COMMAND)) { + if (!(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) { + virtio_set_disabled(vdev, true); + virtio_pci_stop_ioeventfd(proxy); + virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK); + } else { + virtio_set_disabled(vdev, false); + } + } + + if (proxy->config_cap && + ranges_overlap(address, len, proxy->config_cap + offsetof(struct virtio_pci_cfg_cap, + pci_cfg_data), + sizeof cfg->pci_cfg_data)) { + uint32_t off; + uint32_t len; + + cfg = (void *)(proxy->pci_dev.config + proxy->config_cap); + off = le32_to_cpu(cfg->cap.offset); + len = le32_to_cpu(cfg->cap.length); + + if (len == 1 || len == 2 || len == 4) { + assert(len <= sizeof cfg->pci_cfg_data); + virtio_address_space_write(proxy, off, cfg->pci_cfg_data, len); + } + } +} + +static uint32_t virtio_read_config(PCIDevice *pci_dev, + uint32_t address, int len) +{ + VirtIOPCIProxy *proxy = VIRTIO_PCI(pci_dev); + struct virtio_pci_cfg_cap *cfg; + + if (proxy->config_cap && + ranges_overlap(address, len, proxy->config_cap + offsetof(struct virtio_pci_cfg_cap, + pci_cfg_data), + sizeof cfg->pci_cfg_data)) { + uint32_t off; + uint32_t len; + + cfg = (void *)(proxy->pci_dev.config + proxy->config_cap); + off = le32_to_cpu(cfg->cap.offset); + len = le32_to_cpu(cfg->cap.length); + + if (len == 1 || len == 2 || len == 4) { + assert(len <= sizeof cfg->pci_cfg_data); + virtio_address_space_read(proxy, off, cfg->pci_cfg_data, len); + } + } + + return pci_default_read_config(pci_dev, address, len); +} + +static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, + unsigned int queue_no, + unsigned int vector) +{ + VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector]; + int ret; + + if (irqfd->users == 0) { + KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state); + ret = kvm_irqchip_add_msi_route(&c, vector, &proxy->pci_dev); + if (ret < 0) { + return ret; + } + kvm_irqchip_commit_route_changes(&c); + irqfd->virq = ret; + } + irqfd->users++; + return 0; +} + +static void kvm_virtio_pci_vq_vector_release(VirtIOPCIProxy *proxy, + unsigned int vector) +{ + VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector]; + if (--irqfd->users == 0) { + kvm_irqchip_release_virq(kvm_state, irqfd->virq); + } +} + +static int kvm_virtio_pci_irqfd_use(VirtIOPCIProxy *proxy, + unsigned int queue_no, + unsigned int vector) +{ + VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector]; + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + VirtQueue *vq = virtio_get_queue(vdev, queue_no); + EventNotifier *n = virtio_queue_get_guest_notifier(vq); + return kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL, irqfd->virq); +} + +static void kvm_virtio_pci_irqfd_release(VirtIOPCIProxy *proxy, + unsigned int queue_no, + unsigned int vector) +{ + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + VirtQueue *vq = virtio_get_queue(vdev, queue_no); + EventNotifier *n = virtio_queue_get_guest_notifier(vq); + VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector]; + int ret; + + ret = kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, n, irqfd->virq); + assert(ret == 0); +} + +static int kvm_virtio_pci_vector_use(VirtIOPCIProxy *proxy, int nvqs) +{ + PCIDevice *dev = &proxy->pci_dev; + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + unsigned int vector; + int ret, queue_no; + + for (queue_no = 0; queue_no < nvqs; queue_no++) { + if (!virtio_queue_get_num(vdev, queue_no)) { + break; + } + vector = virtio_queue_vector(vdev, queue_no); + if (vector >= msix_nr_vectors_allocated(dev)) { + continue; + } + ret = kvm_virtio_pci_vq_vector_use(proxy, queue_no, vector); + if (ret < 0) { + goto undo; + } + /* If guest supports masking, set up irqfd now. + * Otherwise, delay until unmasked in the frontend. + */ + if (vdev->use_guest_notifier_mask && k->guest_notifier_mask) { + ret = kvm_virtio_pci_irqfd_use(proxy, queue_no, vector); + if (ret < 0) { + kvm_virtio_pci_vq_vector_release(proxy, vector); + goto undo; + } + } + } + return 0; + +undo: + while (--queue_no >= 0) { + vector = virtio_queue_vector(vdev, queue_no); + if (vector >= msix_nr_vectors_allocated(dev)) { + continue; + } + if (vdev->use_guest_notifier_mask && k->guest_notifier_mask) { + kvm_virtio_pci_irqfd_release(proxy, queue_no, vector); + } + kvm_virtio_pci_vq_vector_release(proxy, vector); + } + return ret; +} + +static void kvm_virtio_pci_vector_release(VirtIOPCIProxy *proxy, int nvqs) +{ + PCIDevice *dev = &proxy->pci_dev; + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + unsigned int vector; + int queue_no; + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + + for (queue_no = 0; queue_no < nvqs; queue_no++) { + if (!virtio_queue_get_num(vdev, queue_no)) { + break; + } + vector = virtio_queue_vector(vdev, queue_no); + if (vector >= msix_nr_vectors_allocated(dev)) { + continue; + } + /* If guest supports masking, clean up irqfd now. + * Otherwise, it was cleaned when masked in the frontend. + */ + if (vdev->use_guest_notifier_mask && k->guest_notifier_mask) { + kvm_virtio_pci_irqfd_release(proxy, queue_no, vector); + } + kvm_virtio_pci_vq_vector_release(proxy, vector); + } +} + +static int virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy, + unsigned int queue_no, + unsigned int vector, + MSIMessage msg) +{ + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + VirtQueue *vq = virtio_get_queue(vdev, queue_no); + EventNotifier *n = virtio_queue_get_guest_notifier(vq); + VirtIOIRQFD *irqfd; + int ret = 0; + + if (proxy->vector_irqfd) { + irqfd = &proxy->vector_irqfd[vector]; + if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { + ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg, + &proxy->pci_dev); + if (ret < 0) { + return ret; + } + kvm_irqchip_commit_routes(kvm_state); + } + } + + /* If guest supports masking, irqfd is already setup, unmask it. + * Otherwise, set it up now. + */ + if (vdev->use_guest_notifier_mask && k->guest_notifier_mask) { + k->guest_notifier_mask(vdev, queue_no, false); + /* Test after unmasking to avoid losing events. */ + if (k->guest_notifier_pending && + k->guest_notifier_pending(vdev, queue_no)) { + event_notifier_set(n); + } + } else { + ret = kvm_virtio_pci_irqfd_use(proxy, queue_no, vector); + } + return ret; +} + +static void virtio_pci_vq_vector_mask(VirtIOPCIProxy *proxy, + unsigned int queue_no, + unsigned int vector) +{ + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + + /* If guest supports masking, keep irqfd but mask it. + * Otherwise, clean it up now. + */ + if (vdev->use_guest_notifier_mask && k->guest_notifier_mask) { + k->guest_notifier_mask(vdev, queue_no, true); + } else { + kvm_virtio_pci_irqfd_release(proxy, queue_no, vector); + } +} + +static int virtio_pci_vector_unmask(PCIDevice *dev, unsigned vector, + MSIMessage msg) +{ + VirtIOPCIProxy *proxy = container_of(dev, VirtIOPCIProxy, pci_dev); + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + VirtQueue *vq = virtio_vector_first_queue(vdev, vector); + int ret, index, unmasked = 0; + + while (vq) { + index = virtio_get_queue_index(vq); + if (!virtio_queue_get_num(vdev, index)) { + break; + } + if (index < proxy->nvqs_with_notifiers) { + ret = virtio_pci_vq_vector_unmask(proxy, index, vector, msg); + if (ret < 0) { + goto undo; + } + ++unmasked; + } + vq = virtio_vector_next_queue(vq); + } + + return 0; + +undo: + vq = virtio_vector_first_queue(vdev, vector); + while (vq && unmasked >= 0) { + index = virtio_get_queue_index(vq); + if (index < proxy->nvqs_with_notifiers) { + virtio_pci_vq_vector_mask(proxy, index, vector); + --unmasked; + } + vq = virtio_vector_next_queue(vq); + } + return ret; +} + +static void virtio_pci_vector_mask(PCIDevice *dev, unsigned vector) +{ + VirtIOPCIProxy *proxy = container_of(dev, VirtIOPCIProxy, pci_dev); + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + VirtQueue *vq = virtio_vector_first_queue(vdev, vector); + int index; + + while (vq) { + index = virtio_get_queue_index(vq); + if (!virtio_queue_get_num(vdev, index)) { + break; + } + if (index < proxy->nvqs_with_notifiers) { + virtio_pci_vq_vector_mask(proxy, index, vector); + } + vq = virtio_vector_next_queue(vq); + } +} + +static void virtio_pci_vector_poll(PCIDevice *dev, + unsigned int vector_start, + unsigned int vector_end) +{ + VirtIOPCIProxy *proxy = container_of(dev, VirtIOPCIProxy, pci_dev); + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + int queue_no; + unsigned int vector; + EventNotifier *notifier; + VirtQueue *vq; + + for (queue_no = 0; queue_no < proxy->nvqs_with_notifiers; queue_no++) { + if (!virtio_queue_get_num(vdev, queue_no)) { + break; + } + vector = virtio_queue_vector(vdev, queue_no); + if (vector < vector_start || vector >= vector_end || + !msix_is_masked(dev, vector)) { + continue; + } + vq = virtio_get_queue(vdev, queue_no); + notifier = virtio_queue_get_guest_notifier(vq); + if (k->guest_notifier_pending) { + if (k->guest_notifier_pending(vdev, queue_no)) { + msix_set_pending(dev, vector); + } + } else if (event_notifier_test_and_clear(notifier)) { + msix_set_pending(dev, vector); + } + } +} + +static int virtio_pci_set_guest_notifier(DeviceState *d, int n, bool assign, + bool with_irqfd) +{ + VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev); + VirtQueue *vq = virtio_get_queue(vdev, n); + EventNotifier *notifier = virtio_queue_get_guest_notifier(vq); + + if (assign) { + int r = event_notifier_init(notifier, 0); + if (r < 0) { + return r; + } + virtio_queue_set_guest_notifier_fd_handler(vq, true, with_irqfd); + } else { + virtio_queue_set_guest_notifier_fd_handler(vq, false, with_irqfd); + event_notifier_cleanup(notifier); + } + + if (!msix_enabled(&proxy->pci_dev) && + vdev->use_guest_notifier_mask && + vdc->guest_notifier_mask) { + vdc->guest_notifier_mask(vdev, n, !assign); + } + + return 0; +} + +static bool virtio_pci_query_guest_notifiers(DeviceState *d) +{ + VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); + return msix_enabled(&proxy->pci_dev); +} + +static int virtio_pci_set_guest_notifiers(DeviceState *d, int nvqs, bool assign) +{ + VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + int r, n; + bool with_irqfd = msix_enabled(&proxy->pci_dev) && + kvm_msi_via_irqfd_enabled(); + + nvqs = MIN(nvqs, VIRTIO_QUEUE_MAX); + + /* + * When deassigning, pass a consistent nvqs value to avoid leaking + * notifiers. But first check we've actually been configured, exit + * early if we haven't. + */ + if (!assign && !proxy->nvqs_with_notifiers) { + return 0; + } + assert(assign || nvqs == proxy->nvqs_with_notifiers); + + proxy->nvqs_with_notifiers = nvqs; + + /* Must unset vector notifier while guest notifier is still assigned */ + if ((proxy->vector_irqfd || k->guest_notifier_mask) && !assign) { + msix_unset_vector_notifiers(&proxy->pci_dev); + if (proxy->vector_irqfd) { + kvm_virtio_pci_vector_release(proxy, nvqs); + g_free(proxy->vector_irqfd); + proxy->vector_irqfd = NULL; + } + } + + for (n = 0; n < nvqs; n++) { + if (!virtio_queue_get_num(vdev, n)) { + break; + } + + r = virtio_pci_set_guest_notifier(d, n, assign, with_irqfd); + if (r < 0) { + goto assign_error; + } + } + + /* Must set vector notifier after guest notifier has been assigned */ + if ((with_irqfd || k->guest_notifier_mask) && assign) { + if (with_irqfd) { + proxy->vector_irqfd = + g_malloc0(sizeof(*proxy->vector_irqfd) * + msix_nr_vectors_allocated(&proxy->pci_dev)); + r = kvm_virtio_pci_vector_use(proxy, nvqs); + if (r < 0) { + goto assign_error; + } + } + r = msix_set_vector_notifiers(&proxy->pci_dev, + virtio_pci_vector_unmask, + virtio_pci_vector_mask, + virtio_pci_vector_poll); + if (r < 0) { + goto notifiers_error; + } + } + + return 0; + +notifiers_error: + if (with_irqfd) { + assert(assign); + kvm_virtio_pci_vector_release(proxy, nvqs); + } + +assign_error: + /* We get here on assignment failure. Recover by undoing for VQs 0 .. n. */ + assert(assign); + while (--n >= 0) { + virtio_pci_set_guest_notifier(d, n, !assign, with_irqfd); + } + return r; +} + +static int virtio_pci_set_host_notifier_mr(DeviceState *d, int n, + MemoryRegion *mr, bool assign) +{ + VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); + int offset; + + if (n >= VIRTIO_QUEUE_MAX || !virtio_pci_modern(proxy) || + virtio_pci_queue_mem_mult(proxy) != memory_region_size(mr)) { + return -1; + } + + if (assign) { + offset = virtio_pci_queue_mem_mult(proxy) * n; + memory_region_add_subregion_overlap(&proxy->notify.mr, offset, mr, 1); + } else { + memory_region_del_subregion(&proxy->notify.mr, mr); + } + + return 0; +} + +static void virtio_pci_vmstate_change(DeviceState *d, bool running) +{ + VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + + if (running) { + /* Old QEMU versions did not set bus master enable on status write. + * Detect DRIVER set and enable it. + */ + if ((proxy->flags & VIRTIO_PCI_FLAG_BUS_MASTER_BUG_MIGRATION) && + (vdev->status & VIRTIO_CONFIG_S_DRIVER) && + !(proxy->pci_dev.config[PCI_COMMAND] & PCI_COMMAND_MASTER)) { + pci_default_write_config(&proxy->pci_dev, PCI_COMMAND, + proxy->pci_dev.config[PCI_COMMAND] | + PCI_COMMAND_MASTER, 1); + } + virtio_pci_start_ioeventfd(proxy); + } else { + virtio_pci_stop_ioeventfd(proxy); + } +} + +/* + * virtio-pci: This is the PCIDevice which has a virtio-pci-bus. + */ + +static int virtio_pci_query_nvectors(DeviceState *d) +{ + VirtIOPCIProxy *proxy = VIRTIO_PCI(d); + + return proxy->nvectors; +} + +static AddressSpace *virtio_pci_get_dma_as(DeviceState *d) +{ + VirtIOPCIProxy *proxy = VIRTIO_PCI(d); + PCIDevice *dev = &proxy->pci_dev; + + return pci_get_address_space(dev); +} + +static bool virtio_pci_iommu_enabled(DeviceState *d) +{ + VirtIOPCIProxy *proxy = VIRTIO_PCI(d); + PCIDevice *dev = &proxy->pci_dev; + AddressSpace *dma_as = pci_device_iommu_address_space(dev); + + if (dma_as == &address_space_memory) { + return false; + } + + return true; +} + +static bool virtio_pci_queue_enabled(DeviceState *d, int n) +{ + VirtIOPCIProxy *proxy = VIRTIO_PCI(d); + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + + if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { + return proxy->vqs[n].enabled; + } + + return virtio_queue_enabled_legacy(vdev, n); +} + +static int virtio_pci_add_mem_cap(VirtIOPCIProxy *proxy, + struct virtio_pci_cap *cap) +{ + PCIDevice *dev = &proxy->pci_dev; + int offset; + + offset = pci_add_capability(dev, PCI_CAP_ID_VNDR, 0, + cap->cap_len, &error_abort); + + assert(cap->cap_len >= sizeof *cap); + memcpy(dev->config + offset + PCI_CAP_FLAGS, &cap->cap_len, + cap->cap_len - PCI_CAP_FLAGS); + + return offset; +} + +static uint64_t virtio_pci_common_read(void *opaque, hwaddr addr, + unsigned size) +{ + VirtIOPCIProxy *proxy = opaque; + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + uint32_t val = 0; + int i; + + if (vdev == NULL) { + return UINT64_MAX; + } + + switch (addr) { + case VIRTIO_PCI_COMMON_DFSELECT: + val = proxy->dfselect; + break; + case VIRTIO_PCI_COMMON_DF: + if (proxy->dfselect <= 1) { + VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev); + + val = (vdev->host_features & ~vdc->legacy_features) >> + (32 * proxy->dfselect); + } + break; + case VIRTIO_PCI_COMMON_GFSELECT: + val = proxy->gfselect; + break; + case VIRTIO_PCI_COMMON_GF: + if (proxy->gfselect < ARRAY_SIZE(proxy->guest_features)) { + val = proxy->guest_features[proxy->gfselect]; + } + break; + case VIRTIO_PCI_COMMON_MSIX: + val = vdev->config_vector; + break; + case VIRTIO_PCI_COMMON_NUMQ: + for (i = 0; i < VIRTIO_QUEUE_MAX; ++i) { + if (virtio_queue_get_num(vdev, i)) { + val = i + 1; + } + } + break; + case VIRTIO_PCI_COMMON_STATUS: + val = vdev->status; + break; + case VIRTIO_PCI_COMMON_CFGGENERATION: + val = vdev->generation; + break; + case VIRTIO_PCI_COMMON_Q_SELECT: + val = vdev->queue_sel; + break; + case VIRTIO_PCI_COMMON_Q_SIZE: + val = virtio_queue_get_num(vdev, vdev->queue_sel); + break; + case VIRTIO_PCI_COMMON_Q_MSIX: + val = virtio_queue_vector(vdev, vdev->queue_sel); + break; + case VIRTIO_PCI_COMMON_Q_ENABLE: + val = proxy->vqs[vdev->queue_sel].enabled; + break; + case VIRTIO_PCI_COMMON_Q_NOFF: + /* Simply map queues in order */ + val = vdev->queue_sel; + break; + case VIRTIO_PCI_COMMON_Q_DESCLO: + val = proxy->vqs[vdev->queue_sel].desc[0]; + break; + case VIRTIO_PCI_COMMON_Q_DESCHI: + val = proxy->vqs[vdev->queue_sel].desc[1]; + break; + case VIRTIO_PCI_COMMON_Q_AVAILLO: + val = proxy->vqs[vdev->queue_sel].avail[0]; + break; + case VIRTIO_PCI_COMMON_Q_AVAILHI: + val = proxy->vqs[vdev->queue_sel].avail[1]; + break; + case VIRTIO_PCI_COMMON_Q_USEDLO: + val = proxy->vqs[vdev->queue_sel].used[0]; + break; + case VIRTIO_PCI_COMMON_Q_USEDHI: + val = proxy->vqs[vdev->queue_sel].used[1]; + break; + case VIRTIO_PCI_COMMON_Q_RESET: + val = proxy->vqs[vdev->queue_sel].reset; + break; + default: + val = 0; + } + + return val; +} + +static void virtio_pci_common_write(void *opaque, hwaddr addr, + uint64_t val, unsigned size) +{ + VirtIOPCIProxy *proxy = opaque; + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + uint16_t vector; + + if (vdev == NULL) { + return; + } + + switch (addr) { + case VIRTIO_PCI_COMMON_DFSELECT: + proxy->dfselect = val; + break; + case VIRTIO_PCI_COMMON_GFSELECT: + proxy->gfselect = val; + break; + case VIRTIO_PCI_COMMON_GF: + if (proxy->gfselect < ARRAY_SIZE(proxy->guest_features)) { + proxy->guest_features[proxy->gfselect] = val; + virtio_set_features(vdev, + (((uint64_t)proxy->guest_features[1]) << 32) | + proxy->guest_features[0]); + } + break; + case VIRTIO_PCI_COMMON_MSIX: + if (vdev->config_vector != VIRTIO_NO_VECTOR) { + msix_vector_unuse(&proxy->pci_dev, vdev->config_vector); + } + /* Make it possible for guest to discover an error took place. */ + if (val < proxy->nvectors) { + msix_vector_use(&proxy->pci_dev, val); + } else { + val = VIRTIO_NO_VECTOR; + } + vdev->config_vector = val; + break; + case VIRTIO_PCI_COMMON_STATUS: + if (!(val & VIRTIO_CONFIG_S_DRIVER_OK)) { + virtio_pci_stop_ioeventfd(proxy); + } + + virtio_set_status(vdev, val & 0xFF); + + if (val & VIRTIO_CONFIG_S_DRIVER_OK) { + virtio_pci_start_ioeventfd(proxy); + } + + if (vdev->status == 0) { + virtio_pci_reset(DEVICE(proxy)); + } + + break; + case VIRTIO_PCI_COMMON_Q_SELECT: + if (val < VIRTIO_QUEUE_MAX) { + vdev->queue_sel = val; + } + break; + case VIRTIO_PCI_COMMON_Q_SIZE: + proxy->vqs[vdev->queue_sel].num = val; + virtio_queue_set_num(vdev, vdev->queue_sel, + proxy->vqs[vdev->queue_sel].num); + break; + case VIRTIO_PCI_COMMON_Q_MSIX: + vector = virtio_queue_vector(vdev, vdev->queue_sel); + if (vector != VIRTIO_NO_VECTOR) { + msix_vector_unuse(&proxy->pci_dev, vector); + } + /* Make it possible for guest to discover an error took place. */ + if (val < proxy->nvectors) { + msix_vector_use(&proxy->pci_dev, val); + } else { + val = VIRTIO_NO_VECTOR; + } + virtio_queue_set_vector(vdev, vdev->queue_sel, val); + break; + case VIRTIO_PCI_COMMON_Q_ENABLE: + if (val == 1) { + virtio_queue_set_num(vdev, vdev->queue_sel, + proxy->vqs[vdev->queue_sel].num); + virtio_queue_set_rings(vdev, vdev->queue_sel, + ((uint64_t)proxy->vqs[vdev->queue_sel].desc[1]) << 32 | + proxy->vqs[vdev->queue_sel].desc[0], + ((uint64_t)proxy->vqs[vdev->queue_sel].avail[1]) << 32 | + proxy->vqs[vdev->queue_sel].avail[0], + ((uint64_t)proxy->vqs[vdev->queue_sel].used[1]) << 32 | + proxy->vqs[vdev->queue_sel].used[0]); + proxy->vqs[vdev->queue_sel].enabled = 1; + proxy->vqs[vdev->queue_sel].reset = 0; + virtio_queue_enable(vdev, vdev->queue_sel); + } else { + virtio_error(vdev, "wrong value for queue_enable %"PRIx64, val); + } + break; + case VIRTIO_PCI_COMMON_Q_DESCLO: + proxy->vqs[vdev->queue_sel].desc[0] = val; + break; + case VIRTIO_PCI_COMMON_Q_DESCHI: + proxy->vqs[vdev->queue_sel].desc[1] = val; + break; + case VIRTIO_PCI_COMMON_Q_AVAILLO: + proxy->vqs[vdev->queue_sel].avail[0] = val; + break; + case VIRTIO_PCI_COMMON_Q_AVAILHI: + proxy->vqs[vdev->queue_sel].avail[1] = val; + break; + case VIRTIO_PCI_COMMON_Q_USEDLO: + proxy->vqs[vdev->queue_sel].used[0] = val; + break; + case VIRTIO_PCI_COMMON_Q_USEDHI: + proxy->vqs[vdev->queue_sel].used[1] = val; + break; + case VIRTIO_PCI_COMMON_Q_RESET: + if (val == 1) { + proxy->vqs[vdev->queue_sel].reset = 1; + + virtio_queue_reset(vdev, vdev->queue_sel); + + proxy->vqs[vdev->queue_sel].reset = 0; + proxy->vqs[vdev->queue_sel].enabled = 0; + } + break; + default: + break; + } +} + + +static uint64_t virtio_pci_notify_read(void *opaque, hwaddr addr, + unsigned size) +{ + VirtIOPCIProxy *proxy = opaque; + if (virtio_bus_get_device(&proxy->bus) == NULL) { + return UINT64_MAX; + } + + return 0; +} + +static void virtio_pci_notify_write(void *opaque, hwaddr addr, + uint64_t val, unsigned size) +{ + VirtIOPCIProxy *proxy = opaque; + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + + unsigned queue = addr / virtio_pci_queue_mem_mult(proxy); + + if (vdev != NULL && queue < VIRTIO_QUEUE_MAX) { + trace_virtio_pci_notify_write(addr, val, size); + virtio_queue_notify(vdev, queue); + } +} + +static void virtio_pci_notify_write_pio(void *opaque, hwaddr addr, + uint64_t val, unsigned size) +{ + VirtIOPCIProxy *proxy = opaque; + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + + unsigned queue = val; + + if (vdev != NULL && queue < VIRTIO_QUEUE_MAX) { + trace_virtio_pci_notify_write_pio(addr, val, size); + virtio_queue_notify(vdev, queue); + } +} + +static uint64_t virtio_pci_isr_read(void *opaque, hwaddr addr, + unsigned size) +{ + VirtIOPCIProxy *proxy = opaque; + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + uint64_t val; + + if (vdev == NULL) { + return UINT64_MAX; + } + + val = qatomic_xchg(&vdev->isr, 0); + pci_irq_deassert(&proxy->pci_dev); + return val; +} + +static void virtio_pci_isr_write(void *opaque, hwaddr addr, + uint64_t val, unsigned size) +{ +} + +static uint64_t virtio_pci_device_read(void *opaque, hwaddr addr, + unsigned size) +{ + VirtIOPCIProxy *proxy = opaque; + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + uint64_t val; + + if (vdev == NULL) { + return UINT64_MAX; + } + + switch (size) { + case 1: + val = virtio_config_modern_readb(vdev, addr); + break; + case 2: + val = virtio_config_modern_readw(vdev, addr); + break; + case 4: + val = virtio_config_modern_readl(vdev, addr); + break; + default: + val = 0; + break; + } + return val; +} + +static void virtio_pci_device_write(void *opaque, hwaddr addr, + uint64_t val, unsigned size) +{ + VirtIOPCIProxy *proxy = opaque; + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + + if (vdev == NULL) { + return; + } + + switch (size) { + case 1: + virtio_config_modern_writeb(vdev, addr, val); + break; + case 2: + virtio_config_modern_writew(vdev, addr, val); + break; + case 4: + virtio_config_modern_writel(vdev, addr, val); + break; + } +} + +static void virtio_pci_modern_regions_init(VirtIOPCIProxy *proxy, + const char *vdev_name) +{ + static const MemoryRegionOps common_ops = { + .read = virtio_pci_common_read, + .write = virtio_pci_common_write, + .impl = { + .min_access_size = 1, + .max_access_size = 4, + }, + .endianness = DEVICE_LITTLE_ENDIAN, + }; + static const MemoryRegionOps isr_ops = { + .read = virtio_pci_isr_read, + .write = virtio_pci_isr_write, + .impl = { + .min_access_size = 1, + .max_access_size = 4, + }, + .endianness = DEVICE_LITTLE_ENDIAN, + }; + static const MemoryRegionOps device_ops = { + .read = virtio_pci_device_read, + .write = virtio_pci_device_write, + .impl = { + .min_access_size = 1, + .max_access_size = 4, + }, + .endianness = DEVICE_LITTLE_ENDIAN, + }; + static const MemoryRegionOps notify_ops = { + .read = virtio_pci_notify_read, + .write = virtio_pci_notify_write, + .impl = { + .min_access_size = 1, + .max_access_size = 4, + }, + .endianness = DEVICE_LITTLE_ENDIAN, + }; + static const MemoryRegionOps notify_pio_ops = { + .read = virtio_pci_notify_read, + .write = virtio_pci_notify_write_pio, + .impl = { + .min_access_size = 1, + .max_access_size = 4, + }, + .endianness = DEVICE_LITTLE_ENDIAN, + }; + g_autoptr(GString) name = g_string_new(NULL); + + g_string_printf(name, "virtio-pci-common-%s", vdev_name); + memory_region_init_io(&proxy->common.mr, OBJECT(proxy), + &common_ops, + proxy, + name->str, + proxy->common.size); + + g_string_printf(name, "virtio-pci-isr-%s", vdev_name); + memory_region_init_io(&proxy->isr.mr, OBJECT(proxy), + &isr_ops, + proxy, + name->str, + proxy->isr.size); + + g_string_printf(name, "virtio-pci-device-%s", vdev_name); + memory_region_init_io(&proxy->device.mr, OBJECT(proxy), + &device_ops, + proxy, + name->str, + proxy->device.size); + + g_string_printf(name, "virtio-pci-notify-%s", vdev_name); + memory_region_init_io(&proxy->notify.mr, OBJECT(proxy), + ¬ify_ops, + proxy, + name->str, + proxy->notify.size); + + g_string_printf(name, "virtio-pci-notify-pio-%s", vdev_name); + memory_region_init_io(&proxy->notify_pio.mr, OBJECT(proxy), + ¬ify_pio_ops, + proxy, + name->str, + proxy->notify_pio.size); +} + +static void virtio_pci_modern_region_map(VirtIOPCIProxy *proxy, + VirtIOPCIRegion *region, + struct virtio_pci_cap *cap, + MemoryRegion *mr, + uint8_t bar) +{ + memory_region_add_subregion(mr, region->offset, ®ion->mr); + + cap->cfg_type = region->type; + cap->bar = bar; + cap->offset = cpu_to_le32(region->offset); + cap->length = cpu_to_le32(region->size); + virtio_pci_add_mem_cap(proxy, cap); + +} + +static void virtio_pci_modern_mem_region_map(VirtIOPCIProxy *proxy, + VirtIOPCIRegion *region, + struct virtio_pci_cap *cap) +{ + virtio_pci_modern_region_map(proxy, region, cap, + &proxy->modern_bar, proxy->modern_mem_bar_idx); +} + +static void virtio_pci_modern_io_region_map(VirtIOPCIProxy *proxy, + VirtIOPCIRegion *region, + struct virtio_pci_cap *cap) +{ + virtio_pci_modern_region_map(proxy, region, cap, + &proxy->io_bar, proxy->modern_io_bar_idx); +} + +static void virtio_pci_modern_mem_region_unmap(VirtIOPCIProxy *proxy, + VirtIOPCIRegion *region) +{ + memory_region_del_subregion(&proxy->modern_bar, + ®ion->mr); +} + +static void virtio_pci_modern_io_region_unmap(VirtIOPCIProxy *proxy, + VirtIOPCIRegion *region) +{ + memory_region_del_subregion(&proxy->io_bar, + ®ion->mr); +} + +static void virtio_pci_pre_plugged(DeviceState *d, Error **errp) +{ + VirtIOPCIProxy *proxy = VIRTIO_PCI(d); + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + + if (virtio_pci_modern(proxy)) { + virtio_add_feature(&vdev->host_features, VIRTIO_F_VERSION_1); + } + + virtio_add_feature(&vdev->host_features, VIRTIO_F_BAD_FEATURE); +} + +/* This is called by virtio-bus just after the device is plugged. */ +static void virtio_pci_device_plugged(DeviceState *d, Error **errp) +{ + VirtIOPCIProxy *proxy = VIRTIO_PCI(d); + VirtioBusState *bus = &proxy->bus; + bool legacy = virtio_pci_legacy(proxy); + bool modern; + bool modern_pio = proxy->flags & VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY; + uint8_t *config; + uint32_t size; + VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); + + /* + * Virtio capabilities present without + * VIRTIO_F_VERSION_1 confuses guests + */ + if (!proxy->ignore_backend_features && + !virtio_has_feature(vdev->host_features, VIRTIO_F_VERSION_1)) { + virtio_pci_disable_modern(proxy); + + if (!legacy) { + error_setg(errp, "Device doesn't support modern mode, and legacy" + " mode is disabled"); + error_append_hint(errp, "Set disable-legacy to off\n"); + + return; + } + } + + modern = virtio_pci_modern(proxy); + + config = proxy->pci_dev.config; + if (proxy->class_code) { + pci_config_set_class(config, proxy->class_code); + } + + if (legacy) { + if (!virtio_legacy_allowed(vdev)) { + /* + * To avoid migration issues, we allow legacy mode when legacy + * check is disabled in the old machine types (< 5.1). + */ + if (virtio_legacy_check_disabled(vdev)) { + warn_report("device is modern-only, but for backward " + "compatibility legacy is allowed"); + } else { + error_setg(errp, + "device is modern-only, use disable-legacy=on"); + return; + } + } + if (virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM)) { + error_setg(errp, "VIRTIO_F_IOMMU_PLATFORM was supported by" + " neither legacy nor transitional device"); + return; + } + /* + * Legacy and transitional devices use specific subsystem IDs. + * Note that the subsystem vendor ID (config + PCI_SUBSYSTEM_VENDOR_ID) + * is set to PCI_SUBVENDOR_ID_REDHAT_QUMRANET by default. + */ + pci_set_word(config + PCI_SUBSYSTEM_ID, virtio_bus_get_vdev_id(bus)); + } else { + /* pure virtio-1.0 */ + pci_set_word(config + PCI_VENDOR_ID, + PCI_VENDOR_ID_REDHAT_QUMRANET); + pci_set_word(config + PCI_DEVICE_ID, + PCI_DEVICE_ID_VIRTIO_10_BASE + virtio_bus_get_vdev_id(bus)); + pci_config_set_revision(config, 1); + } + config[PCI_INTERRUPT_PIN] = 1; + + + if (modern) { + struct virtio_pci_cap cap = { + .cap_len = sizeof cap, + }; + struct virtio_pci_notify_cap notify = { + .cap.cap_len = sizeof notify, + .notify_off_multiplier = + cpu_to_le32(virtio_pci_queue_mem_mult(proxy)), + }; + struct virtio_pci_cfg_cap cfg = { + .cap.cap_len = sizeof cfg, + .cap.cfg_type = VIRTIO_PCI_CAP_PCI_CFG, + }; + struct virtio_pci_notify_cap notify_pio = { + .cap.cap_len = sizeof notify, + .notify_off_multiplier = cpu_to_le32(0x0), + }; + + struct virtio_pci_cfg_cap *cfg_mask; + + virtio_pci_modern_regions_init(proxy, vdev->name); + + virtio_pci_modern_mem_region_map(proxy, &proxy->common, &cap); + virtio_pci_modern_mem_region_map(proxy, &proxy->isr, &cap); + virtio_pci_modern_mem_region_map(proxy, &proxy->device, &cap); + virtio_pci_modern_mem_region_map(proxy, &proxy->notify, ¬ify.cap); + + if (modern_pio) { + memory_region_init(&proxy->io_bar, OBJECT(proxy), + "virtio-pci-io", 0x4); + + pci_register_bar(&proxy->pci_dev, proxy->modern_io_bar_idx, + PCI_BASE_ADDRESS_SPACE_IO, &proxy->io_bar); + + virtio_pci_modern_io_region_map(proxy, &proxy->notify_pio, + ¬ify_pio.cap); + } + + pci_register_bar(&proxy->pci_dev, proxy->modern_mem_bar_idx, + PCI_BASE_ADDRESS_SPACE_MEMORY | + PCI_BASE_ADDRESS_MEM_PREFETCH | + PCI_BASE_ADDRESS_MEM_TYPE_64, + &proxy->modern_bar); + + proxy->config_cap = virtio_pci_add_mem_cap(proxy, &cfg.cap); + cfg_mask = (void *)(proxy->pci_dev.wmask + proxy->config_cap); + pci_set_byte(&cfg_mask->cap.bar, ~0x0); + pci_set_long((uint8_t *)&cfg_mask->cap.offset, ~0x0); + pci_set_long((uint8_t *)&cfg_mask->cap.length, ~0x0); + pci_set_long(cfg_mask->pci_cfg_data, ~0x0); + } + + if (proxy->nvectors) { + int err = msix_init_exclusive_bar(&proxy->pci_dev, proxy->nvectors, + proxy->msix_bar_idx, NULL); + if (err) { + /* Notice when a system that supports MSIx can't initialize it */ + if (err != -ENOTSUP) { + warn_report("unable to init msix vectors to %" PRIu32, + proxy->nvectors); + } + proxy->nvectors = 0; + } + } + + proxy->pci_dev.config_write = virtio_write_config; + proxy->pci_dev.config_read = virtio_read_config; + + if (legacy) { + size = VIRTIO_PCI_REGION_SIZE(&proxy->pci_dev) + + virtio_bus_get_vdev_config_len(bus); + size = pow2ceil(size); + + memory_region_init_io(&proxy->bar, OBJECT(proxy), + &virtio_pci_config_ops, + proxy, "virtio-pci", size); + + pci_register_bar(&proxy->pci_dev, proxy->legacy_io_bar_idx, + PCI_BASE_ADDRESS_SPACE_IO, &proxy->bar); + } +} + +static void virtio_pci_device_unplugged(DeviceState *d) +{ + VirtIOPCIProxy *proxy = VIRTIO_PCI(d); + bool modern = virtio_pci_modern(proxy); + bool modern_pio = proxy->flags & VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY; + + virtio_pci_stop_ioeventfd(proxy); + + if (modern) { + virtio_pci_modern_mem_region_unmap(proxy, &proxy->common); + virtio_pci_modern_mem_region_unmap(proxy, &proxy->isr); + virtio_pci_modern_mem_region_unmap(proxy, &proxy->device); + virtio_pci_modern_mem_region_unmap(proxy, &proxy->notify); + if (modern_pio) { + virtio_pci_modern_io_region_unmap(proxy, &proxy->notify_pio); + } + } +} + +static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) +{ + VirtIOPCIProxy *proxy = VIRTIO_PCI(pci_dev); + VirtioPCIClass *k = VIRTIO_PCI_GET_CLASS(pci_dev); + bool pcie_port = pci_bus_is_express(pci_get_bus(pci_dev)) && + !pci_bus_is_root(pci_get_bus(pci_dev)); + + if (kvm_enabled() && !kvm_has_many_ioeventfds()) { + proxy->flags &= ~VIRTIO_PCI_FLAG_USE_IOEVENTFD; + } + + /* fd-based ioevents can't be synchronized in record/replay */ + if (replay_mode != REPLAY_MODE_NONE) { + proxy->flags &= ~VIRTIO_PCI_FLAG_USE_IOEVENTFD; + } + + /* + * virtio pci bar layout used by default. + * subclasses can re-arrange things if needed. + * + * region 0 -- virtio legacy io bar + * region 1 -- msi-x bar + * region 2 -- virtio modern io bar (off by default) + * region 4+5 -- virtio modern memory (64bit) bar + * + */ + proxy->legacy_io_bar_idx = 0; + proxy->msix_bar_idx = 1; + proxy->modern_io_bar_idx = 2; + proxy->modern_mem_bar_idx = 4; + + proxy->common.offset = 0x0; + proxy->common.size = 0x1000; + proxy->common.type = VIRTIO_PCI_CAP_COMMON_CFG; + + proxy->isr.offset = 0x1000; + proxy->isr.size = 0x1000; + proxy->isr.type = VIRTIO_PCI_CAP_ISR_CFG; + + proxy->device.offset = 0x2000; + proxy->device.size = 0x1000; + proxy->device.type = VIRTIO_PCI_CAP_DEVICE_CFG; + + proxy->notify.offset = 0x3000; + proxy->notify.size = virtio_pci_queue_mem_mult(proxy) * VIRTIO_QUEUE_MAX; + proxy->notify.type = VIRTIO_PCI_CAP_NOTIFY_CFG; + + proxy->notify_pio.offset = 0x0; + proxy->notify_pio.size = 0x4; + proxy->notify_pio.type = VIRTIO_PCI_CAP_NOTIFY_CFG; + + /* subclasses can enforce modern, so do this unconditionally */ + memory_region_init(&proxy->modern_bar, OBJECT(proxy), "virtio-pci", + /* PCI BAR regions must be powers of 2 */ + pow2ceil(proxy->notify.offset + proxy->notify.size)); + + if (proxy->disable_legacy == ON_OFF_AUTO_AUTO) { + proxy->disable_legacy = pcie_port ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; + } + + if (!virtio_pci_modern(proxy) && !virtio_pci_legacy(proxy)) { + error_setg(errp, "device cannot work as neither modern nor legacy mode" + " is enabled"); + error_append_hint(errp, "Set either disable-modern or disable-legacy" + " to off\n"); + return; + } + + if (pcie_port && pci_is_express(pci_dev)) { + int pos; + uint16_t last_pcie_cap_offset = PCI_CONFIG_SPACE_SIZE; + + pos = pcie_endpoint_cap_init(pci_dev, 0); + assert(pos > 0); + + pos = pci_add_capability(pci_dev, PCI_CAP_ID_PM, 0, + PCI_PM_SIZEOF, errp); + if (pos < 0) { + return; + } + + pci_dev->exp.pm_cap = pos; + + /* + * Indicates that this function complies with revision 1.2 of the + * PCI Power Management Interface Specification. + */ + pci_set_word(pci_dev->config + pos + PCI_PM_PMC, 0x3); + + if (proxy->flags & VIRTIO_PCI_FLAG_AER) { + pcie_aer_init(pci_dev, PCI_ERR_VER, last_pcie_cap_offset, + PCI_ERR_SIZEOF, NULL); + last_pcie_cap_offset += PCI_ERR_SIZEOF; + } + + if (proxy->flags & VIRTIO_PCI_FLAG_INIT_DEVERR) { + /* Init error enabling flags */ + pcie_cap_deverr_init(pci_dev); + } + + if (proxy->flags & VIRTIO_PCI_FLAG_INIT_LNKCTL) { + /* Init Link Control Register */ + pcie_cap_lnkctl_init(pci_dev); + } + + if (proxy->flags & VIRTIO_PCI_FLAG_INIT_PM) { + /* Init Power Management Control Register */ + pci_set_word(pci_dev->wmask + pos + PCI_PM_CTRL, + PCI_PM_CTRL_STATE_MASK); + } + + if (proxy->flags & VIRTIO_PCI_FLAG_ATS) { + pcie_ats_init(pci_dev, last_pcie_cap_offset, + proxy->flags & VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED); + last_pcie_cap_offset += PCI_EXT_CAP_ATS_SIZEOF; + } + + if (proxy->flags & VIRTIO_PCI_FLAG_INIT_FLR) { + /* Set Function Level Reset capability bit */ + pcie_cap_flr_init(pci_dev); + } + } else { + /* + * make future invocations of pci_is_express() return false + * and pci_config_size() return PCI_CONFIG_SPACE_SIZE. + */ + pci_dev->cap_present &= ~QEMU_PCI_CAP_EXPRESS; + } + + virtio_pci_bus_new(&proxy->bus, sizeof(proxy->bus), proxy); + if (k->realize) { + k->realize(proxy, errp); + } +} + +static void virtio_pci_exit(PCIDevice *pci_dev) +{ + VirtIOPCIProxy *proxy = VIRTIO_PCI(pci_dev); + bool pcie_port = pci_bus_is_express(pci_get_bus(pci_dev)) && + !pci_bus_is_root(pci_get_bus(pci_dev)); + + msix_uninit_exclusive_bar(pci_dev); + if (proxy->flags & VIRTIO_PCI_FLAG_AER && pcie_port && + pci_is_express(pci_dev)) { + pcie_aer_exit(pci_dev); + } +} + +static void virtio_pci_reset(DeviceState *qdev) +{ + VirtIOPCIProxy *proxy = VIRTIO_PCI(qdev); + VirtioBusState *bus = VIRTIO_BUS(&proxy->bus); + int i; + + virtio_bus_reset(bus); + msix_unuse_all_vectors(&proxy->pci_dev); + + for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { + proxy->vqs[i].enabled = 0; + proxy->vqs[i].reset = 0; + proxy->vqs[i].num = 0; + proxy->vqs[i].desc[0] = proxy->vqs[i].desc[1] = 0; + proxy->vqs[i].avail[0] = proxy->vqs[i].avail[1] = 0; + proxy->vqs[i].used[0] = proxy->vqs[i].used[1] = 0; + } +} + +static void virtio_pci_bus_reset(DeviceState *qdev) +{ + PCIDevice *dev = PCI_DEVICE(qdev); + + virtio_pci_reset(qdev); + + if (pci_is_express(dev)) { + pcie_cap_deverr_reset(dev); + pcie_cap_lnkctl_reset(dev); + + pci_set_word(dev->config + dev->exp.pm_cap + PCI_PM_CTRL, 0); + } +} + +static Property virtio_pci_properties[] = { + DEFINE_PROP_BIT("virtio-pci-bus-master-bug-migration", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_BUS_MASTER_BUG_MIGRATION_BIT, false), + DEFINE_PROP_BIT("migrate-extra", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_MIGRATE_EXTRA_BIT, true), + DEFINE_PROP_BIT("modern-pio-notify", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY_BIT, false), + DEFINE_PROP_BIT("x-disable-pcie", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_DISABLE_PCIE_BIT, false), + DEFINE_PROP_BIT("page-per-vq", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_PAGE_PER_VQ_BIT, false), + DEFINE_PROP_BOOL("x-ignore-backend-features", VirtIOPCIProxy, + ignore_backend_features, false), + DEFINE_PROP_BIT("ats", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_ATS_BIT, false), + DEFINE_PROP_BIT("x-ats-page-aligned", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED_BIT, true), + DEFINE_PROP_BIT("x-pcie-deverr-init", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_INIT_DEVERR_BIT, true), + DEFINE_PROP_BIT("x-pcie-lnkctl-init", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_INIT_LNKCTL_BIT, true), + DEFINE_PROP_BIT("x-pcie-pm-init", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_INIT_PM_BIT, true), + DEFINE_PROP_BIT("x-pcie-flr-init", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_INIT_FLR_BIT, true), + DEFINE_PROP_BIT("aer", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_AER_BIT, false), + DEFINE_PROP_END_OF_LIST(), +}; + +static void virtio_pci_dc_realize(DeviceState *qdev, Error **errp) +{ + VirtioPCIClass *vpciklass = VIRTIO_PCI_GET_CLASS(qdev); + VirtIOPCIProxy *proxy = VIRTIO_PCI(qdev); + PCIDevice *pci_dev = &proxy->pci_dev; + + if (!(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_PCIE) && + virtio_pci_modern(proxy)) { + pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS; + } + + vpciklass->parent_dc_realize(qdev, errp); +} + +static void virtio_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); + VirtioPCIClass *vpciklass = VIRTIO_PCI_CLASS(klass); + + device_class_set_props(dc, virtio_pci_properties); + k->realize = virtio_pci_realize; + k->exit = virtio_pci_exit; + k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; + k->revision = VIRTIO_PCI_ABI_VERSION; + k->class_id = PCI_CLASS_OTHERS; + device_class_set_parent_realize(dc, virtio_pci_dc_realize, + &vpciklass->parent_dc_realize); + dc->reset = virtio_pci_bus_reset; +} + +static const TypeInfo virtio_pci_info = { + .name = TYPE_VIRTIO_PCI, + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(VirtIOPCIProxy), + .class_init = virtio_pci_class_init, + .class_size = sizeof(VirtioPCIClass), + .abstract = true, +}; + +static Property virtio_pci_generic_properties[] = { + DEFINE_PROP_ON_OFF_AUTO("disable-legacy", VirtIOPCIProxy, disable_legacy, + ON_OFF_AUTO_AUTO), + DEFINE_PROP_BOOL("disable-modern", VirtIOPCIProxy, disable_modern, false), + DEFINE_PROP_END_OF_LIST(), +}; + +static void virtio_pci_base_class_init(ObjectClass *klass, void *data) +{ + const VirtioPCIDeviceTypeInfo *t = data; + if (t->class_init) { + t->class_init(klass, NULL); + } +} + +static void virtio_pci_generic_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + device_class_set_props(dc, virtio_pci_generic_properties); +} + +static void virtio_pci_transitional_instance_init(Object *obj) +{ + VirtIOPCIProxy *proxy = VIRTIO_PCI(obj); + + proxy->disable_legacy = ON_OFF_AUTO_OFF; + proxy->disable_modern = false; +} + +static void virtio_pci_non_transitional_instance_init(Object *obj) +{ + VirtIOPCIProxy *proxy = VIRTIO_PCI(obj); + + proxy->disable_legacy = ON_OFF_AUTO_ON; + proxy->disable_modern = false; +} + +void virtio_pci_types_register(const VirtioPCIDeviceTypeInfo *t) +{ + char *base_name = NULL; + TypeInfo base_type_info = { + .name = t->base_name, + .parent = t->parent ? t->parent : TYPE_VIRTIO_PCI, + .instance_size = t->instance_size, + .instance_init = t->instance_init, + .class_size = t->class_size, + .abstract = true, + .interfaces = t->interfaces, + }; + TypeInfo generic_type_info = { + .name = t->generic_name, + .parent = base_type_info.name, + .class_init = virtio_pci_generic_class_init, + .interfaces = (InterfaceInfo[]) { + { INTERFACE_PCIE_DEVICE }, + { INTERFACE_CONVENTIONAL_PCI_DEVICE }, + { } + }, + }; + + if (!base_type_info.name) { + /* No base type -> register a single generic device type */ + /* use intermediate %s-base-type to add generic device props */ + base_name = g_strdup_printf("%s-base-type", t->generic_name); + base_type_info.name = base_name; + base_type_info.class_init = virtio_pci_generic_class_init; + + generic_type_info.parent = base_name; + generic_type_info.class_init = virtio_pci_base_class_init; + generic_type_info.class_data = (void *)t; + + assert(!t->non_transitional_name); + assert(!t->transitional_name); + } else { + base_type_info.class_init = virtio_pci_base_class_init; + base_type_info.class_data = (void *)t; + } + + type_register(&base_type_info); + if (generic_type_info.name) { + type_register(&generic_type_info); + } + + if (t->non_transitional_name) { + const TypeInfo non_transitional_type_info = { + .name = t->non_transitional_name, + .parent = base_type_info.name, + .instance_init = virtio_pci_non_transitional_instance_init, + .interfaces = (InterfaceInfo[]) { + { INTERFACE_PCIE_DEVICE }, + { INTERFACE_CONVENTIONAL_PCI_DEVICE }, + { } + }, + }; + type_register(&non_transitional_type_info); + } + + if (t->transitional_name) { + const TypeInfo transitional_type_info = { + .name = t->transitional_name, + .parent = base_type_info.name, + .instance_init = virtio_pci_transitional_instance_init, + .interfaces = (InterfaceInfo[]) { + /* + * Transitional virtio devices work only as Conventional PCI + * devices because they require PIO ports. + */ + { INTERFACE_CONVENTIONAL_PCI_DEVICE }, + { } + }, + }; + type_register(&transitional_type_info); + } + g_free(base_name); +} + +unsigned virtio_pci_optimal_num_queues(unsigned fixed_queues) +{ + /* + * 1:1 vq to vCPU mapping is ideal because the same vCPU that submitted + * virtqueue buffers can handle their completion. When a different vCPU + * handles completion it may need to IPI the vCPU that submitted the + * request and this adds overhead. + * + * Virtqueues consume guest RAM and MSI-X vectors. This is wasteful in + * guests with very many vCPUs and a device that is only used by a few + * vCPUs. Unfortunately optimizing that case requires manual pinning inside + * the guest, so those users might as well manually set the number of + * queues. There is no upper limit that can be applied automatically and + * doing so arbitrarily would result in a sudden performance drop once the + * threshold number of vCPUs is exceeded. + */ + unsigned num_queues = current_machine->smp.cpus; + + /* + * The maximum number of MSI-X vectors is PCI_MSIX_FLAGS_QSIZE + 1, but the + * config change interrupt and the fixed virtqueues must be taken into + * account too. + */ + num_queues = MIN(num_queues, PCI_MSIX_FLAGS_QSIZE - fixed_queues); + + /* + * There is a limit to how many virtqueues a device can have. + */ + return MIN(num_queues, VIRTIO_QUEUE_MAX - fixed_queues); +} + +/* virtio-pci-bus */ + +static void virtio_pci_bus_new(VirtioBusState *bus, size_t bus_size, + VirtIOPCIProxy *dev) +{ + DeviceState *qdev = DEVICE(dev); + char virtio_bus_name[] = "virtio-bus"; + + qbus_init(bus, bus_size, TYPE_VIRTIO_PCI_BUS, qdev, virtio_bus_name); +} + +static void virtio_pci_bus_class_init(ObjectClass *klass, void *data) +{ + BusClass *bus_class = BUS_CLASS(klass); + VirtioBusClass *k = VIRTIO_BUS_CLASS(klass); + bus_class->max_dev = 1; + k->notify = virtio_pci_notify; + k->save_config = virtio_pci_save_config; + k->load_config = virtio_pci_load_config; + k->save_queue = virtio_pci_save_queue; + k->load_queue = virtio_pci_load_queue; + k->save_extra_state = virtio_pci_save_extra_state; + k->load_extra_state = virtio_pci_load_extra_state; + k->has_extra_state = virtio_pci_has_extra_state; + k->query_guest_notifiers = virtio_pci_query_guest_notifiers; + k->set_guest_notifiers = virtio_pci_set_guest_notifiers; + k->set_host_notifier_mr = virtio_pci_set_host_notifier_mr; + k->vmstate_change = virtio_pci_vmstate_change; + k->pre_plugged = virtio_pci_pre_plugged; + k->device_plugged = virtio_pci_device_plugged; + k->device_unplugged = virtio_pci_device_unplugged; + k->query_nvectors = virtio_pci_query_nvectors; + k->ioeventfd_enabled = virtio_pci_ioeventfd_enabled; + k->ioeventfd_assign = virtio_pci_ioeventfd_assign; + k->get_dma_as = virtio_pci_get_dma_as; + k->iommu_enabled = virtio_pci_iommu_enabled; + k->queue_enabled = virtio_pci_queue_enabled; +} + +static const TypeInfo virtio_pci_bus_info = { + .name = TYPE_VIRTIO_PCI_BUS, + .parent = TYPE_VIRTIO_BUS, + .instance_size = sizeof(VirtioPCIBusState), + .class_size = sizeof(VirtioPCIBusClass), + .class_init = virtio_pci_bus_class_init, +}; + +static void virtio_pci_register_types(void) +{ + /* Base types: */ + type_register_static(&virtio_pci_bus_info); + type_register_static(&virtio_pci_info); +} + +type_init(virtio_pci_register_types) + diff --git a/hw/virtio/virtio-pmem-pci.c b/hw/virtio/virtio-pmem-pci.c new file mode 100644 index 00000000..7d9f4ec1 --- /dev/null +++ b/hw/virtio/virtio-pmem-pci.c @@ -0,0 +1,127 @@ +/* + * Virtio PMEM PCI device + * + * Copyright (C) 2018-2019 Red Hat, Inc. + * + * Authors: + * Pankaj Gupta <pagupta@redhat.com> + * David Hildenbrand <david@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" + +#include "virtio-pmem-pci.h" +#include "hw/mem/memory-device.h" +#include "qapi/error.h" + +static void virtio_pmem_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VirtIOPMEMPCI *pmem_pci = VIRTIO_PMEM_PCI(vpci_dev); + DeviceState *vdev = DEVICE(&pmem_pci->vdev); + + virtio_pci_force_virtio_1(vpci_dev); + qdev_realize(vdev, BUS(&vpci_dev->bus), errp); +} + +static void virtio_pmem_pci_set_addr(MemoryDeviceState *md, uint64_t addr, + Error **errp) +{ + object_property_set_uint(OBJECT(md), VIRTIO_PMEM_ADDR_PROP, addr, errp); +} + +static uint64_t virtio_pmem_pci_get_addr(const MemoryDeviceState *md) +{ + return object_property_get_uint(OBJECT(md), VIRTIO_PMEM_ADDR_PROP, + &error_abort); +} + +static MemoryRegion *virtio_pmem_pci_get_memory_region(MemoryDeviceState *md, + Error **errp) +{ + VirtIOPMEMPCI *pci_pmem = VIRTIO_PMEM_PCI(md); + VirtIOPMEM *pmem = VIRTIO_PMEM(&pci_pmem->vdev); + VirtIOPMEMClass *vpc = VIRTIO_PMEM_GET_CLASS(pmem); + + return vpc->get_memory_region(pmem, errp); +} + +static uint64_t virtio_pmem_pci_get_plugged_size(const MemoryDeviceState *md, + Error **errp) +{ + VirtIOPMEMPCI *pci_pmem = VIRTIO_PMEM_PCI(md); + VirtIOPMEM *pmem = VIRTIO_PMEM(&pci_pmem->vdev); + VirtIOPMEMClass *vpc = VIRTIO_PMEM_GET_CLASS(pmem); + MemoryRegion *mr = vpc->get_memory_region(pmem, errp); + + /* the plugged size corresponds to the region size */ + return mr ? memory_region_size(mr) : 0; +} + +static void virtio_pmem_pci_fill_device_info(const MemoryDeviceState *md, + MemoryDeviceInfo *info) +{ + VirtioPMEMDeviceInfo *vi = g_new0(VirtioPMEMDeviceInfo, 1); + VirtIOPMEMPCI *pci_pmem = VIRTIO_PMEM_PCI(md); + VirtIOPMEM *pmem = VIRTIO_PMEM(&pci_pmem->vdev); + VirtIOPMEMClass *vpc = VIRTIO_PMEM_GET_CLASS(pmem); + DeviceState *dev = DEVICE(md); + + if (dev->id) { + vi->has_id = true; + vi->id = g_strdup(dev->id); + } + + /* let the real device handle everything else */ + vpc->fill_device_info(pmem, vi); + + info->u.virtio_pmem.data = vi; + info->type = MEMORY_DEVICE_INFO_KIND_VIRTIO_PMEM; +} + +static void virtio_pmem_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(klass); + + k->realize = virtio_pmem_pci_realize; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + pcidev_k->revision = VIRTIO_PCI_ABI_VERSION; + pcidev_k->class_id = PCI_CLASS_OTHERS; + + mdc->get_addr = virtio_pmem_pci_get_addr; + mdc->set_addr = virtio_pmem_pci_set_addr; + mdc->get_plugged_size = virtio_pmem_pci_get_plugged_size; + mdc->get_memory_region = virtio_pmem_pci_get_memory_region; + mdc->fill_device_info = virtio_pmem_pci_fill_device_info; +} + +static void virtio_pmem_pci_instance_init(Object *obj) +{ + VirtIOPMEMPCI *dev = VIRTIO_PMEM_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VIRTIO_PMEM); +} + +static const VirtioPCIDeviceTypeInfo virtio_pmem_pci_info = { + .base_name = TYPE_VIRTIO_PMEM_PCI, + .generic_name = "virtio-pmem-pci", + .instance_size = sizeof(VirtIOPMEMPCI), + .instance_init = virtio_pmem_pci_instance_init, + .class_init = virtio_pmem_pci_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_MEMORY_DEVICE }, + { } + }, +}; + +static void virtio_pmem_pci_register_types(void) +{ + virtio_pci_types_register(&virtio_pmem_pci_info); +} +type_init(virtio_pmem_pci_register_types) diff --git a/hw/virtio/virtio-pmem-pci.h b/hw/virtio/virtio-pmem-pci.h new file mode 100644 index 00000000..63cfe727 --- /dev/null +++ b/hw/virtio/virtio-pmem-pci.h @@ -0,0 +1,35 @@ +/* + * Virtio PMEM PCI device + * + * Copyright (C) 2018-2019 Red Hat, Inc. + * + * Authors: + * Pankaj Gupta <pagupta@redhat.com> + * David Hildenbrand <david@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. + */ + +#ifndef QEMU_VIRTIO_PMEM_PCI_H +#define QEMU_VIRTIO_PMEM_PCI_H + +#include "hw/virtio/virtio-pci.h" +#include "hw/virtio/virtio-pmem.h" +#include "qom/object.h" + +typedef struct VirtIOPMEMPCI VirtIOPMEMPCI; + +/* + * virtio-pmem-pci: This extends VirtioPCIProxy. + */ +#define TYPE_VIRTIO_PMEM_PCI "virtio-pmem-pci-base" +DECLARE_INSTANCE_CHECKER(VirtIOPMEMPCI, VIRTIO_PMEM_PCI, + TYPE_VIRTIO_PMEM_PCI) + +struct VirtIOPMEMPCI { + VirtIOPCIProxy parent_obj; + VirtIOPMEM vdev; +}; + +#endif /* QEMU_VIRTIO_PMEM_PCI_H */ diff --git a/hw/virtio/virtio-pmem.c b/hw/virtio/virtio-pmem.c new file mode 100644 index 00000000..a1abfe0e --- /dev/null +++ b/hw/virtio/virtio-pmem.c @@ -0,0 +1,196 @@ +/* + * Virtio PMEM device + * + * Copyright (C) 2018-2019 Red Hat, Inc. + * + * Authors: + * Pankaj Gupta <pagupta@redhat.com> + * David Hildenbrand <david@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qemu/main-loop.h" +#include "hw/virtio/virtio-pmem.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/virtio-access.h" +#include "standard-headers/linux/virtio_ids.h" +#include "standard-headers/linux/virtio_pmem.h" +#include "sysemu/hostmem.h" +#include "block/aio.h" +#include "block/thread-pool.h" +#include "trace.h" + +typedef struct VirtIODeviceRequest { + VirtQueueElement elem; + int fd; + VirtIOPMEM *pmem; + VirtIODevice *vdev; + struct virtio_pmem_req req; + struct virtio_pmem_resp resp; +} VirtIODeviceRequest; + +static int worker_cb(void *opaque) +{ + VirtIODeviceRequest *req_data = opaque; + int err = 0; + + /* flush raw backing image */ + err = fsync(req_data->fd); + trace_virtio_pmem_flush_done(err); + if (err != 0) { + err = 1; + } + + virtio_stl_p(req_data->vdev, &req_data->resp.ret, err); + + return 0; +} + +static void done_cb(void *opaque, int ret) +{ + VirtIODeviceRequest *req_data = opaque; + int len = iov_from_buf(req_data->elem.in_sg, req_data->elem.in_num, 0, + &req_data->resp, sizeof(struct virtio_pmem_resp)); + + /* Callbacks are serialized, so no need to use atomic ops. */ + virtqueue_push(req_data->pmem->rq_vq, &req_data->elem, len); + virtio_notify((VirtIODevice *)req_data->pmem, req_data->pmem->rq_vq); + trace_virtio_pmem_response(); + g_free(req_data); +} + +static void virtio_pmem_flush(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIODeviceRequest *req_data; + VirtIOPMEM *pmem = VIRTIO_PMEM(vdev); + HostMemoryBackend *backend = MEMORY_BACKEND(pmem->memdev); + ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context()); + + trace_virtio_pmem_flush_request(); + req_data = virtqueue_pop(vq, sizeof(VirtIODeviceRequest)); + if (!req_data) { + virtio_error(vdev, "virtio-pmem missing request data"); + return; + } + + if (req_data->elem.out_num < 1 || req_data->elem.in_num < 1) { + virtio_error(vdev, "virtio-pmem request not proper"); + virtqueue_detach_element(vq, (VirtQueueElement *)req_data, 0); + g_free(req_data); + return; + } + req_data->fd = memory_region_get_fd(&backend->mr); + req_data->pmem = pmem; + req_data->vdev = vdev; + thread_pool_submit_aio(pool, worker_cb, req_data, done_cb, req_data); +} + +static void virtio_pmem_get_config(VirtIODevice *vdev, uint8_t *config) +{ + VirtIOPMEM *pmem = VIRTIO_PMEM(vdev); + struct virtio_pmem_config *pmemcfg = (struct virtio_pmem_config *) config; + + virtio_stq_p(vdev, &pmemcfg->start, pmem->start); + virtio_stq_p(vdev, &pmemcfg->size, memory_region_size(&pmem->memdev->mr)); +} + +static uint64_t virtio_pmem_get_features(VirtIODevice *vdev, uint64_t features, + Error **errp) +{ + return features; +} + +static void virtio_pmem_realize(DeviceState *dev, Error **errp) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VirtIOPMEM *pmem = VIRTIO_PMEM(dev); + + if (!pmem->memdev) { + error_setg(errp, "virtio-pmem memdev not set"); + return; + } + + if (host_memory_backend_is_mapped(pmem->memdev)) { + error_setg(errp, "can't use already busy memdev: %s", + object_get_canonical_path_component(OBJECT(pmem->memdev))); + return; + } + + host_memory_backend_set_mapped(pmem->memdev, true); + virtio_init(vdev, VIRTIO_ID_PMEM, sizeof(struct virtio_pmem_config)); + pmem->rq_vq = virtio_add_queue(vdev, 128, virtio_pmem_flush); +} + +static void virtio_pmem_unrealize(DeviceState *dev) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VirtIOPMEM *pmem = VIRTIO_PMEM(dev); + + host_memory_backend_set_mapped(pmem->memdev, false); + virtio_delete_queue(pmem->rq_vq); + virtio_cleanup(vdev); +} + +static void virtio_pmem_fill_device_info(const VirtIOPMEM *pmem, + VirtioPMEMDeviceInfo *vi) +{ + vi->memaddr = pmem->start; + vi->size = memory_region_size(&pmem->memdev->mr); + vi->memdev = object_get_canonical_path(OBJECT(pmem->memdev)); +} + +static MemoryRegion *virtio_pmem_get_memory_region(VirtIOPMEM *pmem, + Error **errp) +{ + if (!pmem->memdev) { + error_setg(errp, "'%s' property must be set", VIRTIO_PMEM_MEMDEV_PROP); + return NULL; + } + + return &pmem->memdev->mr; +} + +static Property virtio_pmem_properties[] = { + DEFINE_PROP_UINT64(VIRTIO_PMEM_ADDR_PROP, VirtIOPMEM, start, 0), + DEFINE_PROP_LINK(VIRTIO_PMEM_MEMDEV_PROP, VirtIOPMEM, memdev, + TYPE_MEMORY_BACKEND, HostMemoryBackend *), + DEFINE_PROP_END_OF_LIST(), +}; + +static void virtio_pmem_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + VirtIOPMEMClass *vpc = VIRTIO_PMEM_CLASS(klass); + + device_class_set_props(dc, virtio_pmem_properties); + + vdc->realize = virtio_pmem_realize; + vdc->unrealize = virtio_pmem_unrealize; + vdc->get_config = virtio_pmem_get_config; + vdc->get_features = virtio_pmem_get_features; + + vpc->fill_device_info = virtio_pmem_fill_device_info; + vpc->get_memory_region = virtio_pmem_get_memory_region; + set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); +} + +static const TypeInfo virtio_pmem_info = { + .name = TYPE_VIRTIO_PMEM, + .parent = TYPE_VIRTIO_DEVICE, + .class_size = sizeof(VirtIOPMEMClass), + .class_init = virtio_pmem_class_init, + .instance_size = sizeof(VirtIOPMEM), +}; + +static void virtio_register_types(void) +{ + type_register_static(&virtio_pmem_info); +} + +type_init(virtio_register_types) diff --git a/hw/virtio/virtio-rng-pci.c b/hw/virtio/virtio-rng-pci.c new file mode 100644 index 00000000..6e76f8b5 --- /dev/null +++ b/hw/virtio/virtio-rng-pci.c @@ -0,0 +1,96 @@ +/* + * Virtio rng PCI Bindings + * + * Copyright 2012 Red Hat, Inc. + * Copyright 2012 Amit Shah <amit.shah@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. See the COPYING file in the + * top-level directory. + */ + +#include "qemu/osdep.h" + +#include "hw/virtio/virtio-pci.h" +#include "hw/virtio/virtio-rng.h" +#include "hw/qdev-properties.h" +#include "qapi/error.h" +#include "qemu/module.h" +#include "qom/object.h" + +typedef struct VirtIORngPCI VirtIORngPCI; + +/* + * virtio-rng-pci: This extends VirtioPCIProxy. + */ +#define TYPE_VIRTIO_RNG_PCI "virtio-rng-pci-base" +DECLARE_INSTANCE_CHECKER(VirtIORngPCI, VIRTIO_RNG_PCI, + TYPE_VIRTIO_RNG_PCI) + +struct VirtIORngPCI { + VirtIOPCIProxy parent_obj; + VirtIORNG vdev; +}; + +static Property virtio_rng_properties[] = { + DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true), + DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, + DEV_NVECTORS_UNSPECIFIED), + DEFINE_PROP_END_OF_LIST(), +}; + +static void virtio_rng_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VirtIORngPCI *vrng = VIRTIO_RNG_PCI(vpci_dev); + DeviceState *vdev = DEVICE(&vrng->vdev); + + if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) { + vpci_dev->nvectors = 2; + } + + if (!qdev_realize(vdev, BUS(&vpci_dev->bus), errp)) { + return; + } +} + +static void virtio_rng_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + + k->realize = virtio_rng_pci_realize; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + + pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; + pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_RNG; + pcidev_k->revision = VIRTIO_PCI_ABI_VERSION; + pcidev_k->class_id = PCI_CLASS_OTHERS; + device_class_set_props(dc, virtio_rng_properties); +} + +static void virtio_rng_initfn(Object *obj) +{ + VirtIORngPCI *dev = VIRTIO_RNG_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VIRTIO_RNG); +} + +static const VirtioPCIDeviceTypeInfo virtio_rng_pci_info = { + .base_name = TYPE_VIRTIO_RNG_PCI, + .generic_name = "virtio-rng-pci", + .transitional_name = "virtio-rng-pci-transitional", + .non_transitional_name = "virtio-rng-pci-non-transitional", + .instance_size = sizeof(VirtIORngPCI), + .instance_init = virtio_rng_initfn, + .class_init = virtio_rng_pci_class_init, +}; + +static void virtio_rng_pci_register(void) +{ + virtio_pci_types_register(&virtio_rng_pci_info); +} + +type_init(virtio_rng_pci_register) diff --git a/hw/virtio/virtio-rng.c b/hw/virtio/virtio-rng.c new file mode 100644 index 00000000..7e12fc03 --- /dev/null +++ b/hw/virtio/virtio-rng.c @@ -0,0 +1,289 @@ +/* + * A virtio device implementing a hardware random number generator. + * + * Copyright 2012 Red Hat, Inc. + * Copyright 2012 Amit Shah <amit.shah@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. See the COPYING file in the + * top-level directory. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/iov.h" +#include "qemu/module.h" +#include "qemu/timer.h" +#include "hw/virtio/virtio.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/virtio-rng.h" +#include "sysemu/rng.h" +#include "sysemu/runstate.h" +#include "qom/object_interfaces.h" +#include "trace.h" + +static bool is_guest_ready(VirtIORNG *vrng) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(vrng); + if (virtio_queue_ready(vrng->vq) + && (vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) { + return true; + } + trace_virtio_rng_guest_not_ready(vrng); + return false; +} + +static size_t get_request_size(VirtQueue *vq, unsigned quota) +{ + unsigned int in, out; + + virtqueue_get_avail_bytes(vq, &in, &out, quota, 0); + return in; +} + +static void virtio_rng_process(VirtIORNG *vrng); + +/* Send data from a char device over to the guest */ +static void chr_read(void *opaque, const void *buf, size_t size) +{ + VirtIORNG *vrng = opaque; + VirtIODevice *vdev = VIRTIO_DEVICE(vrng); + VirtQueueElement *elem; + size_t len; + int offset; + + if (!is_guest_ready(vrng)) { + return; + } + + /* we can't modify the virtqueue until + * our state is fully synced + */ + + if (!runstate_check(RUN_STATE_RUNNING)) { + trace_virtio_rng_cpu_is_stopped(vrng, size); + return; + } + + vrng->quota_remaining -= size; + + offset = 0; + while (offset < size) { + elem = virtqueue_pop(vrng->vq, sizeof(VirtQueueElement)); + if (!elem) { + break; + } + trace_virtio_rng_popped(vrng); + len = iov_from_buf(elem->in_sg, elem->in_num, + 0, buf + offset, size - offset); + offset += len; + + virtqueue_push(vrng->vq, elem, len); + trace_virtio_rng_pushed(vrng, len); + g_free(elem); + } + virtio_notify(vdev, vrng->vq); + + if (!virtio_queue_empty(vrng->vq)) { + /* If we didn't drain the queue, call virtio_rng_process + * to take care of asking for more data as appropriate. + */ + virtio_rng_process(vrng); + } +} + +static void virtio_rng_process(VirtIORNG *vrng) +{ + size_t size; + unsigned quota; + + if (!is_guest_ready(vrng)) { + return; + } + + if (vrng->activate_timer) { + timer_mod(vrng->rate_limit_timer, + qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vrng->conf.period_ms); + vrng->activate_timer = false; + } + + if (vrng->quota_remaining < 0) { + quota = 0; + } else { + quota = MIN((uint64_t)vrng->quota_remaining, (uint64_t)UINT32_MAX); + } + size = get_request_size(vrng->vq, quota); + + trace_virtio_rng_request(vrng, size, quota); + + size = MIN(vrng->quota_remaining, size); + if (size) { + rng_backend_request_entropy(vrng->rng, size, chr_read, vrng); + } +} + +static void handle_input(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIORNG *vrng = VIRTIO_RNG(vdev); + virtio_rng_process(vrng); +} + +static uint64_t get_features(VirtIODevice *vdev, uint64_t f, Error **errp) +{ + return f; +} + +static void virtio_rng_vm_state_change(void *opaque, bool running, + RunState state) +{ + VirtIORNG *vrng = opaque; + + trace_virtio_rng_vm_state_change(vrng, running, state); + + /* We may have an element ready but couldn't process it due to a quota + * limit or because CPU was stopped. Make sure to try again when the + * CPU restart. + */ + + if (running && is_guest_ready(vrng)) { + virtio_rng_process(vrng); + } +} + +static void check_rate_limit(void *opaque) +{ + VirtIORNG *vrng = opaque; + + vrng->quota_remaining = vrng->conf.max_bytes; + virtio_rng_process(vrng); + vrng->activate_timer = true; +} + +static void virtio_rng_set_status(VirtIODevice *vdev, uint8_t status) +{ + VirtIORNG *vrng = VIRTIO_RNG(vdev); + + if (!vdev->vm_running) { + return; + } + vdev->status = status; + + /* Something changed, try to process buffers */ + virtio_rng_process(vrng); +} + +static void virtio_rng_device_realize(DeviceState *dev, Error **errp) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VirtIORNG *vrng = VIRTIO_RNG(dev); + + if (vrng->conf.period_ms <= 0) { + error_setg(errp, "'period' parameter expects a positive integer"); + return; + } + + /* Workaround: Property parsing does not enforce unsigned integers, + * So this is a hack to reject such numbers. */ + if (vrng->conf.max_bytes > INT64_MAX) { + error_setg(errp, "'max-bytes' parameter must be non-negative, " + "and less than 2^63"); + return; + } + + if (vrng->conf.rng == NULL) { + Object *default_backend = object_new(TYPE_RNG_BUILTIN); + + if (!user_creatable_complete(USER_CREATABLE(default_backend), + errp)) { + object_unref(default_backend); + return; + } + + object_property_add_child(OBJECT(dev), "default-backend", + default_backend); + + /* The child property took a reference, we can safely drop ours now */ + object_unref(default_backend); + + object_property_set_link(OBJECT(dev), "rng", default_backend, + &error_abort); + } + + vrng->rng = vrng->conf.rng; + if (vrng->rng == NULL) { + error_setg(errp, "'rng' parameter expects a valid object"); + return; + } + + virtio_init(vdev, VIRTIO_ID_RNG, 0); + + vrng->vq = virtio_add_queue(vdev, 8, handle_input); + vrng->quota_remaining = vrng->conf.max_bytes; + vrng->rate_limit_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, + check_rate_limit, vrng); + vrng->activate_timer = true; + + vrng->vmstate = qemu_add_vm_change_state_handler(virtio_rng_vm_state_change, + vrng); +} + +static void virtio_rng_device_unrealize(DeviceState *dev) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VirtIORNG *vrng = VIRTIO_RNG(dev); + + qemu_del_vm_change_state_handler(vrng->vmstate); + timer_free(vrng->rate_limit_timer); + virtio_del_queue(vdev, 0); + virtio_cleanup(vdev); +} + +static const VMStateDescription vmstate_virtio_rng = { + .name = "virtio-rng", + .minimum_version_id = 1, + .version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_VIRTIO_DEVICE, + VMSTATE_END_OF_LIST() + }, +}; + +static Property virtio_rng_properties[] = { + /* Set a default rate limit of 2^47 bytes per minute or roughly 2TB/s. If + * you have an entropy source capable of generating more entropy than this + * and you can pass it through via virtio-rng, then hats off to you. Until + * then, this is unlimited for all practical purposes. + */ + DEFINE_PROP_UINT64("max-bytes", VirtIORNG, conf.max_bytes, INT64_MAX), + DEFINE_PROP_UINT32("period", VirtIORNG, conf.period_ms, 1 << 16), + DEFINE_PROP_LINK("rng", VirtIORNG, conf.rng, TYPE_RNG_BACKEND, RngBackend *), + DEFINE_PROP_END_OF_LIST(), +}; + +static void virtio_rng_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + + device_class_set_props(dc, virtio_rng_properties); + dc->vmsd = &vmstate_virtio_rng; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + vdc->realize = virtio_rng_device_realize; + vdc->unrealize = virtio_rng_device_unrealize; + vdc->get_features = get_features; + vdc->set_status = virtio_rng_set_status; +} + +static const TypeInfo virtio_rng_info = { + .name = TYPE_VIRTIO_RNG, + .parent = TYPE_VIRTIO_DEVICE, + .instance_size = sizeof(VirtIORNG), + .class_init = virtio_rng_class_init, +}; + +static void virtio_register_types(void) +{ + type_register_static(&virtio_rng_info); +} + +type_init(virtio_register_types) diff --git a/hw/virtio/virtio-scsi-pci.c b/hw/virtio/virtio-scsi-pci.c new file mode 100644 index 00000000..e8e3442f --- /dev/null +++ b/hw/virtio/virtio-scsi-pci.c @@ -0,0 +1,114 @@ +/* + * Virtio scsi PCI Bindings + * + * Copyright IBM, Corp. 2007 + * Copyright (c) 2009 CodeSourcery + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * Paul Brook <paul@codesourcery.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * (at your option) any later version. See the COPYING file in the + * top-level directory. + */ + +#include "qemu/osdep.h" + +#include "hw/qdev-properties.h" +#include "hw/virtio/virtio-scsi.h" +#include "qemu/module.h" +#include "hw/virtio/virtio-pci.h" +#include "qom/object.h" + +typedef struct VirtIOSCSIPCI VirtIOSCSIPCI; + +/* + * virtio-scsi-pci: This extends VirtioPCIProxy. + */ +#define TYPE_VIRTIO_SCSI_PCI "virtio-scsi-pci-base" +DECLARE_INSTANCE_CHECKER(VirtIOSCSIPCI, VIRTIO_SCSI_PCI, + TYPE_VIRTIO_SCSI_PCI) + +struct VirtIOSCSIPCI { + VirtIOPCIProxy parent_obj; + VirtIOSCSI vdev; +}; + +static Property virtio_scsi_pci_properties[] = { + DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true), + DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, + DEV_NVECTORS_UNSPECIFIED), + DEFINE_PROP_END_OF_LIST(), +}; + +static void virtio_scsi_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VirtIOSCSIPCI *dev = VIRTIO_SCSI_PCI(vpci_dev); + DeviceState *vdev = DEVICE(&dev->vdev); + DeviceState *proxy = DEVICE(vpci_dev); + VirtIOSCSIConf *conf = &dev->vdev.parent_obj.conf; + char *bus_name; + + if (conf->num_queues == VIRTIO_SCSI_AUTO_NUM_QUEUES) { + conf->num_queues = + virtio_pci_optimal_num_queues(VIRTIO_SCSI_VQ_NUM_FIXED); + } + + if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) { + vpci_dev->nvectors = conf->num_queues + VIRTIO_SCSI_VQ_NUM_FIXED + 1; + } + + /* + * For command line compatibility, this sets the virtio-scsi-device bus + * name as before. + */ + if (proxy->id) { + bus_name = g_strdup_printf("%s.0", proxy->id); + virtio_device_set_child_bus_name(VIRTIO_DEVICE(vdev), bus_name); + g_free(bus_name); + } + + qdev_realize(vdev, BUS(&vpci_dev->bus), errp); +} + +static void virtio_scsi_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + + k->realize = virtio_scsi_pci_realize; + set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); + device_class_set_props(dc, virtio_scsi_pci_properties); + pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; + pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_SCSI; + pcidev_k->revision = 0x00; + pcidev_k->class_id = PCI_CLASS_STORAGE_SCSI; +} + +static void virtio_scsi_pci_instance_init(Object *obj) +{ + VirtIOSCSIPCI *dev = VIRTIO_SCSI_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VIRTIO_SCSI); +} + +static const VirtioPCIDeviceTypeInfo virtio_scsi_pci_info = { + .base_name = TYPE_VIRTIO_SCSI_PCI, + .generic_name = "virtio-scsi-pci", + .transitional_name = "virtio-scsi-pci-transitional", + .non_transitional_name = "virtio-scsi-pci-non-transitional", + .instance_size = sizeof(VirtIOSCSIPCI), + .instance_init = virtio_scsi_pci_instance_init, + .class_init = virtio_scsi_pci_class_init, +}; + +static void virtio_scsi_pci_register(void) +{ + virtio_pci_types_register(&virtio_scsi_pci_info); +} + +type_init(virtio_scsi_pci_register) diff --git a/hw/virtio/virtio-serial-pci.c b/hw/virtio/virtio-serial-pci.c new file mode 100644 index 00000000..cea31adc --- /dev/null +++ b/hw/virtio/virtio-serial-pci.c @@ -0,0 +1,117 @@ +/* + * Virtio serial PCI Bindings + * + * Copyright IBM, Corp. 2007 + * Copyright (c) 2009 CodeSourcery + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * Paul Brook <paul@codesourcery.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include "qemu/osdep.h" + +#include "hw/qdev-properties.h" +#include "hw/virtio/virtio-serial.h" +#include "qemu/module.h" +#include "hw/virtio/virtio-pci.h" +#include "qom/object.h" + +typedef struct VirtIOSerialPCI VirtIOSerialPCI; + +/* + * virtio-serial-pci: This extends VirtioPCIProxy. + */ +#define TYPE_VIRTIO_SERIAL_PCI "virtio-serial-pci-base" +DECLARE_INSTANCE_CHECKER(VirtIOSerialPCI, VIRTIO_SERIAL_PCI, + TYPE_VIRTIO_SERIAL_PCI) + +struct VirtIOSerialPCI { + VirtIOPCIProxy parent_obj; + VirtIOSerial vdev; +}; + +static void virtio_serial_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) +{ + VirtIOSerialPCI *dev = VIRTIO_SERIAL_PCI(vpci_dev); + DeviceState *vdev = DEVICE(&dev->vdev); + DeviceState *proxy = DEVICE(vpci_dev); + char *bus_name; + + if (vpci_dev->class_code != PCI_CLASS_COMMUNICATION_OTHER && + vpci_dev->class_code != PCI_CLASS_DISPLAY_OTHER && /* qemu 0.10 */ + vpci_dev->class_code != PCI_CLASS_OTHERS) { /* qemu-kvm */ + vpci_dev->class_code = PCI_CLASS_COMMUNICATION_OTHER; + } + + /* backwards-compatibility with machines that were created with + DEV_NVECTORS_UNSPECIFIED */ + if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) { + vpci_dev->nvectors = dev->vdev.serial.max_virtserial_ports + 1; + } + + /* + * For command line compatibility, this sets the virtio-serial-device bus + * name as before. + */ + if (proxy->id) { + bus_name = g_strdup_printf("%s.0", proxy->id); + virtio_device_set_child_bus_name(VIRTIO_DEVICE(vdev), bus_name); + g_free(bus_name); + } + + qdev_realize(vdev, BUS(&vpci_dev->bus), errp); +} + +static Property virtio_serial_pci_properties[] = { + DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true), + DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 2), + DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0), + DEFINE_PROP_END_OF_LIST(), +}; + +static void virtio_serial_pci_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass); + PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass); + k->realize = virtio_serial_pci_realize; + set_bit(DEVICE_CATEGORY_INPUT, dc->categories); + device_class_set_props(dc, virtio_serial_pci_properties); + pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; + pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_CONSOLE; + pcidev_k->revision = VIRTIO_PCI_ABI_VERSION; + pcidev_k->class_id = PCI_CLASS_COMMUNICATION_OTHER; +} + +static void virtio_serial_pci_instance_init(Object *obj) +{ + VirtIOSerialPCI *dev = VIRTIO_SERIAL_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VIRTIO_SERIAL); +} + +static const VirtioPCIDeviceTypeInfo virtio_serial_pci_info = { + .base_name = TYPE_VIRTIO_SERIAL_PCI, + .generic_name = "virtio-serial-pci", + .transitional_name = "virtio-serial-pci-transitional", + .non_transitional_name = "virtio-serial-pci-non-transitional", + .instance_size = sizeof(VirtIOSerialPCI), + .instance_init = virtio_serial_pci_instance_init, + .class_init = virtio_serial_pci_class_init, +}; + +static void virtio_serial_pci_register(void) +{ + virtio_pci_types_register(&virtio_serial_pci_info); +} + +type_init(virtio_serial_pci_register) diff --git a/hw/virtio/virtio-stub.c b/hw/virtio/virtio-stub.c new file mode 100644 index 00000000..7ddb22cc --- /dev/null +++ b/hw/virtio/virtio-stub.c @@ -0,0 +1,42 @@ +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-virtio.h" + +static void *qmp_virtio_unsupported(Error **errp) +{ + error_setg(errp, "Virtio is disabled"); + return NULL; +} + +VirtioInfoList *qmp_x_query_virtio(Error **errp) +{ + return qmp_virtio_unsupported(errp); +} + +VirtioStatus *qmp_x_query_virtio_status(const char *path, Error **errp) +{ + return qmp_virtio_unsupported(errp); +} + +VirtVhostQueueStatus *qmp_x_query_virtio_vhost_queue_status(const char *path, + uint16_t queue, + Error **errp) +{ + return qmp_virtio_unsupported(errp); +} + +VirtQueueStatus *qmp_x_query_virtio_queue_status(const char *path, + uint16_t queue, + Error **errp) +{ + return qmp_virtio_unsupported(errp); +} + +VirtioQueueElement *qmp_x_query_virtio_queue_element(const char *path, + uint16_t queue, + bool has_index, + uint16_t index, + Error **errp) +{ + return qmp_virtio_unsupported(errp); +} diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c new file mode 100644 index 00000000..eb6347ab --- /dev/null +++ b/hw/virtio/virtio.c @@ -0,0 +1,4991 @@ +/* + * Virtio Support + * + * Copyright IBM, Corp. 2007 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qapi/qmp/qdict.h" +#include "qapi/qapi-commands-virtio.h" +#include "qapi/qapi-commands-qom.h" +#include "qapi/qapi-visit-virtio.h" +#include "qapi/qmp/qjson.h" +#include "cpu.h" +#include "trace.h" +#include "qemu/error-report.h" +#include "qemu/log.h" +#include "qemu/main-loop.h" +#include "qemu/module.h" +#include "qom/object_interfaces.h" +#include "hw/virtio/virtio.h" +#include "migration/qemu-file-types.h" +#include "qemu/atomic.h" +#include "hw/virtio/virtio-bus.h" +#include "hw/qdev-properties.h" +#include "hw/virtio/virtio-access.h" +#include "sysemu/dma.h" +#include "sysemu/runstate.h" +#include "standard-headers/linux/virtio_ids.h" +#include "standard-headers/linux/vhost_types.h" +#include "standard-headers/linux/virtio_blk.h" +#include "standard-headers/linux/virtio_console.h" +#include "standard-headers/linux/virtio_gpu.h" +#include "standard-headers/linux/virtio_net.h" +#include "standard-headers/linux/virtio_scsi.h" +#include "standard-headers/linux/virtio_i2c.h" +#include "standard-headers/linux/virtio_balloon.h" +#include "standard-headers/linux/virtio_iommu.h" +#include "standard-headers/linux/virtio_mem.h" +#include "standard-headers/linux/virtio_vsock.h" +#include CONFIG_DEVICES + +/* QAPI list of realized VirtIODevices */ +static QTAILQ_HEAD(, VirtIODevice) virtio_list; + +/* + * Maximum size of virtio device config space + */ +#define VHOST_USER_MAX_CONFIG_SIZE 256 + +#define FEATURE_ENTRY(name, desc) (qmp_virtio_feature_map_t) \ + { .virtio_bit = name, .feature_desc = desc } + +enum VhostUserProtocolFeature { + VHOST_USER_PROTOCOL_F_MQ = 0, + VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1, + VHOST_USER_PROTOCOL_F_RARP = 2, + VHOST_USER_PROTOCOL_F_REPLY_ACK = 3, + VHOST_USER_PROTOCOL_F_NET_MTU = 4, + VHOST_USER_PROTOCOL_F_SLAVE_REQ = 5, + VHOST_USER_PROTOCOL_F_CROSS_ENDIAN = 6, + VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7, + VHOST_USER_PROTOCOL_F_PAGEFAULT = 8, + VHOST_USER_PROTOCOL_F_CONFIG = 9, + VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10, + VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11, + VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12, + VHOST_USER_PROTOCOL_F_RESET_DEVICE = 13, + VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS = 14, + VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15, + VHOST_USER_PROTOCOL_F_MAX +}; + +/* Virtio transport features mapping */ +static qmp_virtio_feature_map_t virtio_transport_map[] = { + /* Virtio device transport features */ +#ifndef VIRTIO_CONFIG_NO_LEGACY + FEATURE_ENTRY(VIRTIO_F_NOTIFY_ON_EMPTY, \ + "VIRTIO_F_NOTIFY_ON_EMPTY: Notify when device runs out of avail. " + "descs. on VQ"), + FEATURE_ENTRY(VIRTIO_F_ANY_LAYOUT, \ + "VIRTIO_F_ANY_LAYOUT: Device accepts arbitrary desc. layouts"), +#endif /* !VIRTIO_CONFIG_NO_LEGACY */ + FEATURE_ENTRY(VIRTIO_F_VERSION_1, \ + "VIRTIO_F_VERSION_1: Device compliant for v1 spec (legacy)"), + FEATURE_ENTRY(VIRTIO_F_IOMMU_PLATFORM, \ + "VIRTIO_F_IOMMU_PLATFORM: Device can be used on IOMMU platform"), + FEATURE_ENTRY(VIRTIO_F_RING_PACKED, \ + "VIRTIO_F_RING_PACKED: Device supports packed VQ layout"), + FEATURE_ENTRY(VIRTIO_F_IN_ORDER, \ + "VIRTIO_F_IN_ORDER: Device uses buffers in same order as made " + "available by driver"), + FEATURE_ENTRY(VIRTIO_F_ORDER_PLATFORM, \ + "VIRTIO_F_ORDER_PLATFORM: Memory accesses ordered by platform"), + FEATURE_ENTRY(VIRTIO_F_SR_IOV, \ + "VIRTIO_F_SR_IOV: Device supports single root I/O virtualization"), + /* Virtio ring transport features */ + FEATURE_ENTRY(VIRTIO_RING_F_INDIRECT_DESC, \ + "VIRTIO_RING_F_INDIRECT_DESC: Indirect descriptors supported"), + FEATURE_ENTRY(VIRTIO_RING_F_EVENT_IDX, \ + "VIRTIO_RING_F_EVENT_IDX: Used & avail. event fields enabled"), + { -1, "" } +}; + +/* Vhost-user protocol features mapping */ +static qmp_virtio_feature_map_t vhost_user_protocol_map[] = { + FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_MQ, \ + "VHOST_USER_PROTOCOL_F_MQ: Multiqueue protocol supported"), + FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_LOG_SHMFD, \ + "VHOST_USER_PROTOCOL_F_LOG_SHMFD: Shared log memory fd supported"), + FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_RARP, \ + "VHOST_USER_PROTOCOL_F_RARP: Vhost-user back-end RARP broadcasting " + "supported"), + FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_REPLY_ACK, \ + "VHOST_USER_PROTOCOL_F_REPLY_ACK: Requested operation status ack. " + "supported"), + FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_NET_MTU, \ + "VHOST_USER_PROTOCOL_F_NET_MTU: Expose host MTU to guest supported"), + FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_SLAVE_REQ, \ + "VHOST_USER_PROTOCOL_F_SLAVE_REQ: Socket fd for back-end initiated " + "requests supported"), + FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_CROSS_ENDIAN, \ + "VHOST_USER_PROTOCOL_F_CROSS_ENDIAN: Endianness of VQs for legacy " + "devices supported"), + FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_CRYPTO_SESSION, \ + "VHOST_USER_PROTOCOL_F_CRYPTO_SESSION: Session creation for crypto " + "operations supported"), + FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_PAGEFAULT, \ + "VHOST_USER_PROTOCOL_F_PAGEFAULT: Request servicing on userfaultfd " + "for accessed pages supported"), + FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_CONFIG, \ + "VHOST_USER_PROTOCOL_F_CONFIG: Vhost-user messaging for virtio " + "device configuration space supported"), + FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD, \ + "VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD: Slave fd communication " + "channel supported"), + FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_HOST_NOTIFIER, \ + "VHOST_USER_PROTOCOL_F_HOST_NOTIFIER: Host notifiers for specified " + "VQs supported"), + FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD, \ + "VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD: Shared inflight I/O buffers " + "supported"), + FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_RESET_DEVICE, \ + "VHOST_USER_PROTOCOL_F_RESET_DEVICE: Disabling all rings and " + "resetting internal device state supported"), + FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS, \ + "VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS: In-band messaging " + "supported"), + FEATURE_ENTRY(VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS, \ + "VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS: Configuration for " + "memory slots supported"), + { -1, "" } +}; + +/* virtio device configuration statuses */ +static qmp_virtio_feature_map_t virtio_config_status_map[] = { + FEATURE_ENTRY(VIRTIO_CONFIG_S_DRIVER_OK, \ + "VIRTIO_CONFIG_S_DRIVER_OK: Driver setup and ready"), + FEATURE_ENTRY(VIRTIO_CONFIG_S_FEATURES_OK, \ + "VIRTIO_CONFIG_S_FEATURES_OK: Feature negotiation complete"), + FEATURE_ENTRY(VIRTIO_CONFIG_S_DRIVER, \ + "VIRTIO_CONFIG_S_DRIVER: Guest OS compatible with device"), + FEATURE_ENTRY(VIRTIO_CONFIG_S_NEEDS_RESET, \ + "VIRTIO_CONFIG_S_NEEDS_RESET: Irrecoverable error, device needs " + "reset"), + FEATURE_ENTRY(VIRTIO_CONFIG_S_FAILED, \ + "VIRTIO_CONFIG_S_FAILED: Error in guest, device failed"), + FEATURE_ENTRY(VIRTIO_CONFIG_S_ACKNOWLEDGE, \ + "VIRTIO_CONFIG_S_ACKNOWLEDGE: Valid virtio device found"), + { -1, "" } +}; + +/* virtio-blk features mapping */ +qmp_virtio_feature_map_t virtio_blk_feature_map[] = { + FEATURE_ENTRY(VIRTIO_BLK_F_SIZE_MAX, \ + "VIRTIO_BLK_F_SIZE_MAX: Max segment size is size_max"), + FEATURE_ENTRY(VIRTIO_BLK_F_SEG_MAX, \ + "VIRTIO_BLK_F_SEG_MAX: Max segments in a request is seg_max"), + FEATURE_ENTRY(VIRTIO_BLK_F_GEOMETRY, \ + "VIRTIO_BLK_F_GEOMETRY: Legacy geometry available"), + FEATURE_ENTRY(VIRTIO_BLK_F_RO, \ + "VIRTIO_BLK_F_RO: Device is read-only"), + FEATURE_ENTRY(VIRTIO_BLK_F_BLK_SIZE, \ + "VIRTIO_BLK_F_BLK_SIZE: Block size of disk available"), + FEATURE_ENTRY(VIRTIO_BLK_F_TOPOLOGY, \ + "VIRTIO_BLK_F_TOPOLOGY: Topology information available"), + FEATURE_ENTRY(VIRTIO_BLK_F_MQ, \ + "VIRTIO_BLK_F_MQ: Multiqueue supported"), + FEATURE_ENTRY(VIRTIO_BLK_F_DISCARD, \ + "VIRTIO_BLK_F_DISCARD: Discard command supported"), + FEATURE_ENTRY(VIRTIO_BLK_F_WRITE_ZEROES, \ + "VIRTIO_BLK_F_WRITE_ZEROES: Write zeroes command supported"), +#ifndef VIRTIO_BLK_NO_LEGACY + FEATURE_ENTRY(VIRTIO_BLK_F_BARRIER, \ + "VIRTIO_BLK_F_BARRIER: Request barriers supported"), + FEATURE_ENTRY(VIRTIO_BLK_F_SCSI, \ + "VIRTIO_BLK_F_SCSI: SCSI packet commands supported"), + FEATURE_ENTRY(VIRTIO_BLK_F_FLUSH, \ + "VIRTIO_BLK_F_FLUSH: Flush command supported"), + FEATURE_ENTRY(VIRTIO_BLK_F_CONFIG_WCE, \ + "VIRTIO_BLK_F_CONFIG_WCE: Cache writeback and writethrough modes " + "supported"), +#endif /* !VIRTIO_BLK_NO_LEGACY */ + FEATURE_ENTRY(VHOST_F_LOG_ALL, \ + "VHOST_F_LOG_ALL: Logging write descriptors supported"), + FEATURE_ENTRY(VHOST_USER_F_PROTOCOL_FEATURES, \ + "VHOST_USER_F_PROTOCOL_FEATURES: Vhost-user protocol features " + "negotiation supported"), + { -1, "" } +}; + +/* virtio-serial features mapping */ +qmp_virtio_feature_map_t virtio_serial_feature_map[] = { + FEATURE_ENTRY(VIRTIO_CONSOLE_F_SIZE, \ + "VIRTIO_CONSOLE_F_SIZE: Host providing console size"), + FEATURE_ENTRY(VIRTIO_CONSOLE_F_MULTIPORT, \ + "VIRTIO_CONSOLE_F_MULTIPORT: Multiple ports for device supported"), + FEATURE_ENTRY(VIRTIO_CONSOLE_F_EMERG_WRITE, \ + "VIRTIO_CONSOLE_F_EMERG_WRITE: Emergency write supported"), + { -1, "" } +}; + +/* virtio-gpu features mapping */ +qmp_virtio_feature_map_t virtio_gpu_feature_map[] = { + FEATURE_ENTRY(VIRTIO_GPU_F_VIRGL, \ + "VIRTIO_GPU_F_VIRGL: Virgl 3D mode supported"), + FEATURE_ENTRY(VIRTIO_GPU_F_EDID, \ + "VIRTIO_GPU_F_EDID: EDID metadata supported"), + FEATURE_ENTRY(VIRTIO_GPU_F_RESOURCE_UUID, \ + "VIRTIO_GPU_F_RESOURCE_UUID: Resource UUID assigning supported"), + FEATURE_ENTRY(VIRTIO_GPU_F_RESOURCE_BLOB, \ + "VIRTIO_GPU_F_RESOURCE_BLOB: Size-based blob resources supported"), + FEATURE_ENTRY(VIRTIO_GPU_F_CONTEXT_INIT, \ + "VIRTIO_GPU_F_CONTEXT_INIT: Context types and synchronization " + "timelines supported"), + FEATURE_ENTRY(VHOST_F_LOG_ALL, \ + "VHOST_F_LOG_ALL: Logging write descriptors supported"), + FEATURE_ENTRY(VHOST_USER_F_PROTOCOL_FEATURES, \ + "VHOST_USER_F_PROTOCOL_FEATURES: Vhost-user protocol features " + "negotiation supported"), + { -1, "" } +}; + +/* virtio-input features mapping */ +qmp_virtio_feature_map_t virtio_input_feature_map[] = { + FEATURE_ENTRY(VHOST_F_LOG_ALL, \ + "VHOST_F_LOG_ALL: Logging write descriptors supported"), + FEATURE_ENTRY(VHOST_USER_F_PROTOCOL_FEATURES, \ + "VHOST_USER_F_PROTOCOL_FEATURES: Vhost-user protocol features " + "negotiation supported"), + { -1, "" } +}; + +/* virtio-net features mapping */ +qmp_virtio_feature_map_t virtio_net_feature_map[] = { + FEATURE_ENTRY(VIRTIO_NET_F_CSUM, \ + "VIRTIO_NET_F_CSUM: Device handling packets with partial checksum " + "supported"), + FEATURE_ENTRY(VIRTIO_NET_F_GUEST_CSUM, \ + "VIRTIO_NET_F_GUEST_CSUM: Driver handling packets with partial " + "checksum supported"), + FEATURE_ENTRY(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \ + "VIRTIO_NET_F_CTRL_GUEST_OFFLOADS: Control channel offloading " + "reconfig. supported"), + FEATURE_ENTRY(VIRTIO_NET_F_MTU, \ + "VIRTIO_NET_F_MTU: Device max MTU reporting supported"), + FEATURE_ENTRY(VIRTIO_NET_F_MAC, \ + "VIRTIO_NET_F_MAC: Device has given MAC address"), + FEATURE_ENTRY(VIRTIO_NET_F_GUEST_TSO4, \ + "VIRTIO_NET_F_GUEST_TSO4: Driver can receive TSOv4"), + FEATURE_ENTRY(VIRTIO_NET_F_GUEST_TSO6, \ + "VIRTIO_NET_F_GUEST_TSO6: Driver can receive TSOv6"), + FEATURE_ENTRY(VIRTIO_NET_F_GUEST_ECN, \ + "VIRTIO_NET_F_GUEST_ECN: Driver can receive TSO with ECN"), + FEATURE_ENTRY(VIRTIO_NET_F_GUEST_UFO, \ + "VIRTIO_NET_F_GUEST_UFO: Driver can receive UFO"), + FEATURE_ENTRY(VIRTIO_NET_F_HOST_TSO4, \ + "VIRTIO_NET_F_HOST_TSO4: Device can receive TSOv4"), + FEATURE_ENTRY(VIRTIO_NET_F_HOST_TSO6, \ + "VIRTIO_NET_F_HOST_TSO6: Device can receive TSOv6"), + FEATURE_ENTRY(VIRTIO_NET_F_HOST_ECN, \ + "VIRTIO_NET_F_HOST_ECN: Device can receive TSO with ECN"), + FEATURE_ENTRY(VIRTIO_NET_F_HOST_UFO, \ + "VIRTIO_NET_F_HOST_UFO: Device can receive UFO"), + FEATURE_ENTRY(VIRTIO_NET_F_MRG_RXBUF, \ + "VIRTIO_NET_F_MRG_RXBUF: Driver can merge receive buffers"), + FEATURE_ENTRY(VIRTIO_NET_F_STATUS, \ + "VIRTIO_NET_F_STATUS: Configuration status field available"), + FEATURE_ENTRY(VIRTIO_NET_F_CTRL_VQ, \ + "VIRTIO_NET_F_CTRL_VQ: Control channel available"), + FEATURE_ENTRY(VIRTIO_NET_F_CTRL_RX, \ + "VIRTIO_NET_F_CTRL_RX: Control channel RX mode supported"), + FEATURE_ENTRY(VIRTIO_NET_F_CTRL_VLAN, \ + "VIRTIO_NET_F_CTRL_VLAN: Control channel VLAN filtering supported"), + FEATURE_ENTRY(VIRTIO_NET_F_CTRL_RX_EXTRA, \ + "VIRTIO_NET_F_CTRL_RX_EXTRA: Extra RX mode control supported"), + FEATURE_ENTRY(VIRTIO_NET_F_GUEST_ANNOUNCE, \ + "VIRTIO_NET_F_GUEST_ANNOUNCE: Driver sending gratuitous packets " + "supported"), + FEATURE_ENTRY(VIRTIO_NET_F_MQ, \ + "VIRTIO_NET_F_MQ: Multiqueue with automatic receive steering " + "supported"), + FEATURE_ENTRY(VIRTIO_NET_F_CTRL_MAC_ADDR, \ + "VIRTIO_NET_F_CTRL_MAC_ADDR: MAC address set through control " + "channel"), + FEATURE_ENTRY(VIRTIO_NET_F_HASH_REPORT, \ + "VIRTIO_NET_F_HASH_REPORT: Hash reporting supported"), + FEATURE_ENTRY(VIRTIO_NET_F_RSS, \ + "VIRTIO_NET_F_RSS: RSS RX steering supported"), + FEATURE_ENTRY(VIRTIO_NET_F_RSC_EXT, \ + "VIRTIO_NET_F_RSC_EXT: Extended coalescing info supported"), + FEATURE_ENTRY(VIRTIO_NET_F_STANDBY, \ + "VIRTIO_NET_F_STANDBY: Device acting as standby for primary " + "device with same MAC addr. supported"), + FEATURE_ENTRY(VIRTIO_NET_F_SPEED_DUPLEX, \ + "VIRTIO_NET_F_SPEED_DUPLEX: Device set linkspeed and duplex"), +#ifndef VIRTIO_NET_NO_LEGACY + FEATURE_ENTRY(VIRTIO_NET_F_GSO, \ + "VIRTIO_NET_F_GSO: Handling GSO-type packets supported"), +#endif /* !VIRTIO_NET_NO_LEGACY */ + FEATURE_ENTRY(VHOST_NET_F_VIRTIO_NET_HDR, \ + "VHOST_NET_F_VIRTIO_NET_HDR: Virtio-net headers for RX and TX " + "packets supported"), + FEATURE_ENTRY(VHOST_F_LOG_ALL, \ + "VHOST_F_LOG_ALL: Logging write descriptors supported"), + FEATURE_ENTRY(VHOST_USER_F_PROTOCOL_FEATURES, \ + "VHOST_USER_F_PROTOCOL_FEATURES: Vhost-user protocol features " + "negotiation supported"), + { -1, "" } +}; + +/* virtio-scsi features mapping */ +qmp_virtio_feature_map_t virtio_scsi_feature_map[] = { + FEATURE_ENTRY(VIRTIO_SCSI_F_INOUT, \ + "VIRTIO_SCSI_F_INOUT: Requests including read and writable data " + "buffers suppoted"), + FEATURE_ENTRY(VIRTIO_SCSI_F_HOTPLUG, \ + "VIRTIO_SCSI_F_HOTPLUG: Reporting and handling hot-plug events " + "supported"), + FEATURE_ENTRY(VIRTIO_SCSI_F_CHANGE, \ + "VIRTIO_SCSI_F_CHANGE: Reporting and handling LUN changes " + "supported"), + FEATURE_ENTRY(VIRTIO_SCSI_F_T10_PI, \ + "VIRTIO_SCSI_F_T10_PI: T10 info included in request header"), + FEATURE_ENTRY(VHOST_F_LOG_ALL, \ + "VHOST_F_LOG_ALL: Logging write descriptors supported"), + FEATURE_ENTRY(VHOST_USER_F_PROTOCOL_FEATURES, \ + "VHOST_USER_F_PROTOCOL_FEATURES: Vhost-user protocol features " + "negotiation supported"), + { -1, "" } +}; + +/* virtio/vhost-user-fs features mapping */ +qmp_virtio_feature_map_t virtio_fs_feature_map[] = { + FEATURE_ENTRY(VHOST_F_LOG_ALL, \ + "VHOST_F_LOG_ALL: Logging write descriptors supported"), + FEATURE_ENTRY(VHOST_USER_F_PROTOCOL_FEATURES, \ + "VHOST_USER_F_PROTOCOL_FEATURES: Vhost-user protocol features " + "negotiation supported"), + { -1, "" } +}; + +/* virtio/vhost-user-i2c features mapping */ +qmp_virtio_feature_map_t virtio_i2c_feature_map[] = { + FEATURE_ENTRY(VIRTIO_I2C_F_ZERO_LENGTH_REQUEST, \ + "VIRTIO_I2C_F_ZERO_LEGNTH_REQUEST: Zero length requests supported"), + FEATURE_ENTRY(VHOST_F_LOG_ALL, \ + "VHOST_F_LOG_ALL: Logging write descriptors supported"), + FEATURE_ENTRY(VHOST_USER_F_PROTOCOL_FEATURES, \ + "VHOST_USER_F_PROTOCOL_FEATURES: Vhost-user protocol features " + "negotiation supported"), + { -1, "" } +}; + +/* virtio/vhost-vsock features mapping */ +qmp_virtio_feature_map_t virtio_vsock_feature_map[] = { + FEATURE_ENTRY(VIRTIO_VSOCK_F_SEQPACKET, \ + "VIRTIO_VSOCK_F_SEQPACKET: SOCK_SEQPACKET supported"), + FEATURE_ENTRY(VHOST_F_LOG_ALL, \ + "VHOST_F_LOG_ALL: Logging write descriptors supported"), + FEATURE_ENTRY(VHOST_USER_F_PROTOCOL_FEATURES, \ + "VHOST_USER_F_PROTOCOL_FEATURES: Vhost-user protocol features " + "negotiation supported"), + { -1, "" } +}; + +/* virtio-balloon features mapping */ +qmp_virtio_feature_map_t virtio_balloon_feature_map[] = { + FEATURE_ENTRY(VIRTIO_BALLOON_F_MUST_TELL_HOST, \ + "VIRTIO_BALLOON_F_MUST_TELL_HOST: Tell host before reclaiming " + "pages"), + FEATURE_ENTRY(VIRTIO_BALLOON_F_STATS_VQ, \ + "VIRTIO_BALLOON_F_STATS_VQ: Guest memory stats VQ available"), + FEATURE_ENTRY(VIRTIO_BALLOON_F_DEFLATE_ON_OOM, \ + "VIRTIO_BALLOON_F_DEFLATE_ON_OOM: Deflate balloon when guest OOM"), + FEATURE_ENTRY(VIRTIO_BALLOON_F_FREE_PAGE_HINT, \ + "VIRTIO_BALLOON_F_FREE_PAGE_HINT: VQ reporting free pages enabled"), + FEATURE_ENTRY(VIRTIO_BALLOON_F_PAGE_POISON, \ + "VIRTIO_BALLOON_F_PAGE_POISON: Guest page poisoning enabled"), + FEATURE_ENTRY(VIRTIO_BALLOON_F_REPORTING, \ + "VIRTIO_BALLOON_F_REPORTING: Page reporting VQ enabled"), + { -1, "" } +}; + +/* virtio-crypto features mapping */ +qmp_virtio_feature_map_t virtio_crypto_feature_map[] = { + FEATURE_ENTRY(VHOST_F_LOG_ALL, \ + "VHOST_F_LOG_ALL: Logging write descriptors supported"), + { -1, "" } +}; + +/* virtio-iommu features mapping */ +qmp_virtio_feature_map_t virtio_iommu_feature_map[] = { + FEATURE_ENTRY(VIRTIO_IOMMU_F_INPUT_RANGE, \ + "VIRTIO_IOMMU_F_INPUT_RANGE: Range of available virtual addrs. " + "available"), + FEATURE_ENTRY(VIRTIO_IOMMU_F_DOMAIN_RANGE, \ + "VIRTIO_IOMMU_F_DOMAIN_RANGE: Number of supported domains " + "available"), + FEATURE_ENTRY(VIRTIO_IOMMU_F_MAP_UNMAP, \ + "VIRTIO_IOMMU_F_MAP_UNMAP: Map and unmap requests available"), + FEATURE_ENTRY(VIRTIO_IOMMU_F_BYPASS, \ + "VIRTIO_IOMMU_F_BYPASS: Endpoints not attached to domains are in " + "bypass mode"), + FEATURE_ENTRY(VIRTIO_IOMMU_F_PROBE, \ + "VIRTIO_IOMMU_F_PROBE: Probe requests available"), + FEATURE_ENTRY(VIRTIO_IOMMU_F_MMIO, \ + "VIRTIO_IOMMU_F_MMIO: VIRTIO_IOMMU_MAP_F_MMIO flag available"), + FEATURE_ENTRY(VIRTIO_IOMMU_F_BYPASS_CONFIG, \ + "VIRTIO_IOMMU_F_BYPASS_CONFIG: Bypass field of IOMMU config " + "available"), + { -1, "" } +}; + +/* virtio-mem features mapping */ +qmp_virtio_feature_map_t virtio_mem_feature_map[] = { +#ifndef CONFIG_ACPI + FEATURE_ENTRY(VIRTIO_MEM_F_ACPI_PXM, \ + "VIRTIO_MEM_F_ACPI_PXM: node_id is an ACPI PXM and is valid"), +#endif /* !CONFIG_ACPI */ + FEATURE_ENTRY(VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE, \ + "VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE: Unplugged memory cannot be " + "accessed"), + { -1, "" } +}; + +/* virtio-rng features mapping */ +qmp_virtio_feature_map_t virtio_rng_feature_map[] = { + FEATURE_ENTRY(VHOST_F_LOG_ALL, \ + "VHOST_F_LOG_ALL: Logging write descriptors supported"), + FEATURE_ENTRY(VHOST_USER_F_PROTOCOL_FEATURES, \ + "VHOST_USER_F_PROTOCOL_FEATURES: Vhost-user protocol features " + "negotiation supported"), + { -1, "" } +}; + +/* + * The alignment to use between consumer and producer parts of vring. + * x86 pagesize again. This is the default, used by transports like PCI + * which don't provide a means for the guest to tell the host the alignment. + */ +#define VIRTIO_PCI_VRING_ALIGN 4096 + +typedef struct VRingDesc +{ + uint64_t addr; + uint32_t len; + uint16_t flags; + uint16_t next; +} VRingDesc; + +typedef struct VRingPackedDesc { + uint64_t addr; + uint32_t len; + uint16_t id; + uint16_t flags; +} VRingPackedDesc; + +typedef struct VRingAvail +{ + uint16_t flags; + uint16_t idx; + uint16_t ring[]; +} VRingAvail; + +typedef struct VRingUsedElem +{ + uint32_t id; + uint32_t len; +} VRingUsedElem; + +typedef struct VRingUsed +{ + uint16_t flags; + uint16_t idx; + VRingUsedElem ring[]; +} VRingUsed; + +typedef struct VRingMemoryRegionCaches { + struct rcu_head rcu; + MemoryRegionCache desc; + MemoryRegionCache avail; + MemoryRegionCache used; +} VRingMemoryRegionCaches; + +typedef struct VRing +{ + unsigned int num; + unsigned int num_default; + unsigned int align; + hwaddr desc; + hwaddr avail; + hwaddr used; + VRingMemoryRegionCaches *caches; +} VRing; + +typedef struct VRingPackedDescEvent { + uint16_t off_wrap; + uint16_t flags; +} VRingPackedDescEvent ; + +struct VirtQueue +{ + VRing vring; + VirtQueueElement *used_elems; + + /* Next head to pop */ + uint16_t last_avail_idx; + bool last_avail_wrap_counter; + + /* Last avail_idx read from VQ. */ + uint16_t shadow_avail_idx; + bool shadow_avail_wrap_counter; + + uint16_t used_idx; + bool used_wrap_counter; + + /* Last used index value we have signalled on */ + uint16_t signalled_used; + + /* Last used index value we have signalled on */ + bool signalled_used_valid; + + /* Notification enabled? */ + bool notification; + + uint16_t queue_index; + + unsigned int inuse; + + uint16_t vector; + VirtIOHandleOutput handle_output; + VirtIODevice *vdev; + EventNotifier guest_notifier; + EventNotifier host_notifier; + bool host_notifier_enabled; + QLIST_ENTRY(VirtQueue) node; +}; + +const char *virtio_device_names[] = { + [VIRTIO_ID_NET] = "virtio-net", + [VIRTIO_ID_BLOCK] = "virtio-blk", + [VIRTIO_ID_CONSOLE] = "virtio-serial", + [VIRTIO_ID_RNG] = "virtio-rng", + [VIRTIO_ID_BALLOON] = "virtio-balloon", + [VIRTIO_ID_IOMEM] = "virtio-iomem", + [VIRTIO_ID_RPMSG] = "virtio-rpmsg", + [VIRTIO_ID_SCSI] = "virtio-scsi", + [VIRTIO_ID_9P] = "virtio-9p", + [VIRTIO_ID_MAC80211_WLAN] = "virtio-mac-wlan", + [VIRTIO_ID_RPROC_SERIAL] = "virtio-rproc-serial", + [VIRTIO_ID_CAIF] = "virtio-caif", + [VIRTIO_ID_MEMORY_BALLOON] = "virtio-mem-balloon", + [VIRTIO_ID_GPU] = "virtio-gpu", + [VIRTIO_ID_CLOCK] = "virtio-clk", + [VIRTIO_ID_INPUT] = "virtio-input", + [VIRTIO_ID_VSOCK] = "vhost-vsock", + [VIRTIO_ID_CRYPTO] = "virtio-crypto", + [VIRTIO_ID_SIGNAL_DIST] = "virtio-signal", + [VIRTIO_ID_PSTORE] = "virtio-pstore", + [VIRTIO_ID_IOMMU] = "virtio-iommu", + [VIRTIO_ID_MEM] = "virtio-mem", + [VIRTIO_ID_SOUND] = "virtio-sound", + [VIRTIO_ID_FS] = "virtio-user-fs", + [VIRTIO_ID_PMEM] = "virtio-pmem", + [VIRTIO_ID_RPMB] = "virtio-rpmb", + [VIRTIO_ID_MAC80211_HWSIM] = "virtio-mac-hwsim", + [VIRTIO_ID_VIDEO_ENCODER] = "virtio-vid-encoder", + [VIRTIO_ID_VIDEO_DECODER] = "virtio-vid-decoder", + [VIRTIO_ID_SCMI] = "virtio-scmi", + [VIRTIO_ID_NITRO_SEC_MOD] = "virtio-nitro-sec-mod", + [VIRTIO_ID_I2C_ADAPTER] = "vhost-user-i2c", + [VIRTIO_ID_WATCHDOG] = "virtio-watchdog", + [VIRTIO_ID_CAN] = "virtio-can", + [VIRTIO_ID_DMABUF] = "virtio-dmabuf", + [VIRTIO_ID_PARAM_SERV] = "virtio-param-serv", + [VIRTIO_ID_AUDIO_POLICY] = "virtio-audio-pol", + [VIRTIO_ID_BT] = "virtio-bluetooth", + [VIRTIO_ID_GPIO] = "virtio-gpio" +}; + +static const char *virtio_id_to_name(uint16_t device_id) +{ + assert(device_id < G_N_ELEMENTS(virtio_device_names)); + const char *name = virtio_device_names[device_id]; + assert(name != NULL); + return name; +} + +/* Called within call_rcu(). */ +static void virtio_free_region_cache(VRingMemoryRegionCaches *caches) +{ + assert(caches != NULL); + address_space_cache_destroy(&caches->desc); + address_space_cache_destroy(&caches->avail); + address_space_cache_destroy(&caches->used); + g_free(caches); +} + +static void virtio_virtqueue_reset_region_cache(struct VirtQueue *vq) +{ + VRingMemoryRegionCaches *caches; + + caches = qatomic_read(&vq->vring.caches); + qatomic_rcu_set(&vq->vring.caches, NULL); + if (caches) { + call_rcu(caches, virtio_free_region_cache, rcu); + } +} + +static void virtio_init_region_cache(VirtIODevice *vdev, int n) +{ + VirtQueue *vq = &vdev->vq[n]; + VRingMemoryRegionCaches *old = vq->vring.caches; + VRingMemoryRegionCaches *new = NULL; + hwaddr addr, size; + int64_t len; + bool packed; + + + addr = vq->vring.desc; + if (!addr) { + goto out_no_cache; + } + new = g_new0(VRingMemoryRegionCaches, 1); + size = virtio_queue_get_desc_size(vdev, n); + packed = virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED) ? + true : false; + len = address_space_cache_init(&new->desc, vdev->dma_as, + addr, size, packed); + if (len < size) { + virtio_error(vdev, "Cannot map desc"); + goto err_desc; + } + + size = virtio_queue_get_used_size(vdev, n); + len = address_space_cache_init(&new->used, vdev->dma_as, + vq->vring.used, size, true); + if (len < size) { + virtio_error(vdev, "Cannot map used"); + goto err_used; + } + + size = virtio_queue_get_avail_size(vdev, n); + len = address_space_cache_init(&new->avail, vdev->dma_as, + vq->vring.avail, size, false); + if (len < size) { + virtio_error(vdev, "Cannot map avail"); + goto err_avail; + } + + qatomic_rcu_set(&vq->vring.caches, new); + if (old) { + call_rcu(old, virtio_free_region_cache, rcu); + } + return; + +err_avail: + address_space_cache_destroy(&new->avail); +err_used: + address_space_cache_destroy(&new->used); +err_desc: + address_space_cache_destroy(&new->desc); +out_no_cache: + g_free(new); + virtio_virtqueue_reset_region_cache(vq); +} + +/* virt queue functions */ +void virtio_queue_update_rings(VirtIODevice *vdev, int n) +{ + VRing *vring = &vdev->vq[n].vring; + + if (!vring->num || !vring->desc || !vring->align) { + /* not yet setup -> nothing to do */ + return; + } + vring->avail = vring->desc + vring->num * sizeof(VRingDesc); + vring->used = vring_align(vring->avail + + offsetof(VRingAvail, ring[vring->num]), + vring->align); + virtio_init_region_cache(vdev, n); +} + +/* Called within rcu_read_lock(). */ +static void vring_split_desc_read(VirtIODevice *vdev, VRingDesc *desc, + MemoryRegionCache *cache, int i) +{ + address_space_read_cached(cache, i * sizeof(VRingDesc), + desc, sizeof(VRingDesc)); + virtio_tswap64s(vdev, &desc->addr); + virtio_tswap32s(vdev, &desc->len); + virtio_tswap16s(vdev, &desc->flags); + virtio_tswap16s(vdev, &desc->next); +} + +static void vring_packed_event_read(VirtIODevice *vdev, + MemoryRegionCache *cache, + VRingPackedDescEvent *e) +{ + hwaddr off_off = offsetof(VRingPackedDescEvent, off_wrap); + hwaddr off_flags = offsetof(VRingPackedDescEvent, flags); + + e->flags = virtio_lduw_phys_cached(vdev, cache, off_flags); + /* Make sure flags is seen before off_wrap */ + smp_rmb(); + e->off_wrap = virtio_lduw_phys_cached(vdev, cache, off_off); + virtio_tswap16s(vdev, &e->flags); +} + +static void vring_packed_off_wrap_write(VirtIODevice *vdev, + MemoryRegionCache *cache, + uint16_t off_wrap) +{ + hwaddr off = offsetof(VRingPackedDescEvent, off_wrap); + + virtio_stw_phys_cached(vdev, cache, off, off_wrap); + address_space_cache_invalidate(cache, off, sizeof(off_wrap)); +} + +static void vring_packed_flags_write(VirtIODevice *vdev, + MemoryRegionCache *cache, uint16_t flags) +{ + hwaddr off = offsetof(VRingPackedDescEvent, flags); + + virtio_stw_phys_cached(vdev, cache, off, flags); + address_space_cache_invalidate(cache, off, sizeof(flags)); +} + +/* Called within rcu_read_lock(). */ +static VRingMemoryRegionCaches *vring_get_region_caches(struct VirtQueue *vq) +{ + return qatomic_rcu_read(&vq->vring.caches); +} + +/* Called within rcu_read_lock(). */ +static inline uint16_t vring_avail_flags(VirtQueue *vq) +{ + VRingMemoryRegionCaches *caches = vring_get_region_caches(vq); + hwaddr pa = offsetof(VRingAvail, flags); + + if (!caches) { + return 0; + } + + return virtio_lduw_phys_cached(vq->vdev, &caches->avail, pa); +} + +/* Called within rcu_read_lock(). */ +static inline uint16_t vring_avail_idx(VirtQueue *vq) +{ + VRingMemoryRegionCaches *caches = vring_get_region_caches(vq); + hwaddr pa = offsetof(VRingAvail, idx); + + if (!caches) { + return 0; + } + + vq->shadow_avail_idx = virtio_lduw_phys_cached(vq->vdev, &caches->avail, pa); + return vq->shadow_avail_idx; +} + +/* Called within rcu_read_lock(). */ +static inline uint16_t vring_avail_ring(VirtQueue *vq, int i) +{ + VRingMemoryRegionCaches *caches = vring_get_region_caches(vq); + hwaddr pa = offsetof(VRingAvail, ring[i]); + + if (!caches) { + return 0; + } + + return virtio_lduw_phys_cached(vq->vdev, &caches->avail, pa); +} + +/* Called within rcu_read_lock(). */ +static inline uint16_t vring_get_used_event(VirtQueue *vq) +{ + return vring_avail_ring(vq, vq->vring.num); +} + +/* Called within rcu_read_lock(). */ +static inline void vring_used_write(VirtQueue *vq, VRingUsedElem *uelem, + int i) +{ + VRingMemoryRegionCaches *caches = vring_get_region_caches(vq); + hwaddr pa = offsetof(VRingUsed, ring[i]); + + if (!caches) { + return; + } + + virtio_tswap32s(vq->vdev, &uelem->id); + virtio_tswap32s(vq->vdev, &uelem->len); + address_space_write_cached(&caches->used, pa, uelem, sizeof(VRingUsedElem)); + address_space_cache_invalidate(&caches->used, pa, sizeof(VRingUsedElem)); +} + +/* Called within rcu_read_lock(). */ +static inline uint16_t vring_used_flags(VirtQueue *vq) +{ + VRingMemoryRegionCaches *caches = vring_get_region_caches(vq); + hwaddr pa = offsetof(VRingUsed, flags); + + if (!caches) { + return 0; + } + + return virtio_lduw_phys_cached(vq->vdev, &caches->used, pa); +} + +/* Called within rcu_read_lock(). */ +static uint16_t vring_used_idx(VirtQueue *vq) +{ + VRingMemoryRegionCaches *caches = vring_get_region_caches(vq); + hwaddr pa = offsetof(VRingUsed, idx); + + if (!caches) { + return 0; + } + + return virtio_lduw_phys_cached(vq->vdev, &caches->used, pa); +} + +/* Called within rcu_read_lock(). */ +static inline void vring_used_idx_set(VirtQueue *vq, uint16_t val) +{ + VRingMemoryRegionCaches *caches = vring_get_region_caches(vq); + hwaddr pa = offsetof(VRingUsed, idx); + + if (caches) { + virtio_stw_phys_cached(vq->vdev, &caches->used, pa, val); + address_space_cache_invalidate(&caches->used, pa, sizeof(val)); + } + + vq->used_idx = val; +} + +/* Called within rcu_read_lock(). */ +static inline void vring_used_flags_set_bit(VirtQueue *vq, int mask) +{ + VRingMemoryRegionCaches *caches = vring_get_region_caches(vq); + VirtIODevice *vdev = vq->vdev; + hwaddr pa = offsetof(VRingUsed, flags); + uint16_t flags; + + if (!caches) { + return; + } + + flags = virtio_lduw_phys_cached(vq->vdev, &caches->used, pa); + virtio_stw_phys_cached(vdev, &caches->used, pa, flags | mask); + address_space_cache_invalidate(&caches->used, pa, sizeof(flags)); +} + +/* Called within rcu_read_lock(). */ +static inline void vring_used_flags_unset_bit(VirtQueue *vq, int mask) +{ + VRingMemoryRegionCaches *caches = vring_get_region_caches(vq); + VirtIODevice *vdev = vq->vdev; + hwaddr pa = offsetof(VRingUsed, flags); + uint16_t flags; + + if (!caches) { + return; + } + + flags = virtio_lduw_phys_cached(vq->vdev, &caches->used, pa); + virtio_stw_phys_cached(vdev, &caches->used, pa, flags & ~mask); + address_space_cache_invalidate(&caches->used, pa, sizeof(flags)); +} + +/* Called within rcu_read_lock(). */ +static inline void vring_set_avail_event(VirtQueue *vq, uint16_t val) +{ + VRingMemoryRegionCaches *caches; + hwaddr pa; + if (!vq->notification) { + return; + } + + caches = vring_get_region_caches(vq); + if (!caches) { + return; + } + + pa = offsetof(VRingUsed, ring[vq->vring.num]); + virtio_stw_phys_cached(vq->vdev, &caches->used, pa, val); + address_space_cache_invalidate(&caches->used, pa, sizeof(val)); +} + +static void virtio_queue_split_set_notification(VirtQueue *vq, int enable) +{ + RCU_READ_LOCK_GUARD(); + + if (virtio_vdev_has_feature(vq->vdev, VIRTIO_RING_F_EVENT_IDX)) { + vring_set_avail_event(vq, vring_avail_idx(vq)); + } else if (enable) { + vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY); + } else { + vring_used_flags_set_bit(vq, VRING_USED_F_NO_NOTIFY); + } + if (enable) { + /* Expose avail event/used flags before caller checks the avail idx. */ + smp_mb(); + } +} + +static void virtio_queue_packed_set_notification(VirtQueue *vq, int enable) +{ + uint16_t off_wrap; + VRingPackedDescEvent e; + VRingMemoryRegionCaches *caches; + + RCU_READ_LOCK_GUARD(); + caches = vring_get_region_caches(vq); + if (!caches) { + return; + } + + vring_packed_event_read(vq->vdev, &caches->used, &e); + + if (!enable) { + e.flags = VRING_PACKED_EVENT_FLAG_DISABLE; + } else if (virtio_vdev_has_feature(vq->vdev, VIRTIO_RING_F_EVENT_IDX)) { + off_wrap = vq->shadow_avail_idx | vq->shadow_avail_wrap_counter << 15; + vring_packed_off_wrap_write(vq->vdev, &caches->used, off_wrap); + /* Make sure off_wrap is wrote before flags */ + smp_wmb(); + e.flags = VRING_PACKED_EVENT_FLAG_DESC; + } else { + e.flags = VRING_PACKED_EVENT_FLAG_ENABLE; + } + + vring_packed_flags_write(vq->vdev, &caches->used, e.flags); + if (enable) { + /* Expose avail event/used flags before caller checks the avail idx. */ + smp_mb(); + } +} + +bool virtio_queue_get_notification(VirtQueue *vq) +{ + return vq->notification; +} + +void virtio_queue_set_notification(VirtQueue *vq, int enable) +{ + vq->notification = enable; + + if (!vq->vring.desc) { + return; + } + + if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) { + virtio_queue_packed_set_notification(vq, enable); + } else { + virtio_queue_split_set_notification(vq, enable); + } +} + +int virtio_queue_ready(VirtQueue *vq) +{ + return vq->vring.avail != 0; +} + +static void vring_packed_desc_read_flags(VirtIODevice *vdev, + uint16_t *flags, + MemoryRegionCache *cache, + int i) +{ + hwaddr off = i * sizeof(VRingPackedDesc) + offsetof(VRingPackedDesc, flags); + + *flags = virtio_lduw_phys_cached(vdev, cache, off); +} + +static void vring_packed_desc_read(VirtIODevice *vdev, + VRingPackedDesc *desc, + MemoryRegionCache *cache, + int i, bool strict_order) +{ + hwaddr off = i * sizeof(VRingPackedDesc); + + vring_packed_desc_read_flags(vdev, &desc->flags, cache, i); + + if (strict_order) { + /* Make sure flags is read before the rest fields. */ + smp_rmb(); + } + + address_space_read_cached(cache, off + offsetof(VRingPackedDesc, addr), + &desc->addr, sizeof(desc->addr)); + address_space_read_cached(cache, off + offsetof(VRingPackedDesc, id), + &desc->id, sizeof(desc->id)); + address_space_read_cached(cache, off + offsetof(VRingPackedDesc, len), + &desc->len, sizeof(desc->len)); + virtio_tswap64s(vdev, &desc->addr); + virtio_tswap16s(vdev, &desc->id); + virtio_tswap32s(vdev, &desc->len); +} + +static void vring_packed_desc_write_data(VirtIODevice *vdev, + VRingPackedDesc *desc, + MemoryRegionCache *cache, + int i) +{ + hwaddr off_id = i * sizeof(VRingPackedDesc) + + offsetof(VRingPackedDesc, id); + hwaddr off_len = i * sizeof(VRingPackedDesc) + + offsetof(VRingPackedDesc, len); + + virtio_tswap32s(vdev, &desc->len); + virtio_tswap16s(vdev, &desc->id); + address_space_write_cached(cache, off_id, &desc->id, sizeof(desc->id)); + address_space_cache_invalidate(cache, off_id, sizeof(desc->id)); + address_space_write_cached(cache, off_len, &desc->len, sizeof(desc->len)); + address_space_cache_invalidate(cache, off_len, sizeof(desc->len)); +} + +static void vring_packed_desc_write_flags(VirtIODevice *vdev, + VRingPackedDesc *desc, + MemoryRegionCache *cache, + int i) +{ + hwaddr off = i * sizeof(VRingPackedDesc) + offsetof(VRingPackedDesc, flags); + + virtio_stw_phys_cached(vdev, cache, off, desc->flags); + address_space_cache_invalidate(cache, off, sizeof(desc->flags)); +} + +static void vring_packed_desc_write(VirtIODevice *vdev, + VRingPackedDesc *desc, + MemoryRegionCache *cache, + int i, bool strict_order) +{ + vring_packed_desc_write_data(vdev, desc, cache, i); + if (strict_order) { + /* Make sure data is wrote before flags. */ + smp_wmb(); + } + vring_packed_desc_write_flags(vdev, desc, cache, i); +} + +static inline bool is_desc_avail(uint16_t flags, bool wrap_counter) +{ + bool avail, used; + + avail = !!(flags & (1 << VRING_PACKED_DESC_F_AVAIL)); + used = !!(flags & (1 << VRING_PACKED_DESC_F_USED)); + return (avail != used) && (avail == wrap_counter); +} + +/* Fetch avail_idx from VQ memory only when we really need to know if + * guest has added some buffers. + * Called within rcu_read_lock(). */ +static int virtio_queue_empty_rcu(VirtQueue *vq) +{ + if (virtio_device_disabled(vq->vdev)) { + return 1; + } + + if (unlikely(!vq->vring.avail)) { + return 1; + } + + if (vq->shadow_avail_idx != vq->last_avail_idx) { + return 0; + } + + return vring_avail_idx(vq) == vq->last_avail_idx; +} + +static int virtio_queue_split_empty(VirtQueue *vq) +{ + bool empty; + + if (virtio_device_disabled(vq->vdev)) { + return 1; + } + + if (unlikely(!vq->vring.avail)) { + return 1; + } + + if (vq->shadow_avail_idx != vq->last_avail_idx) { + return 0; + } + + RCU_READ_LOCK_GUARD(); + empty = vring_avail_idx(vq) == vq->last_avail_idx; + return empty; +} + +/* Called within rcu_read_lock(). */ +static int virtio_queue_packed_empty_rcu(VirtQueue *vq) +{ + struct VRingPackedDesc desc; + VRingMemoryRegionCaches *cache; + + if (unlikely(!vq->vring.desc)) { + return 1; + } + + cache = vring_get_region_caches(vq); + if (!cache) { + return 1; + } + + vring_packed_desc_read_flags(vq->vdev, &desc.flags, &cache->desc, + vq->last_avail_idx); + + return !is_desc_avail(desc.flags, vq->last_avail_wrap_counter); +} + +static int virtio_queue_packed_empty(VirtQueue *vq) +{ + RCU_READ_LOCK_GUARD(); + return virtio_queue_packed_empty_rcu(vq); +} + +int virtio_queue_empty(VirtQueue *vq) +{ + if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) { + return virtio_queue_packed_empty(vq); + } else { + return virtio_queue_split_empty(vq); + } +} + +static void virtqueue_unmap_sg(VirtQueue *vq, const VirtQueueElement *elem, + unsigned int len) +{ + AddressSpace *dma_as = vq->vdev->dma_as; + unsigned int offset; + int i; + + offset = 0; + for (i = 0; i < elem->in_num; i++) { + size_t size = MIN(len - offset, elem->in_sg[i].iov_len); + + dma_memory_unmap(dma_as, elem->in_sg[i].iov_base, + elem->in_sg[i].iov_len, + DMA_DIRECTION_FROM_DEVICE, size); + + offset += size; + } + + for (i = 0; i < elem->out_num; i++) + dma_memory_unmap(dma_as, elem->out_sg[i].iov_base, + elem->out_sg[i].iov_len, + DMA_DIRECTION_TO_DEVICE, + elem->out_sg[i].iov_len); +} + +/* virtqueue_detach_element: + * @vq: The #VirtQueue + * @elem: The #VirtQueueElement + * @len: number of bytes written + * + * Detach the element from the virtqueue. This function is suitable for device + * reset or other situations where a #VirtQueueElement is simply freed and will + * not be pushed or discarded. + */ +void virtqueue_detach_element(VirtQueue *vq, const VirtQueueElement *elem, + unsigned int len) +{ + vq->inuse -= elem->ndescs; + virtqueue_unmap_sg(vq, elem, len); +} + +static void virtqueue_split_rewind(VirtQueue *vq, unsigned int num) +{ + vq->last_avail_idx -= num; +} + +static void virtqueue_packed_rewind(VirtQueue *vq, unsigned int num) +{ + if (vq->last_avail_idx < num) { + vq->last_avail_idx = vq->vring.num + vq->last_avail_idx - num; + vq->last_avail_wrap_counter ^= 1; + } else { + vq->last_avail_idx -= num; + } +} + +/* virtqueue_unpop: + * @vq: The #VirtQueue + * @elem: The #VirtQueueElement + * @len: number of bytes written + * + * Pretend the most recent element wasn't popped from the virtqueue. The next + * call to virtqueue_pop() will refetch the element. + */ +void virtqueue_unpop(VirtQueue *vq, const VirtQueueElement *elem, + unsigned int len) +{ + + if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) { + virtqueue_packed_rewind(vq, 1); + } else { + virtqueue_split_rewind(vq, 1); + } + + virtqueue_detach_element(vq, elem, len); +} + +/* virtqueue_rewind: + * @vq: The #VirtQueue + * @num: Number of elements to push back + * + * Pretend that elements weren't popped from the virtqueue. The next + * virtqueue_pop() will refetch the oldest element. + * + * Use virtqueue_unpop() instead if you have a VirtQueueElement. + * + * Returns: true on success, false if @num is greater than the number of in use + * elements. + */ +bool virtqueue_rewind(VirtQueue *vq, unsigned int num) +{ + if (num > vq->inuse) { + return false; + } + + vq->inuse -= num; + if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) { + virtqueue_packed_rewind(vq, num); + } else { + virtqueue_split_rewind(vq, num); + } + return true; +} + +static void virtqueue_split_fill(VirtQueue *vq, const VirtQueueElement *elem, + unsigned int len, unsigned int idx) +{ + VRingUsedElem uelem; + + if (unlikely(!vq->vring.used)) { + return; + } + + idx = (idx + vq->used_idx) % vq->vring.num; + + uelem.id = elem->index; + uelem.len = len; + vring_used_write(vq, &uelem, idx); +} + +static void virtqueue_packed_fill(VirtQueue *vq, const VirtQueueElement *elem, + unsigned int len, unsigned int idx) +{ + vq->used_elems[idx].index = elem->index; + vq->used_elems[idx].len = len; + vq->used_elems[idx].ndescs = elem->ndescs; +} + +static void virtqueue_packed_fill_desc(VirtQueue *vq, + const VirtQueueElement *elem, + unsigned int idx, + bool strict_order) +{ + uint16_t head; + VRingMemoryRegionCaches *caches; + VRingPackedDesc desc = { + .id = elem->index, + .len = elem->len, + }; + bool wrap_counter = vq->used_wrap_counter; + + if (unlikely(!vq->vring.desc)) { + return; + } + + head = vq->used_idx + idx; + if (head >= vq->vring.num) { + head -= vq->vring.num; + wrap_counter ^= 1; + } + if (wrap_counter) { + desc.flags |= (1 << VRING_PACKED_DESC_F_AVAIL); + desc.flags |= (1 << VRING_PACKED_DESC_F_USED); + } else { + desc.flags &= ~(1 << VRING_PACKED_DESC_F_AVAIL); + desc.flags &= ~(1 << VRING_PACKED_DESC_F_USED); + } + + caches = vring_get_region_caches(vq); + if (!caches) { + return; + } + + vring_packed_desc_write(vq->vdev, &desc, &caches->desc, head, strict_order); +} + +/* Called within rcu_read_lock(). */ +void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, + unsigned int len, unsigned int idx) +{ + trace_virtqueue_fill(vq, elem, len, idx); + + virtqueue_unmap_sg(vq, elem, len); + + if (virtio_device_disabled(vq->vdev)) { + return; + } + + if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) { + virtqueue_packed_fill(vq, elem, len, idx); + } else { + virtqueue_split_fill(vq, elem, len, idx); + } +} + +/* Called within rcu_read_lock(). */ +static void virtqueue_split_flush(VirtQueue *vq, unsigned int count) +{ + uint16_t old, new; + + if (unlikely(!vq->vring.used)) { + return; + } + + /* Make sure buffer is written before we update index. */ + smp_wmb(); + trace_virtqueue_flush(vq, count); + old = vq->used_idx; + new = old + count; + vring_used_idx_set(vq, new); + vq->inuse -= count; + if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) + vq->signalled_used_valid = false; +} + +static void virtqueue_packed_flush(VirtQueue *vq, unsigned int count) +{ + unsigned int i, ndescs = 0; + + if (unlikely(!vq->vring.desc)) { + return; + } + + for (i = 1; i < count; i++) { + virtqueue_packed_fill_desc(vq, &vq->used_elems[i], i, false); + ndescs += vq->used_elems[i].ndescs; + } + virtqueue_packed_fill_desc(vq, &vq->used_elems[0], 0, true); + ndescs += vq->used_elems[0].ndescs; + + vq->inuse -= ndescs; + vq->used_idx += ndescs; + if (vq->used_idx >= vq->vring.num) { + vq->used_idx -= vq->vring.num; + vq->used_wrap_counter ^= 1; + vq->signalled_used_valid = false; + } +} + +void virtqueue_flush(VirtQueue *vq, unsigned int count) +{ + if (virtio_device_disabled(vq->vdev)) { + vq->inuse -= count; + return; + } + + if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) { + virtqueue_packed_flush(vq, count); + } else { + virtqueue_split_flush(vq, count); + } +} + +void virtqueue_push(VirtQueue *vq, const VirtQueueElement *elem, + unsigned int len) +{ + RCU_READ_LOCK_GUARD(); + virtqueue_fill(vq, elem, len, 0); + virtqueue_flush(vq, 1); +} + +/* Called within rcu_read_lock(). */ +static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx) +{ + uint16_t num_heads = vring_avail_idx(vq) - idx; + + /* Check it isn't doing very strange things with descriptor numbers. */ + if (num_heads > vq->vring.num) { + virtio_error(vq->vdev, "Guest moved used index from %u to %u", + idx, vq->shadow_avail_idx); + return -EINVAL; + } + /* On success, callers read a descriptor at vq->last_avail_idx. + * Make sure descriptor read does not bypass avail index read. */ + if (num_heads) { + smp_rmb(); + } + + return num_heads; +} + +/* Called within rcu_read_lock(). */ +static bool virtqueue_get_head(VirtQueue *vq, unsigned int idx, + unsigned int *head) +{ + /* Grab the next descriptor number they're advertising, and increment + * the index we've seen. */ + *head = vring_avail_ring(vq, idx % vq->vring.num); + + /* If their number is silly, that's a fatal mistake. */ + if (*head >= vq->vring.num) { + virtio_error(vq->vdev, "Guest says index %u is available", *head); + return false; + } + + return true; +} + +enum { + VIRTQUEUE_READ_DESC_ERROR = -1, + VIRTQUEUE_READ_DESC_DONE = 0, /* end of chain */ + VIRTQUEUE_READ_DESC_MORE = 1, /* more buffers in chain */ +}; + +static int virtqueue_split_read_next_desc(VirtIODevice *vdev, VRingDesc *desc, + MemoryRegionCache *desc_cache, + unsigned int max, unsigned int *next) +{ + /* If this descriptor says it doesn't chain, we're done. */ + if (!(desc->flags & VRING_DESC_F_NEXT)) { + return VIRTQUEUE_READ_DESC_DONE; + } + + /* Check they're not leading us off end of descriptors. */ + *next = desc->next; + /* Make sure compiler knows to grab that: we don't want it changing! */ + smp_wmb(); + + if (*next >= max) { + virtio_error(vdev, "Desc next is %u", *next); + return VIRTQUEUE_READ_DESC_ERROR; + } + + vring_split_desc_read(vdev, desc, desc_cache, *next); + return VIRTQUEUE_READ_DESC_MORE; +} + +/* Called within rcu_read_lock(). */ +static void virtqueue_split_get_avail_bytes(VirtQueue *vq, + unsigned int *in_bytes, unsigned int *out_bytes, + unsigned max_in_bytes, unsigned max_out_bytes, + VRingMemoryRegionCaches *caches) +{ + VirtIODevice *vdev = vq->vdev; + unsigned int max, idx; + unsigned int total_bufs, in_total, out_total; + MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID; + int64_t len = 0; + int rc; + + idx = vq->last_avail_idx; + total_bufs = in_total = out_total = 0; + + max = vq->vring.num; + + while ((rc = virtqueue_num_heads(vq, idx)) > 0) { + MemoryRegionCache *desc_cache = &caches->desc; + unsigned int num_bufs; + VRingDesc desc; + unsigned int i; + + num_bufs = total_bufs; + + if (!virtqueue_get_head(vq, idx++, &i)) { + goto err; + } + + vring_split_desc_read(vdev, &desc, desc_cache, i); + + if (desc.flags & VRING_DESC_F_INDIRECT) { + if (!desc.len || (desc.len % sizeof(VRingDesc))) { + virtio_error(vdev, "Invalid size for indirect buffer table"); + goto err; + } + + /* If we've got too many, that implies a descriptor loop. */ + if (num_bufs >= max) { + virtio_error(vdev, "Looped descriptor"); + goto err; + } + + /* loop over the indirect descriptor table */ + len = address_space_cache_init(&indirect_desc_cache, + vdev->dma_as, + desc.addr, desc.len, false); + desc_cache = &indirect_desc_cache; + if (len < desc.len) { + virtio_error(vdev, "Cannot map indirect buffer"); + goto err; + } + + max = desc.len / sizeof(VRingDesc); + num_bufs = i = 0; + vring_split_desc_read(vdev, &desc, desc_cache, i); + } + + do { + /* If we've got too many, that implies a descriptor loop. */ + if (++num_bufs > max) { + virtio_error(vdev, "Looped descriptor"); + goto err; + } + + if (desc.flags & VRING_DESC_F_WRITE) { + in_total += desc.len; + } else { + out_total += desc.len; + } + if (in_total >= max_in_bytes && out_total >= max_out_bytes) { + goto done; + } + + rc = virtqueue_split_read_next_desc(vdev, &desc, desc_cache, max, &i); + } while (rc == VIRTQUEUE_READ_DESC_MORE); + + if (rc == VIRTQUEUE_READ_DESC_ERROR) { + goto err; + } + + if (desc_cache == &indirect_desc_cache) { + address_space_cache_destroy(&indirect_desc_cache); + total_bufs++; + } else { + total_bufs = num_bufs; + } + } + + if (rc < 0) { + goto err; + } + +done: + address_space_cache_destroy(&indirect_desc_cache); + if (in_bytes) { + *in_bytes = in_total; + } + if (out_bytes) { + *out_bytes = out_total; + } + return; + +err: + in_total = out_total = 0; + goto done; +} + +static int virtqueue_packed_read_next_desc(VirtQueue *vq, + VRingPackedDesc *desc, + MemoryRegionCache + *desc_cache, + unsigned int max, + unsigned int *next, + bool indirect) +{ + /* If this descriptor says it doesn't chain, we're done. */ + if (!indirect && !(desc->flags & VRING_DESC_F_NEXT)) { + return VIRTQUEUE_READ_DESC_DONE; + } + + ++*next; + if (*next == max) { + if (indirect) { + return VIRTQUEUE_READ_DESC_DONE; + } else { + (*next) -= vq->vring.num; + } + } + + vring_packed_desc_read(vq->vdev, desc, desc_cache, *next, false); + return VIRTQUEUE_READ_DESC_MORE; +} + +/* Called within rcu_read_lock(). */ +static void virtqueue_packed_get_avail_bytes(VirtQueue *vq, + unsigned int *in_bytes, + unsigned int *out_bytes, + unsigned max_in_bytes, + unsigned max_out_bytes, + VRingMemoryRegionCaches *caches) +{ + VirtIODevice *vdev = vq->vdev; + unsigned int max, idx; + unsigned int total_bufs, in_total, out_total; + MemoryRegionCache *desc_cache; + MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID; + int64_t len = 0; + VRingPackedDesc desc; + bool wrap_counter; + + idx = vq->last_avail_idx; + wrap_counter = vq->last_avail_wrap_counter; + total_bufs = in_total = out_total = 0; + + max = vq->vring.num; + + for (;;) { + unsigned int num_bufs = total_bufs; + unsigned int i = idx; + int rc; + + desc_cache = &caches->desc; + vring_packed_desc_read(vdev, &desc, desc_cache, idx, true); + if (!is_desc_avail(desc.flags, wrap_counter)) { + break; + } + + if (desc.flags & VRING_DESC_F_INDIRECT) { + if (desc.len % sizeof(VRingPackedDesc)) { + virtio_error(vdev, "Invalid size for indirect buffer table"); + goto err; + } + + /* If we've got too many, that implies a descriptor loop. */ + if (num_bufs >= max) { + virtio_error(vdev, "Looped descriptor"); + goto err; + } + + /* loop over the indirect descriptor table */ + len = address_space_cache_init(&indirect_desc_cache, + vdev->dma_as, + desc.addr, desc.len, false); + desc_cache = &indirect_desc_cache; + if (len < desc.len) { + virtio_error(vdev, "Cannot map indirect buffer"); + goto err; + } + + max = desc.len / sizeof(VRingPackedDesc); + num_bufs = i = 0; + vring_packed_desc_read(vdev, &desc, desc_cache, i, false); + } + + do { + /* If we've got too many, that implies a descriptor loop. */ + if (++num_bufs > max) { + virtio_error(vdev, "Looped descriptor"); + goto err; + } + + if (desc.flags & VRING_DESC_F_WRITE) { + in_total += desc.len; + } else { + out_total += desc.len; + } + if (in_total >= max_in_bytes && out_total >= max_out_bytes) { + goto done; + } + + rc = virtqueue_packed_read_next_desc(vq, &desc, desc_cache, max, + &i, desc_cache == + &indirect_desc_cache); + } while (rc == VIRTQUEUE_READ_DESC_MORE); + + if (desc_cache == &indirect_desc_cache) { + address_space_cache_destroy(&indirect_desc_cache); + total_bufs++; + idx++; + } else { + idx += num_bufs - total_bufs; + total_bufs = num_bufs; + } + + if (idx >= vq->vring.num) { + idx -= vq->vring.num; + wrap_counter ^= 1; + } + } + + /* Record the index and wrap counter for a kick we want */ + vq->shadow_avail_idx = idx; + vq->shadow_avail_wrap_counter = wrap_counter; +done: + address_space_cache_destroy(&indirect_desc_cache); + if (in_bytes) { + *in_bytes = in_total; + } + if (out_bytes) { + *out_bytes = out_total; + } + return; + +err: + in_total = out_total = 0; + goto done; +} + +void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, + unsigned int *out_bytes, + unsigned max_in_bytes, unsigned max_out_bytes) +{ + uint16_t desc_size; + VRingMemoryRegionCaches *caches; + + RCU_READ_LOCK_GUARD(); + + if (unlikely(!vq->vring.desc)) { + goto err; + } + + caches = vring_get_region_caches(vq); + if (!caches) { + goto err; + } + + desc_size = virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED) ? + sizeof(VRingPackedDesc) : sizeof(VRingDesc); + if (caches->desc.len < vq->vring.num * desc_size) { + virtio_error(vq->vdev, "Cannot map descriptor ring"); + goto err; + } + + if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) { + virtqueue_packed_get_avail_bytes(vq, in_bytes, out_bytes, + max_in_bytes, max_out_bytes, + caches); + } else { + virtqueue_split_get_avail_bytes(vq, in_bytes, out_bytes, + max_in_bytes, max_out_bytes, + caches); + } + + return; +err: + if (in_bytes) { + *in_bytes = 0; + } + if (out_bytes) { + *out_bytes = 0; + } +} + +int virtqueue_avail_bytes(VirtQueue *vq, unsigned int in_bytes, + unsigned int out_bytes) +{ + unsigned int in_total, out_total; + + virtqueue_get_avail_bytes(vq, &in_total, &out_total, in_bytes, out_bytes); + return in_bytes <= in_total && out_bytes <= out_total; +} + +static bool virtqueue_map_desc(VirtIODevice *vdev, unsigned int *p_num_sg, + hwaddr *addr, struct iovec *iov, + unsigned int max_num_sg, bool is_write, + hwaddr pa, size_t sz) +{ + bool ok = false; + unsigned num_sg = *p_num_sg; + assert(num_sg <= max_num_sg); + + if (!sz) { + virtio_error(vdev, "virtio: zero sized buffers are not allowed"); + goto out; + } + + while (sz) { + hwaddr len = sz; + + if (num_sg == max_num_sg) { + virtio_error(vdev, "virtio: too many write descriptors in " + "indirect table"); + goto out; + } + + iov[num_sg].iov_base = dma_memory_map(vdev->dma_as, pa, &len, + is_write ? + DMA_DIRECTION_FROM_DEVICE : + DMA_DIRECTION_TO_DEVICE, + MEMTXATTRS_UNSPECIFIED); + if (!iov[num_sg].iov_base) { + virtio_error(vdev, "virtio: bogus descriptor or out of resources"); + goto out; + } + + iov[num_sg].iov_len = len; + addr[num_sg] = pa; + + sz -= len; + pa += len; + num_sg++; + } + ok = true; + +out: + *p_num_sg = num_sg; + return ok; +} + +/* Only used by error code paths before we have a VirtQueueElement (therefore + * virtqueue_unmap_sg() can't be used). Assumes buffers weren't written to + * yet. + */ +static void virtqueue_undo_map_desc(unsigned int out_num, unsigned int in_num, + struct iovec *iov) +{ + unsigned int i; + + for (i = 0; i < out_num + in_num; i++) { + int is_write = i >= out_num; + + cpu_physical_memory_unmap(iov->iov_base, iov->iov_len, is_write, 0); + iov++; + } +} + +static void virtqueue_map_iovec(VirtIODevice *vdev, struct iovec *sg, + hwaddr *addr, unsigned int num_sg, + bool is_write) +{ + unsigned int i; + hwaddr len; + + for (i = 0; i < num_sg; i++) { + len = sg[i].iov_len; + sg[i].iov_base = dma_memory_map(vdev->dma_as, + addr[i], &len, is_write ? + DMA_DIRECTION_FROM_DEVICE : + DMA_DIRECTION_TO_DEVICE, + MEMTXATTRS_UNSPECIFIED); + if (!sg[i].iov_base) { + error_report("virtio: error trying to map MMIO memory"); + exit(1); + } + if (len != sg[i].iov_len) { + error_report("virtio: unexpected memory split"); + exit(1); + } + } +} + +void virtqueue_map(VirtIODevice *vdev, VirtQueueElement *elem) +{ + virtqueue_map_iovec(vdev, elem->in_sg, elem->in_addr, elem->in_num, true); + virtqueue_map_iovec(vdev, elem->out_sg, elem->out_addr, elem->out_num, + false); +} + +static void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_num) +{ + VirtQueueElement *elem; + size_t in_addr_ofs = QEMU_ALIGN_UP(sz, __alignof__(elem->in_addr[0])); + size_t out_addr_ofs = in_addr_ofs + in_num * sizeof(elem->in_addr[0]); + size_t out_addr_end = out_addr_ofs + out_num * sizeof(elem->out_addr[0]); + size_t in_sg_ofs = QEMU_ALIGN_UP(out_addr_end, __alignof__(elem->in_sg[0])); + size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]); + size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]); + + assert(sz >= sizeof(VirtQueueElement)); + elem = g_malloc(out_sg_end); + trace_virtqueue_alloc_element(elem, sz, in_num, out_num); + elem->out_num = out_num; + elem->in_num = in_num; + elem->in_addr = (void *)elem + in_addr_ofs; + elem->out_addr = (void *)elem + out_addr_ofs; + elem->in_sg = (void *)elem + in_sg_ofs; + elem->out_sg = (void *)elem + out_sg_ofs; + return elem; +} + +static void *virtqueue_split_pop(VirtQueue *vq, size_t sz) +{ + unsigned int i, head, max; + VRingMemoryRegionCaches *caches; + MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID; + MemoryRegionCache *desc_cache; + int64_t len; + VirtIODevice *vdev = vq->vdev; + VirtQueueElement *elem = NULL; + unsigned out_num, in_num, elem_entries; + hwaddr addr[VIRTQUEUE_MAX_SIZE]; + struct iovec iov[VIRTQUEUE_MAX_SIZE]; + VRingDesc desc; + int rc; + + RCU_READ_LOCK_GUARD(); + if (virtio_queue_empty_rcu(vq)) { + goto done; + } + /* Needed after virtio_queue_empty(), see comment in + * virtqueue_num_heads(). */ + smp_rmb(); + + /* When we start there are none of either input nor output. */ + out_num = in_num = elem_entries = 0; + + max = vq->vring.num; + + if (vq->inuse >= vq->vring.num) { + virtio_error(vdev, "Virtqueue size exceeded"); + goto done; + } + + if (!virtqueue_get_head(vq, vq->last_avail_idx++, &head)) { + goto done; + } + + if (virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) { + vring_set_avail_event(vq, vq->last_avail_idx); + } + + i = head; + + caches = vring_get_region_caches(vq); + if (!caches) { + virtio_error(vdev, "Region caches not initialized"); + goto done; + } + + if (caches->desc.len < max * sizeof(VRingDesc)) { + virtio_error(vdev, "Cannot map descriptor ring"); + goto done; + } + + desc_cache = &caches->desc; + vring_split_desc_read(vdev, &desc, desc_cache, i); + if (desc.flags & VRING_DESC_F_INDIRECT) { + if (!desc.len || (desc.len % sizeof(VRingDesc))) { + virtio_error(vdev, "Invalid size for indirect buffer table"); + goto done; + } + + /* loop over the indirect descriptor table */ + len = address_space_cache_init(&indirect_desc_cache, vdev->dma_as, + desc.addr, desc.len, false); + desc_cache = &indirect_desc_cache; + if (len < desc.len) { + virtio_error(vdev, "Cannot map indirect buffer"); + goto done; + } + + max = desc.len / sizeof(VRingDesc); + i = 0; + vring_split_desc_read(vdev, &desc, desc_cache, i); + } + + /* Collect all the descriptors */ + do { + bool map_ok; + + if (desc.flags & VRING_DESC_F_WRITE) { + map_ok = virtqueue_map_desc(vdev, &in_num, addr + out_num, + iov + out_num, + VIRTQUEUE_MAX_SIZE - out_num, true, + desc.addr, desc.len); + } else { + if (in_num) { + virtio_error(vdev, "Incorrect order for descriptors"); + goto err_undo_map; + } + map_ok = virtqueue_map_desc(vdev, &out_num, addr, iov, + VIRTQUEUE_MAX_SIZE, false, + desc.addr, desc.len); + } + if (!map_ok) { + goto err_undo_map; + } + + /* If we've got too many, that implies a descriptor loop. */ + if (++elem_entries > max) { + virtio_error(vdev, "Looped descriptor"); + goto err_undo_map; + } + + rc = virtqueue_split_read_next_desc(vdev, &desc, desc_cache, max, &i); + } while (rc == VIRTQUEUE_READ_DESC_MORE); + + if (rc == VIRTQUEUE_READ_DESC_ERROR) { + goto err_undo_map; + } + + /* Now copy what we have collected and mapped */ + elem = virtqueue_alloc_element(sz, out_num, in_num); + elem->index = head; + elem->ndescs = 1; + for (i = 0; i < out_num; i++) { + elem->out_addr[i] = addr[i]; + elem->out_sg[i] = iov[i]; + } + for (i = 0; i < in_num; i++) { + elem->in_addr[i] = addr[out_num + i]; + elem->in_sg[i] = iov[out_num + i]; + } + + vq->inuse++; + + trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num); +done: + address_space_cache_destroy(&indirect_desc_cache); + + return elem; + +err_undo_map: + virtqueue_undo_map_desc(out_num, in_num, iov); + goto done; +} + +static void *virtqueue_packed_pop(VirtQueue *vq, size_t sz) +{ + unsigned int i, max; + VRingMemoryRegionCaches *caches; + MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID; + MemoryRegionCache *desc_cache; + int64_t len; + VirtIODevice *vdev = vq->vdev; + VirtQueueElement *elem = NULL; + unsigned out_num, in_num, elem_entries; + hwaddr addr[VIRTQUEUE_MAX_SIZE]; + struct iovec iov[VIRTQUEUE_MAX_SIZE]; + VRingPackedDesc desc; + uint16_t id; + int rc; + + RCU_READ_LOCK_GUARD(); + if (virtio_queue_packed_empty_rcu(vq)) { + goto done; + } + + /* When we start there are none of either input nor output. */ + out_num = in_num = elem_entries = 0; + + max = vq->vring.num; + + if (vq->inuse >= vq->vring.num) { + virtio_error(vdev, "Virtqueue size exceeded"); + goto done; + } + + i = vq->last_avail_idx; + + caches = vring_get_region_caches(vq); + if (!caches) { + virtio_error(vdev, "Region caches not initialized"); + goto done; + } + + if (caches->desc.len < max * sizeof(VRingDesc)) { + virtio_error(vdev, "Cannot map descriptor ring"); + goto done; + } + + desc_cache = &caches->desc; + vring_packed_desc_read(vdev, &desc, desc_cache, i, true); + id = desc.id; + if (desc.flags & VRING_DESC_F_INDIRECT) { + if (desc.len % sizeof(VRingPackedDesc)) { + virtio_error(vdev, "Invalid size for indirect buffer table"); + goto done; + } + + /* loop over the indirect descriptor table */ + len = address_space_cache_init(&indirect_desc_cache, vdev->dma_as, + desc.addr, desc.len, false); + desc_cache = &indirect_desc_cache; + if (len < desc.len) { + virtio_error(vdev, "Cannot map indirect buffer"); + goto done; + } + + max = desc.len / sizeof(VRingPackedDesc); + i = 0; + vring_packed_desc_read(vdev, &desc, desc_cache, i, false); + } + + /* Collect all the descriptors */ + do { + bool map_ok; + + if (desc.flags & VRING_DESC_F_WRITE) { + map_ok = virtqueue_map_desc(vdev, &in_num, addr + out_num, + iov + out_num, + VIRTQUEUE_MAX_SIZE - out_num, true, + desc.addr, desc.len); + } else { + if (in_num) { + virtio_error(vdev, "Incorrect order for descriptors"); + goto err_undo_map; + } + map_ok = virtqueue_map_desc(vdev, &out_num, addr, iov, + VIRTQUEUE_MAX_SIZE, false, + desc.addr, desc.len); + } + if (!map_ok) { + goto err_undo_map; + } + + /* If we've got too many, that implies a descriptor loop. */ + if (++elem_entries > max) { + virtio_error(vdev, "Looped descriptor"); + goto err_undo_map; + } + + rc = virtqueue_packed_read_next_desc(vq, &desc, desc_cache, max, &i, + desc_cache == + &indirect_desc_cache); + } while (rc == VIRTQUEUE_READ_DESC_MORE); + + /* Now copy what we have collected and mapped */ + elem = virtqueue_alloc_element(sz, out_num, in_num); + for (i = 0; i < out_num; i++) { + elem->out_addr[i] = addr[i]; + elem->out_sg[i] = iov[i]; + } + for (i = 0; i < in_num; i++) { + elem->in_addr[i] = addr[out_num + i]; + elem->in_sg[i] = iov[out_num + i]; + } + + elem->index = id; + elem->ndescs = (desc_cache == &indirect_desc_cache) ? 1 : elem_entries; + vq->last_avail_idx += elem->ndescs; + vq->inuse += elem->ndescs; + + if (vq->last_avail_idx >= vq->vring.num) { + vq->last_avail_idx -= vq->vring.num; + vq->last_avail_wrap_counter ^= 1; + } + + vq->shadow_avail_idx = vq->last_avail_idx; + vq->shadow_avail_wrap_counter = vq->last_avail_wrap_counter; + + trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num); +done: + address_space_cache_destroy(&indirect_desc_cache); + + return elem; + +err_undo_map: + virtqueue_undo_map_desc(out_num, in_num, iov); + goto done; +} + +void *virtqueue_pop(VirtQueue *vq, size_t sz) +{ + if (virtio_device_disabled(vq->vdev)) { + return NULL; + } + + if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) { + return virtqueue_packed_pop(vq, sz); + } else { + return virtqueue_split_pop(vq, sz); + } +} + +static unsigned int virtqueue_packed_drop_all(VirtQueue *vq) +{ + VRingMemoryRegionCaches *caches; + MemoryRegionCache *desc_cache; + unsigned int dropped = 0; + VirtQueueElement elem = {}; + VirtIODevice *vdev = vq->vdev; + VRingPackedDesc desc; + + RCU_READ_LOCK_GUARD(); + + caches = vring_get_region_caches(vq); + if (!caches) { + return 0; + } + + desc_cache = &caches->desc; + + virtio_queue_set_notification(vq, 0); + + while (vq->inuse < vq->vring.num) { + unsigned int idx = vq->last_avail_idx; + /* + * works similar to virtqueue_pop but does not map buffers + * and does not allocate any memory. + */ + vring_packed_desc_read(vdev, &desc, desc_cache, + vq->last_avail_idx , true); + if (!is_desc_avail(desc.flags, vq->last_avail_wrap_counter)) { + break; + } + elem.index = desc.id; + elem.ndescs = 1; + while (virtqueue_packed_read_next_desc(vq, &desc, desc_cache, + vq->vring.num, &idx, false)) { + ++elem.ndescs; + } + /* + * immediately push the element, nothing to unmap + * as both in_num and out_num are set to 0. + */ + virtqueue_push(vq, &elem, 0); + dropped++; + vq->last_avail_idx += elem.ndescs; + if (vq->last_avail_idx >= vq->vring.num) { + vq->last_avail_idx -= vq->vring.num; + vq->last_avail_wrap_counter ^= 1; + } + } + + return dropped; +} + +static unsigned int virtqueue_split_drop_all(VirtQueue *vq) +{ + unsigned int dropped = 0; + VirtQueueElement elem = {}; + VirtIODevice *vdev = vq->vdev; + bool fEventIdx = virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX); + + while (!virtio_queue_empty(vq) && vq->inuse < vq->vring.num) { + /* works similar to virtqueue_pop but does not map buffers + * and does not allocate any memory */ + smp_rmb(); + if (!virtqueue_get_head(vq, vq->last_avail_idx, &elem.index)) { + break; + } + vq->inuse++; + vq->last_avail_idx++; + if (fEventIdx) { + vring_set_avail_event(vq, vq->last_avail_idx); + } + /* immediately push the element, nothing to unmap + * as both in_num and out_num are set to 0 */ + virtqueue_push(vq, &elem, 0); + dropped++; + } + + return dropped; +} + +/* virtqueue_drop_all: + * @vq: The #VirtQueue + * Drops all queued buffers and indicates them to the guest + * as if they are done. Useful when buffers can not be + * processed but must be returned to the guest. + */ +unsigned int virtqueue_drop_all(VirtQueue *vq) +{ + struct VirtIODevice *vdev = vq->vdev; + + if (virtio_device_disabled(vq->vdev)) { + return 0; + } + + if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) { + return virtqueue_packed_drop_all(vq); + } else { + return virtqueue_split_drop_all(vq); + } +} + +/* Reading and writing a structure directly to QEMUFile is *awful*, but + * it is what QEMU has always done by mistake. We can change it sooner + * or later by bumping the version number of the affected vm states. + * In the meanwhile, since the in-memory layout of VirtQueueElement + * has changed, we need to marshal to and from the layout that was + * used before the change. + */ +typedef struct VirtQueueElementOld { + unsigned int index; + unsigned int out_num; + unsigned int in_num; + hwaddr in_addr[VIRTQUEUE_MAX_SIZE]; + hwaddr out_addr[VIRTQUEUE_MAX_SIZE]; + struct iovec in_sg[VIRTQUEUE_MAX_SIZE]; + struct iovec out_sg[VIRTQUEUE_MAX_SIZE]; +} VirtQueueElementOld; + +void *qemu_get_virtqueue_element(VirtIODevice *vdev, QEMUFile *f, size_t sz) +{ + VirtQueueElement *elem; + VirtQueueElementOld data; + int i; + + qemu_get_buffer(f, (uint8_t *)&data, sizeof(VirtQueueElementOld)); + + /* TODO: teach all callers that this can fail, and return failure instead + * of asserting here. + * This is just one thing (there are probably more) that must be + * fixed before we can allow NDEBUG compilation. + */ + assert(ARRAY_SIZE(data.in_addr) >= data.in_num); + assert(ARRAY_SIZE(data.out_addr) >= data.out_num); + + elem = virtqueue_alloc_element(sz, data.out_num, data.in_num); + elem->index = data.index; + + for (i = 0; i < elem->in_num; i++) { + elem->in_addr[i] = data.in_addr[i]; + } + + for (i = 0; i < elem->out_num; i++) { + elem->out_addr[i] = data.out_addr[i]; + } + + for (i = 0; i < elem->in_num; i++) { + /* Base is overwritten by virtqueue_map. */ + elem->in_sg[i].iov_base = 0; + elem->in_sg[i].iov_len = data.in_sg[i].iov_len; + } + + for (i = 0; i < elem->out_num; i++) { + /* Base is overwritten by virtqueue_map. */ + elem->out_sg[i].iov_base = 0; + elem->out_sg[i].iov_len = data.out_sg[i].iov_len; + } + + if (virtio_host_has_feature(vdev, VIRTIO_F_RING_PACKED)) { + qemu_get_be32s(f, &elem->ndescs); + } + + virtqueue_map(vdev, elem); + return elem; +} + +void qemu_put_virtqueue_element(VirtIODevice *vdev, QEMUFile *f, + VirtQueueElement *elem) +{ + VirtQueueElementOld data; + int i; + + memset(&data, 0, sizeof(data)); + data.index = elem->index; + data.in_num = elem->in_num; + data.out_num = elem->out_num; + + for (i = 0; i < elem->in_num; i++) { + data.in_addr[i] = elem->in_addr[i]; + } + + for (i = 0; i < elem->out_num; i++) { + data.out_addr[i] = elem->out_addr[i]; + } + + for (i = 0; i < elem->in_num; i++) { + /* Base is overwritten by virtqueue_map when loading. Do not + * save it, as it would leak the QEMU address space layout. */ + data.in_sg[i].iov_len = elem->in_sg[i].iov_len; + } + + for (i = 0; i < elem->out_num; i++) { + /* Do not save iov_base as above. */ + data.out_sg[i].iov_len = elem->out_sg[i].iov_len; + } + + if (virtio_host_has_feature(vdev, VIRTIO_F_RING_PACKED)) { + qemu_put_be32s(f, &elem->ndescs); + } + + qemu_put_buffer(f, (uint8_t *)&data, sizeof(VirtQueueElementOld)); +} + +/* virtio device */ +static void virtio_notify_vector(VirtIODevice *vdev, uint16_t vector) +{ + BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + + if (virtio_device_disabled(vdev)) { + return; + } + + if (k->notify) { + k->notify(qbus->parent, vector); + } +} + +void virtio_update_irq(VirtIODevice *vdev) +{ + virtio_notify_vector(vdev, VIRTIO_NO_VECTOR); +} + +static int virtio_validate_features(VirtIODevice *vdev) +{ + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + + if (virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM) && + !virtio_vdev_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM)) { + return -EFAULT; + } + + if (k->validate_features) { + return k->validate_features(vdev); + } else { + return 0; + } +} + +int virtio_set_status(VirtIODevice *vdev, uint8_t val) +{ + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + trace_virtio_set_status(vdev, val); + + if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { + if (!(vdev->status & VIRTIO_CONFIG_S_FEATURES_OK) && + val & VIRTIO_CONFIG_S_FEATURES_OK) { + int ret = virtio_validate_features(vdev); + + if (ret) { + return ret; + } + } + } + + if ((vdev->status & VIRTIO_CONFIG_S_DRIVER_OK) != + (val & VIRTIO_CONFIG_S_DRIVER_OK)) { + virtio_set_started(vdev, val & VIRTIO_CONFIG_S_DRIVER_OK); + } + + if (k->set_status) { + k->set_status(vdev, val); + } + vdev->status = val; + + return 0; +} + +static enum virtio_device_endian virtio_default_endian(void) +{ + if (target_words_bigendian()) { + return VIRTIO_DEVICE_ENDIAN_BIG; + } else { + return VIRTIO_DEVICE_ENDIAN_LITTLE; + } +} + +static enum virtio_device_endian virtio_current_cpu_endian(void) +{ + if (cpu_virtio_is_big_endian(current_cpu)) { + return VIRTIO_DEVICE_ENDIAN_BIG; + } else { + return VIRTIO_DEVICE_ENDIAN_LITTLE; + } +} + +static void __virtio_queue_reset(VirtIODevice *vdev, uint32_t i) +{ + vdev->vq[i].vring.desc = 0; + vdev->vq[i].vring.avail = 0; + vdev->vq[i].vring.used = 0; + vdev->vq[i].last_avail_idx = 0; + vdev->vq[i].shadow_avail_idx = 0; + vdev->vq[i].used_idx = 0; + vdev->vq[i].last_avail_wrap_counter = true; + vdev->vq[i].shadow_avail_wrap_counter = true; + vdev->vq[i].used_wrap_counter = true; + virtio_queue_set_vector(vdev, i, VIRTIO_NO_VECTOR); + vdev->vq[i].signalled_used = 0; + vdev->vq[i].signalled_used_valid = false; + vdev->vq[i].notification = true; + vdev->vq[i].vring.num = vdev->vq[i].vring.num_default; + vdev->vq[i].inuse = 0; + virtio_virtqueue_reset_region_cache(&vdev->vq[i]); +} + +void virtio_queue_reset(VirtIODevice *vdev, uint32_t queue_index) +{ + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + + if (k->queue_reset) { + k->queue_reset(vdev, queue_index); + } + + __virtio_queue_reset(vdev, queue_index); +} + +void virtio_queue_enable(VirtIODevice *vdev, uint32_t queue_index) +{ + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + + /* + * TODO: Seabios is currently out of spec and triggering this error. + * So this needs to be fixed in Seabios, then this can + * be re-enabled for new machine types only, and also after + * being converted to LOG_GUEST_ERROR. + * + if (!virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { + error_report("queue_enable is only suppported in devices of virtio " + "1.0 or later."); + } + */ + + if (k->queue_enable) { + k->queue_enable(vdev, queue_index); + } +} + +void virtio_reset(void *opaque) +{ + VirtIODevice *vdev = opaque; + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + int i; + + virtio_set_status(vdev, 0); + if (current_cpu) { + /* Guest initiated reset */ + vdev->device_endian = virtio_current_cpu_endian(); + } else { + /* System reset */ + vdev->device_endian = virtio_default_endian(); + } + + if (k->reset) { + k->reset(vdev); + } + + vdev->start_on_kick = false; + vdev->started = false; + vdev->broken = false; + vdev->guest_features = 0; + vdev->queue_sel = 0; + vdev->status = 0; + vdev->disabled = false; + qatomic_set(&vdev->isr, 0); + vdev->config_vector = VIRTIO_NO_VECTOR; + virtio_notify_vector(vdev, vdev->config_vector); + + for(i = 0; i < VIRTIO_QUEUE_MAX; i++) { + __virtio_queue_reset(vdev, i); + } +} + +uint32_t virtio_config_readb(VirtIODevice *vdev, uint32_t addr) +{ + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + uint8_t val; + + if (addr + sizeof(val) > vdev->config_len) { + return (uint32_t)-1; + } + + k->get_config(vdev, vdev->config); + + val = ldub_p(vdev->config + addr); + return val; +} + +uint32_t virtio_config_readw(VirtIODevice *vdev, uint32_t addr) +{ + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + uint16_t val; + + if (addr + sizeof(val) > vdev->config_len) { + return (uint32_t)-1; + } + + k->get_config(vdev, vdev->config); + + val = lduw_p(vdev->config + addr); + return val; +} + +uint32_t virtio_config_readl(VirtIODevice *vdev, uint32_t addr) +{ + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + uint32_t val; + + if (addr + sizeof(val) > vdev->config_len) { + return (uint32_t)-1; + } + + k->get_config(vdev, vdev->config); + + val = ldl_p(vdev->config + addr); + return val; +} + +void virtio_config_writeb(VirtIODevice *vdev, uint32_t addr, uint32_t data) +{ + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + uint8_t val = data; + + if (addr + sizeof(val) > vdev->config_len) { + return; + } + + stb_p(vdev->config + addr, val); + + if (k->set_config) { + k->set_config(vdev, vdev->config); + } +} + +void virtio_config_writew(VirtIODevice *vdev, uint32_t addr, uint32_t data) +{ + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + uint16_t val = data; + + if (addr + sizeof(val) > vdev->config_len) { + return; + } + + stw_p(vdev->config + addr, val); + + if (k->set_config) { + k->set_config(vdev, vdev->config); + } +} + +void virtio_config_writel(VirtIODevice *vdev, uint32_t addr, uint32_t data) +{ + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + uint32_t val = data; + + if (addr + sizeof(val) > vdev->config_len) { + return; + } + + stl_p(vdev->config + addr, val); + + if (k->set_config) { + k->set_config(vdev, vdev->config); + } +} + +uint32_t virtio_config_modern_readb(VirtIODevice *vdev, uint32_t addr) +{ + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + uint8_t val; + + if (addr + sizeof(val) > vdev->config_len) { + return (uint32_t)-1; + } + + k->get_config(vdev, vdev->config); + + val = ldub_p(vdev->config + addr); + return val; +} + +uint32_t virtio_config_modern_readw(VirtIODevice *vdev, uint32_t addr) +{ + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + uint16_t val; + + if (addr + sizeof(val) > vdev->config_len) { + return (uint32_t)-1; + } + + k->get_config(vdev, vdev->config); + + val = lduw_le_p(vdev->config + addr); + return val; +} + +uint32_t virtio_config_modern_readl(VirtIODevice *vdev, uint32_t addr) +{ + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + uint32_t val; + + if (addr + sizeof(val) > vdev->config_len) { + return (uint32_t)-1; + } + + k->get_config(vdev, vdev->config); + + val = ldl_le_p(vdev->config + addr); + return val; +} + +void virtio_config_modern_writeb(VirtIODevice *vdev, + uint32_t addr, uint32_t data) +{ + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + uint8_t val = data; + + if (addr + sizeof(val) > vdev->config_len) { + return; + } + + stb_p(vdev->config + addr, val); + + if (k->set_config) { + k->set_config(vdev, vdev->config); + } +} + +void virtio_config_modern_writew(VirtIODevice *vdev, + uint32_t addr, uint32_t data) +{ + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + uint16_t val = data; + + if (addr + sizeof(val) > vdev->config_len) { + return; + } + + stw_le_p(vdev->config + addr, val); + + if (k->set_config) { + k->set_config(vdev, vdev->config); + } +} + +void virtio_config_modern_writel(VirtIODevice *vdev, + uint32_t addr, uint32_t data) +{ + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + uint32_t val = data; + + if (addr + sizeof(val) > vdev->config_len) { + return; + } + + stl_le_p(vdev->config + addr, val); + + if (k->set_config) { + k->set_config(vdev, vdev->config); + } +} + +void virtio_queue_set_addr(VirtIODevice *vdev, int n, hwaddr addr) +{ + if (!vdev->vq[n].vring.num) { + return; + } + vdev->vq[n].vring.desc = addr; + virtio_queue_update_rings(vdev, n); +} + +hwaddr virtio_queue_get_addr(VirtIODevice *vdev, int n) +{ + return vdev->vq[n].vring.desc; +} + +void virtio_queue_set_rings(VirtIODevice *vdev, int n, hwaddr desc, + hwaddr avail, hwaddr used) +{ + if (!vdev->vq[n].vring.num) { + return; + } + vdev->vq[n].vring.desc = desc; + vdev->vq[n].vring.avail = avail; + vdev->vq[n].vring.used = used; + virtio_init_region_cache(vdev, n); +} + +void virtio_queue_set_num(VirtIODevice *vdev, int n, int num) +{ + /* Don't allow guest to flip queue between existent and + * nonexistent states, or to set it to an invalid size. + */ + if (!!num != !!vdev->vq[n].vring.num || + num > VIRTQUEUE_MAX_SIZE || + num < 0) { + return; + } + vdev->vq[n].vring.num = num; +} + +VirtQueue *virtio_vector_first_queue(VirtIODevice *vdev, uint16_t vector) +{ + return QLIST_FIRST(&vdev->vector_queues[vector]); +} + +VirtQueue *virtio_vector_next_queue(VirtQueue *vq) +{ + return QLIST_NEXT(vq, node); +} + +int virtio_queue_get_num(VirtIODevice *vdev, int n) +{ + return vdev->vq[n].vring.num; +} + +int virtio_queue_get_max_num(VirtIODevice *vdev, int n) +{ + return vdev->vq[n].vring.num_default; +} + +int virtio_get_num_queues(VirtIODevice *vdev) +{ + int i; + + for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { + if (!virtio_queue_get_num(vdev, i)) { + break; + } + } + + return i; +} + +void virtio_queue_set_align(VirtIODevice *vdev, int n, int align) +{ + BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + + /* virtio-1 compliant devices cannot change the alignment */ + if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { + error_report("tried to modify queue alignment for virtio-1 device"); + return; + } + /* Check that the transport told us it was going to do this + * (so a buggy transport will immediately assert rather than + * silently failing to migrate this state) + */ + assert(k->has_variable_vring_alignment); + + if (align) { + vdev->vq[n].vring.align = align; + virtio_queue_update_rings(vdev, n); + } +} + +static void virtio_queue_notify_vq(VirtQueue *vq) +{ + if (vq->vring.desc && vq->handle_output) { + VirtIODevice *vdev = vq->vdev; + + if (unlikely(vdev->broken)) { + return; + } + + trace_virtio_queue_notify(vdev, vq - vdev->vq, vq); + vq->handle_output(vdev, vq); + + if (unlikely(vdev->start_on_kick)) { + virtio_set_started(vdev, true); + } + } +} + +void virtio_queue_notify(VirtIODevice *vdev, int n) +{ + VirtQueue *vq = &vdev->vq[n]; + + if (unlikely(!vq->vring.desc || vdev->broken)) { + return; + } + + trace_virtio_queue_notify(vdev, vq - vdev->vq, vq); + if (vq->host_notifier_enabled) { + event_notifier_set(&vq->host_notifier); + } else if (vq->handle_output) { + vq->handle_output(vdev, vq); + + if (unlikely(vdev->start_on_kick)) { + virtio_set_started(vdev, true); + } + } +} + +uint16_t virtio_queue_vector(VirtIODevice *vdev, int n) +{ + return n < VIRTIO_QUEUE_MAX ? vdev->vq[n].vector : + VIRTIO_NO_VECTOR; +} + +void virtio_queue_set_vector(VirtIODevice *vdev, int n, uint16_t vector) +{ + VirtQueue *vq = &vdev->vq[n]; + + if (n < VIRTIO_QUEUE_MAX) { + if (vdev->vector_queues && + vdev->vq[n].vector != VIRTIO_NO_VECTOR) { + QLIST_REMOVE(vq, node); + } + vdev->vq[n].vector = vector; + if (vdev->vector_queues && + vector != VIRTIO_NO_VECTOR) { + QLIST_INSERT_HEAD(&vdev->vector_queues[vector], vq, node); + } + } +} + +VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size, + VirtIOHandleOutput handle_output) +{ + int i; + + for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { + if (vdev->vq[i].vring.num == 0) + break; + } + + if (i == VIRTIO_QUEUE_MAX || queue_size > VIRTQUEUE_MAX_SIZE) + abort(); + + vdev->vq[i].vring.num = queue_size; + vdev->vq[i].vring.num_default = queue_size; + vdev->vq[i].vring.align = VIRTIO_PCI_VRING_ALIGN; + vdev->vq[i].handle_output = handle_output; + vdev->vq[i].used_elems = g_new0(VirtQueueElement, queue_size); + + return &vdev->vq[i]; +} + +void virtio_delete_queue(VirtQueue *vq) +{ + vq->vring.num = 0; + vq->vring.num_default = 0; + vq->handle_output = NULL; + g_free(vq->used_elems); + vq->used_elems = NULL; + virtio_virtqueue_reset_region_cache(vq); +} + +void virtio_del_queue(VirtIODevice *vdev, int n) +{ + if (n < 0 || n >= VIRTIO_QUEUE_MAX) { + abort(); + } + + virtio_delete_queue(&vdev->vq[n]); +} + +static void virtio_set_isr(VirtIODevice *vdev, int value) +{ + uint8_t old = qatomic_read(&vdev->isr); + + /* Do not write ISR if it does not change, so that its cacheline remains + * shared in the common case where the guest does not read it. + */ + if ((old & value) != value) { + qatomic_or(&vdev->isr, value); + } +} + +/* Called within rcu_read_lock(). */ +static bool virtio_split_should_notify(VirtIODevice *vdev, VirtQueue *vq) +{ + uint16_t old, new; + bool v; + /* We need to expose used array entries before checking used event. */ + smp_mb(); + /* Always notify when queue is empty (when feature acknowledge) */ + if (virtio_vdev_has_feature(vdev, VIRTIO_F_NOTIFY_ON_EMPTY) && + !vq->inuse && virtio_queue_empty(vq)) { + return true; + } + + if (!virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) { + return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT); + } + + v = vq->signalled_used_valid; + vq->signalled_used_valid = true; + old = vq->signalled_used; + new = vq->signalled_used = vq->used_idx; + return !v || vring_need_event(vring_get_used_event(vq), new, old); +} + +static bool vring_packed_need_event(VirtQueue *vq, bool wrap, + uint16_t off_wrap, uint16_t new, + uint16_t old) +{ + int off = off_wrap & ~(1 << 15); + + if (wrap != off_wrap >> 15) { + off -= vq->vring.num; + } + + return vring_need_event(off, new, old); +} + +/* Called within rcu_read_lock(). */ +static bool virtio_packed_should_notify(VirtIODevice *vdev, VirtQueue *vq) +{ + VRingPackedDescEvent e; + uint16_t old, new; + bool v; + VRingMemoryRegionCaches *caches; + + caches = vring_get_region_caches(vq); + if (!caches) { + return false; + } + + vring_packed_event_read(vdev, &caches->avail, &e); + + old = vq->signalled_used; + new = vq->signalled_used = vq->used_idx; + v = vq->signalled_used_valid; + vq->signalled_used_valid = true; + + if (e.flags == VRING_PACKED_EVENT_FLAG_DISABLE) { + return false; + } else if (e.flags == VRING_PACKED_EVENT_FLAG_ENABLE) { + return true; + } + + return !v || vring_packed_need_event(vq, vq->used_wrap_counter, + e.off_wrap, new, old); +} + +/* Called within rcu_read_lock(). */ +static bool virtio_should_notify(VirtIODevice *vdev, VirtQueue *vq) +{ + if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) { + return virtio_packed_should_notify(vdev, vq); + } else { + return virtio_split_should_notify(vdev, vq); + } +} + +void virtio_notify_irqfd(VirtIODevice *vdev, VirtQueue *vq) +{ + WITH_RCU_READ_LOCK_GUARD() { + if (!virtio_should_notify(vdev, vq)) { + return; + } + } + + trace_virtio_notify_irqfd(vdev, vq); + + /* + * virtio spec 1.0 says ISR bit 0 should be ignored with MSI, but + * windows drivers included in virtio-win 1.8.0 (circa 2015) are + * incorrectly polling this bit during crashdump and hibernation + * in MSI mode, causing a hang if this bit is never updated. + * Recent releases of Windows do not really shut down, but rather + * log out and hibernate to make the next startup faster. Hence, + * this manifested as a more serious hang during shutdown with + * + * Next driver release from 2016 fixed this problem, so working around it + * is not a must, but it's easy to do so let's do it here. + * + * Note: it's safe to update ISR from any thread as it was switched + * to an atomic operation. + */ + virtio_set_isr(vq->vdev, 0x1); + event_notifier_set(&vq->guest_notifier); +} + +static void virtio_irq(VirtQueue *vq) +{ + virtio_set_isr(vq->vdev, 0x1); + virtio_notify_vector(vq->vdev, vq->vector); +} + +void virtio_notify(VirtIODevice *vdev, VirtQueue *vq) +{ + WITH_RCU_READ_LOCK_GUARD() { + if (!virtio_should_notify(vdev, vq)) { + return; + } + } + + trace_virtio_notify(vdev, vq); + virtio_irq(vq); +} + +void virtio_notify_config(VirtIODevice *vdev) +{ + if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) + return; + + virtio_set_isr(vdev, 0x3); + vdev->generation++; + virtio_notify_vector(vdev, vdev->config_vector); +} + +static bool virtio_device_endian_needed(void *opaque) +{ + VirtIODevice *vdev = opaque; + + assert(vdev->device_endian != VIRTIO_DEVICE_ENDIAN_UNKNOWN); + if (!virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { + return vdev->device_endian != virtio_default_endian(); + } + /* Devices conforming to VIRTIO 1.0 or later are always LE. */ + return vdev->device_endian != VIRTIO_DEVICE_ENDIAN_LITTLE; +} + +static bool virtio_64bit_features_needed(void *opaque) +{ + VirtIODevice *vdev = opaque; + + return (vdev->host_features >> 32) != 0; +} + +static bool virtio_virtqueue_needed(void *opaque) +{ + VirtIODevice *vdev = opaque; + + return virtio_host_has_feature(vdev, VIRTIO_F_VERSION_1); +} + +static bool virtio_packed_virtqueue_needed(void *opaque) +{ + VirtIODevice *vdev = opaque; + + return virtio_host_has_feature(vdev, VIRTIO_F_RING_PACKED); +} + +static bool virtio_ringsize_needed(void *opaque) +{ + VirtIODevice *vdev = opaque; + int i; + + for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { + if (vdev->vq[i].vring.num != vdev->vq[i].vring.num_default) { + return true; + } + } + return false; +} + +static bool virtio_extra_state_needed(void *opaque) +{ + VirtIODevice *vdev = opaque; + BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + + return k->has_extra_state && + k->has_extra_state(qbus->parent); +} + +static bool virtio_broken_needed(void *opaque) +{ + VirtIODevice *vdev = opaque; + + return vdev->broken; +} + +static bool virtio_started_needed(void *opaque) +{ + VirtIODevice *vdev = opaque; + + return vdev->started; +} + +static bool virtio_disabled_needed(void *opaque) +{ + VirtIODevice *vdev = opaque; + + return vdev->disabled; +} + +static const VMStateDescription vmstate_virtqueue = { + .name = "virtqueue_state", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT64(vring.avail, struct VirtQueue), + VMSTATE_UINT64(vring.used, struct VirtQueue), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_packed_virtqueue = { + .name = "packed_virtqueue_state", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT16(last_avail_idx, struct VirtQueue), + VMSTATE_BOOL(last_avail_wrap_counter, struct VirtQueue), + VMSTATE_UINT16(used_idx, struct VirtQueue), + VMSTATE_BOOL(used_wrap_counter, struct VirtQueue), + VMSTATE_UINT32(inuse, struct VirtQueue), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_virtio_virtqueues = { + .name = "virtio/virtqueues", + .version_id = 1, + .minimum_version_id = 1, + .needed = &virtio_virtqueue_needed, + .fields = (VMStateField[]) { + VMSTATE_STRUCT_VARRAY_POINTER_KNOWN(vq, struct VirtIODevice, + VIRTIO_QUEUE_MAX, 0, vmstate_virtqueue, VirtQueue), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_virtio_packed_virtqueues = { + .name = "virtio/packed_virtqueues", + .version_id = 1, + .minimum_version_id = 1, + .needed = &virtio_packed_virtqueue_needed, + .fields = (VMStateField[]) { + VMSTATE_STRUCT_VARRAY_POINTER_KNOWN(vq, struct VirtIODevice, + VIRTIO_QUEUE_MAX, 0, vmstate_packed_virtqueue, VirtQueue), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_ringsize = { + .name = "ringsize_state", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(vring.num_default, struct VirtQueue), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_virtio_ringsize = { + .name = "virtio/ringsize", + .version_id = 1, + .minimum_version_id = 1, + .needed = &virtio_ringsize_needed, + .fields = (VMStateField[]) { + VMSTATE_STRUCT_VARRAY_POINTER_KNOWN(vq, struct VirtIODevice, + VIRTIO_QUEUE_MAX, 0, vmstate_ringsize, VirtQueue), + VMSTATE_END_OF_LIST() + } +}; + +static int get_extra_state(QEMUFile *f, void *pv, size_t size, + const VMStateField *field) +{ + VirtIODevice *vdev = pv; + BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + + if (!k->load_extra_state) { + return -1; + } else { + return k->load_extra_state(qbus->parent, f); + } +} + +static int put_extra_state(QEMUFile *f, void *pv, size_t size, + const VMStateField *field, JSONWriter *vmdesc) +{ + VirtIODevice *vdev = pv; + BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + + k->save_extra_state(qbus->parent, f); + return 0; +} + +static const VMStateInfo vmstate_info_extra_state = { + .name = "virtqueue_extra_state", + .get = get_extra_state, + .put = put_extra_state, +}; + +static const VMStateDescription vmstate_virtio_extra_state = { + .name = "virtio/extra_state", + .version_id = 1, + .minimum_version_id = 1, + .needed = &virtio_extra_state_needed, + .fields = (VMStateField[]) { + { + .name = "extra_state", + .version_id = 0, + .field_exists = NULL, + .size = 0, + .info = &vmstate_info_extra_state, + .flags = VMS_SINGLE, + .offset = 0, + }, + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_virtio_device_endian = { + .name = "virtio/device_endian", + .version_id = 1, + .minimum_version_id = 1, + .needed = &virtio_device_endian_needed, + .fields = (VMStateField[]) { + VMSTATE_UINT8(device_endian, VirtIODevice), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_virtio_64bit_features = { + .name = "virtio/64bit_features", + .version_id = 1, + .minimum_version_id = 1, + .needed = &virtio_64bit_features_needed, + .fields = (VMStateField[]) { + VMSTATE_UINT64(guest_features, VirtIODevice), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_virtio_broken = { + .name = "virtio/broken", + .version_id = 1, + .minimum_version_id = 1, + .needed = &virtio_broken_needed, + .fields = (VMStateField[]) { + VMSTATE_BOOL(broken, VirtIODevice), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_virtio_started = { + .name = "virtio/started", + .version_id = 1, + .minimum_version_id = 1, + .needed = &virtio_started_needed, + .fields = (VMStateField[]) { + VMSTATE_BOOL(started, VirtIODevice), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_virtio_disabled = { + .name = "virtio/disabled", + .version_id = 1, + .minimum_version_id = 1, + .needed = &virtio_disabled_needed, + .fields = (VMStateField[]) { + VMSTATE_BOOL(disabled, VirtIODevice), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_virtio = { + .name = "virtio", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription*[]) { + &vmstate_virtio_device_endian, + &vmstate_virtio_64bit_features, + &vmstate_virtio_virtqueues, + &vmstate_virtio_ringsize, + &vmstate_virtio_broken, + &vmstate_virtio_extra_state, + &vmstate_virtio_started, + &vmstate_virtio_packed_virtqueues, + &vmstate_virtio_disabled, + NULL + } +}; + +int virtio_save(VirtIODevice *vdev, QEMUFile *f) +{ + BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev); + uint32_t guest_features_lo = (vdev->guest_features & 0xffffffff); + int i; + + if (k->save_config) { + k->save_config(qbus->parent, f); + } + + qemu_put_8s(f, &vdev->status); + qemu_put_8s(f, &vdev->isr); + qemu_put_be16s(f, &vdev->queue_sel); + qemu_put_be32s(f, &guest_features_lo); + qemu_put_be32(f, vdev->config_len); + qemu_put_buffer(f, vdev->config, vdev->config_len); + + for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { + if (vdev->vq[i].vring.num == 0) + break; + } + + qemu_put_be32(f, i); + + for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { + if (vdev->vq[i].vring.num == 0) + break; + + qemu_put_be32(f, vdev->vq[i].vring.num); + if (k->has_variable_vring_alignment) { + qemu_put_be32(f, vdev->vq[i].vring.align); + } + /* + * Save desc now, the rest of the ring addresses are saved in + * subsections for VIRTIO-1 devices. + */ + qemu_put_be64(f, vdev->vq[i].vring.desc); + qemu_put_be16s(f, &vdev->vq[i].last_avail_idx); + if (k->save_queue) { + k->save_queue(qbus->parent, i, f); + } + } + + if (vdc->save != NULL) { + vdc->save(vdev, f); + } + + if (vdc->vmsd) { + int ret = vmstate_save_state(f, vdc->vmsd, vdev, NULL); + if (ret) { + return ret; + } + } + + /* Subsections */ + return vmstate_save_state(f, &vmstate_virtio, vdev, NULL); +} + +/* A wrapper for use as a VMState .put function */ +static int virtio_device_put(QEMUFile *f, void *opaque, size_t size, + const VMStateField *field, JSONWriter *vmdesc) +{ + return virtio_save(VIRTIO_DEVICE(opaque), f); +} + +/* A wrapper for use as a VMState .get function */ +static int virtio_device_get(QEMUFile *f, void *opaque, size_t size, + const VMStateField *field) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(opaque); + DeviceClass *dc = DEVICE_CLASS(VIRTIO_DEVICE_GET_CLASS(vdev)); + + return virtio_load(vdev, f, dc->vmsd->version_id); +} + +const VMStateInfo virtio_vmstate_info = { + .name = "virtio", + .get = virtio_device_get, + .put = virtio_device_put, +}; + +static int virtio_set_features_nocheck(VirtIODevice *vdev, uint64_t val) +{ + VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); + bool bad = (val & ~(vdev->host_features)) != 0; + + val &= vdev->host_features; + if (k->set_features) { + k->set_features(vdev, val); + } + vdev->guest_features = val; + return bad ? -1 : 0; +} + +int virtio_set_features(VirtIODevice *vdev, uint64_t val) +{ + int ret; + /* + * The driver must not attempt to set features after feature negotiation + * has finished. + */ + if (vdev->status & VIRTIO_CONFIG_S_FEATURES_OK) { + return -EINVAL; + } + + if (val & (1ull << VIRTIO_F_BAD_FEATURE)) { + qemu_log_mask(LOG_GUEST_ERROR, + "%s: guest driver for %s has enabled UNUSED(30) feature bit!\n", + __func__, vdev->name); + } + + ret = virtio_set_features_nocheck(vdev, val); + if (virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) { + /* VIRTIO_RING_F_EVENT_IDX changes the size of the caches. */ + int i; + for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { + if (vdev->vq[i].vring.num != 0) { + virtio_init_region_cache(vdev, i); + } + } + } + if (!ret) { + if (!virtio_device_started(vdev, vdev->status) && + !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { + vdev->start_on_kick = true; + } + } + return ret; +} + +size_t virtio_get_config_size(const VirtIOConfigSizeParams *params, + uint64_t host_features) +{ + size_t config_size = params->min_size; + const VirtIOFeature *feature_sizes = params->feature_sizes; + size_t i; + + for (i = 0; feature_sizes[i].flags != 0; i++) { + if (host_features & feature_sizes[i].flags) { + config_size = MAX(feature_sizes[i].end, config_size); + } + } + + assert(config_size <= params->max_size); + return config_size; +} + +int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id) +{ + int i, ret; + int32_t config_len; + uint32_t num; + uint32_t features; + BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev); + + /* + * We poison the endianness to ensure it does not get used before + * subsections have been loaded. + */ + vdev->device_endian = VIRTIO_DEVICE_ENDIAN_UNKNOWN; + + if (k->load_config) { + ret = k->load_config(qbus->parent, f); + if (ret) + return ret; + } + + qemu_get_8s(f, &vdev->status); + qemu_get_8s(f, &vdev->isr); + qemu_get_be16s(f, &vdev->queue_sel); + if (vdev->queue_sel >= VIRTIO_QUEUE_MAX) { + return -1; + } + qemu_get_be32s(f, &features); + + /* + * Temporarily set guest_features low bits - needed by + * virtio net load code testing for VIRTIO_NET_F_CTRL_GUEST_OFFLOADS + * VIRTIO_NET_F_GUEST_ANNOUNCE and VIRTIO_NET_F_CTRL_VQ. + * + * Note: devices should always test host features in future - don't create + * new dependencies like this. + */ + vdev->guest_features = features; + + config_len = qemu_get_be32(f); + + /* + * There are cases where the incoming config can be bigger or smaller + * than what we have; so load what we have space for, and skip + * any excess that's in the stream. + */ + qemu_get_buffer(f, vdev->config, MIN(config_len, vdev->config_len)); + + while (config_len > vdev->config_len) { + qemu_get_byte(f); + config_len--; + } + + num = qemu_get_be32(f); + + if (num > VIRTIO_QUEUE_MAX) { + error_report("Invalid number of virtqueues: 0x%x", num); + return -1; + } + + for (i = 0; i < num; i++) { + vdev->vq[i].vring.num = qemu_get_be32(f); + if (k->has_variable_vring_alignment) { + vdev->vq[i].vring.align = qemu_get_be32(f); + } + vdev->vq[i].vring.desc = qemu_get_be64(f); + qemu_get_be16s(f, &vdev->vq[i].last_avail_idx); + vdev->vq[i].signalled_used_valid = false; + vdev->vq[i].notification = true; + + if (!vdev->vq[i].vring.desc && vdev->vq[i].last_avail_idx) { + error_report("VQ %d address 0x0 " + "inconsistent with Host index 0x%x", + i, vdev->vq[i].last_avail_idx); + return -1; + } + if (k->load_queue) { + ret = k->load_queue(qbus->parent, i, f); + if (ret) + return ret; + } + } + + virtio_notify_vector(vdev, VIRTIO_NO_VECTOR); + + if (vdc->load != NULL) { + ret = vdc->load(vdev, f, version_id); + if (ret) { + return ret; + } + } + + if (vdc->vmsd) { + ret = vmstate_load_state(f, vdc->vmsd, vdev, version_id); + if (ret) { + return ret; + } + } + + /* Subsections */ + ret = vmstate_load_state(f, &vmstate_virtio, vdev, 1); + if (ret) { + return ret; + } + + if (vdev->device_endian == VIRTIO_DEVICE_ENDIAN_UNKNOWN) { + vdev->device_endian = virtio_default_endian(); + } + + if (virtio_64bit_features_needed(vdev)) { + /* + * Subsection load filled vdev->guest_features. Run them + * through virtio_set_features to sanity-check them against + * host_features. + */ + uint64_t features64 = vdev->guest_features; + if (virtio_set_features_nocheck(vdev, features64) < 0) { + error_report("Features 0x%" PRIx64 " unsupported. " + "Allowed features: 0x%" PRIx64, + features64, vdev->host_features); + return -1; + } + } else { + if (virtio_set_features_nocheck(vdev, features) < 0) { + error_report("Features 0x%x unsupported. " + "Allowed features: 0x%" PRIx64, + features, vdev->host_features); + return -1; + } + } + + if (!virtio_device_started(vdev, vdev->status) && + !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { + vdev->start_on_kick = true; + } + + RCU_READ_LOCK_GUARD(); + for (i = 0; i < num; i++) { + if (vdev->vq[i].vring.desc) { + uint16_t nheads; + + /* + * VIRTIO-1 devices migrate desc, used, and avail ring addresses so + * only the region cache needs to be set up. Legacy devices need + * to calculate used and avail ring addresses based on the desc + * address. + */ + if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { + virtio_init_region_cache(vdev, i); + } else { + virtio_queue_update_rings(vdev, i); + } + + if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) { + vdev->vq[i].shadow_avail_idx = vdev->vq[i].last_avail_idx; + vdev->vq[i].shadow_avail_wrap_counter = + vdev->vq[i].last_avail_wrap_counter; + continue; + } + + nheads = vring_avail_idx(&vdev->vq[i]) - vdev->vq[i].last_avail_idx; + /* Check it isn't doing strange things with descriptor numbers. */ + if (nheads > vdev->vq[i].vring.num) { + virtio_error(vdev, "VQ %d size 0x%x Guest index 0x%x " + "inconsistent with Host index 0x%x: delta 0x%x", + i, vdev->vq[i].vring.num, + vring_avail_idx(&vdev->vq[i]), + vdev->vq[i].last_avail_idx, nheads); + vdev->vq[i].used_idx = 0; + vdev->vq[i].shadow_avail_idx = 0; + vdev->vq[i].inuse = 0; + continue; + } + vdev->vq[i].used_idx = vring_used_idx(&vdev->vq[i]); + vdev->vq[i].shadow_avail_idx = vring_avail_idx(&vdev->vq[i]); + + /* + * Some devices migrate VirtQueueElements that have been popped + * from the avail ring but not yet returned to the used ring. + * Since max ring size < UINT16_MAX it's safe to use modulo + * UINT16_MAX + 1 subtraction. + */ + vdev->vq[i].inuse = (uint16_t)(vdev->vq[i].last_avail_idx - + vdev->vq[i].used_idx); + if (vdev->vq[i].inuse > vdev->vq[i].vring.num) { + error_report("VQ %d size 0x%x < last_avail_idx 0x%x - " + "used_idx 0x%x", + i, vdev->vq[i].vring.num, + vdev->vq[i].last_avail_idx, + vdev->vq[i].used_idx); + return -1; + } + } + } + + if (vdc->post_load) { + ret = vdc->post_load(vdev); + if (ret) { + return ret; + } + } + + return 0; +} + +void virtio_cleanup(VirtIODevice *vdev) +{ + qemu_del_vm_change_state_handler(vdev->vmstate); +} + +static void virtio_vmstate_change(void *opaque, bool running, RunState state) +{ + VirtIODevice *vdev = opaque; + BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + bool backend_run = running && virtio_device_started(vdev, vdev->status); + vdev->vm_running = running; + + if (backend_run) { + virtio_set_status(vdev, vdev->status); + } + + if (k->vmstate_change) { + k->vmstate_change(qbus->parent, backend_run); + } + + if (!backend_run) { + virtio_set_status(vdev, vdev->status); + } +} + +void virtio_instance_init_common(Object *proxy_obj, void *data, + size_t vdev_size, const char *vdev_name) +{ + DeviceState *vdev = data; + + object_initialize_child_with_props(proxy_obj, "virtio-backend", vdev, + vdev_size, vdev_name, &error_abort, + NULL); + qdev_alias_all_properties(vdev, proxy_obj); +} + +void virtio_init(VirtIODevice *vdev, uint16_t device_id, size_t config_size) +{ + BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + int i; + int nvectors = k->query_nvectors ? k->query_nvectors(qbus->parent) : 0; + + if (nvectors) { + vdev->vector_queues = + g_malloc0(sizeof(*vdev->vector_queues) * nvectors); + } + + vdev->start_on_kick = false; + vdev->started = false; + vdev->vhost_started = false; + vdev->device_id = device_id; + vdev->status = 0; + qatomic_set(&vdev->isr, 0); + vdev->queue_sel = 0; + vdev->config_vector = VIRTIO_NO_VECTOR; + vdev->vq = g_new0(VirtQueue, VIRTIO_QUEUE_MAX); + vdev->vm_running = runstate_is_running(); + vdev->broken = false; + for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { + vdev->vq[i].vector = VIRTIO_NO_VECTOR; + vdev->vq[i].vdev = vdev; + vdev->vq[i].queue_index = i; + vdev->vq[i].host_notifier_enabled = false; + } + + vdev->name = virtio_id_to_name(device_id); + vdev->config_len = config_size; + if (vdev->config_len) { + vdev->config = g_malloc0(config_size); + } else { + vdev->config = NULL; + } + vdev->vmstate = qdev_add_vm_change_state_handler(DEVICE(vdev), + virtio_vmstate_change, vdev); + vdev->device_endian = virtio_default_endian(); + vdev->use_guest_notifier_mask = true; +} + +/* + * Only devices that have already been around prior to defining the virtio + * standard support legacy mode; this includes devices not specified in the + * standard. All newer devices conform to the virtio standard only. + */ +bool virtio_legacy_allowed(VirtIODevice *vdev) +{ + switch (vdev->device_id) { + case VIRTIO_ID_NET: + case VIRTIO_ID_BLOCK: + case VIRTIO_ID_CONSOLE: + case VIRTIO_ID_RNG: + case VIRTIO_ID_BALLOON: + case VIRTIO_ID_RPMSG: + case VIRTIO_ID_SCSI: + case VIRTIO_ID_9P: + case VIRTIO_ID_RPROC_SERIAL: + case VIRTIO_ID_CAIF: + return true; + default: + return false; + } +} + +bool virtio_legacy_check_disabled(VirtIODevice *vdev) +{ + return vdev->disable_legacy_check; +} + +hwaddr virtio_queue_get_desc_addr(VirtIODevice *vdev, int n) +{ + return vdev->vq[n].vring.desc; +} + +bool virtio_queue_enabled_legacy(VirtIODevice *vdev, int n) +{ + return virtio_queue_get_desc_addr(vdev, n) != 0; +} + +bool virtio_queue_enabled(VirtIODevice *vdev, int n) +{ + BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + + if (k->queue_enabled) { + return k->queue_enabled(qbus->parent, n); + } + return virtio_queue_enabled_legacy(vdev, n); +} + +hwaddr virtio_queue_get_avail_addr(VirtIODevice *vdev, int n) +{ + return vdev->vq[n].vring.avail; +} + +hwaddr virtio_queue_get_used_addr(VirtIODevice *vdev, int n) +{ + return vdev->vq[n].vring.used; +} + +hwaddr virtio_queue_get_desc_size(VirtIODevice *vdev, int n) +{ + return sizeof(VRingDesc) * vdev->vq[n].vring.num; +} + +hwaddr virtio_queue_get_avail_size(VirtIODevice *vdev, int n) +{ + int s; + + if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) { + return sizeof(struct VRingPackedDescEvent); + } + + s = virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; + return offsetof(VRingAvail, ring) + + sizeof(uint16_t) * vdev->vq[n].vring.num + s; +} + +hwaddr virtio_queue_get_used_size(VirtIODevice *vdev, int n) +{ + int s; + + if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) { + return sizeof(struct VRingPackedDescEvent); + } + + s = virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; + return offsetof(VRingUsed, ring) + + sizeof(VRingUsedElem) * vdev->vq[n].vring.num + s; +} + +static unsigned int virtio_queue_packed_get_last_avail_idx(VirtIODevice *vdev, + int n) +{ + unsigned int avail, used; + + avail = vdev->vq[n].last_avail_idx; + avail |= ((uint16_t)vdev->vq[n].last_avail_wrap_counter) << 15; + + used = vdev->vq[n].used_idx; + used |= ((uint16_t)vdev->vq[n].used_wrap_counter) << 15; + + return avail | used << 16; +} + +static uint16_t virtio_queue_split_get_last_avail_idx(VirtIODevice *vdev, + int n) +{ + return vdev->vq[n].last_avail_idx; +} + +unsigned int virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n) +{ + if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) { + return virtio_queue_packed_get_last_avail_idx(vdev, n); + } else { + return virtio_queue_split_get_last_avail_idx(vdev, n); + } +} + +static void virtio_queue_packed_set_last_avail_idx(VirtIODevice *vdev, + int n, unsigned int idx) +{ + struct VirtQueue *vq = &vdev->vq[n]; + + vq->last_avail_idx = vq->shadow_avail_idx = idx & 0x7fff; + vq->last_avail_wrap_counter = + vq->shadow_avail_wrap_counter = !!(idx & 0x8000); + idx >>= 16; + vq->used_idx = idx & 0x7ffff; + vq->used_wrap_counter = !!(idx & 0x8000); +} + +static void virtio_queue_split_set_last_avail_idx(VirtIODevice *vdev, + int n, unsigned int idx) +{ + vdev->vq[n].last_avail_idx = idx; + vdev->vq[n].shadow_avail_idx = idx; +} + +void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n, + unsigned int idx) +{ + if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) { + virtio_queue_packed_set_last_avail_idx(vdev, n, idx); + } else { + virtio_queue_split_set_last_avail_idx(vdev, n, idx); + } +} + +static void virtio_queue_packed_restore_last_avail_idx(VirtIODevice *vdev, + int n) +{ + /* We don't have a reference like avail idx in shared memory */ + return; +} + +static void virtio_queue_split_restore_last_avail_idx(VirtIODevice *vdev, + int n) +{ + RCU_READ_LOCK_GUARD(); + if (vdev->vq[n].vring.desc) { + vdev->vq[n].last_avail_idx = vring_used_idx(&vdev->vq[n]); + vdev->vq[n].shadow_avail_idx = vdev->vq[n].last_avail_idx; + } +} + +void virtio_queue_restore_last_avail_idx(VirtIODevice *vdev, int n) +{ + if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) { + virtio_queue_packed_restore_last_avail_idx(vdev, n); + } else { + virtio_queue_split_restore_last_avail_idx(vdev, n); + } +} + +static void virtio_queue_packed_update_used_idx(VirtIODevice *vdev, int n) +{ + /* used idx was updated through set_last_avail_idx() */ + return; +} + +static void virtio_split_packed_update_used_idx(VirtIODevice *vdev, int n) +{ + RCU_READ_LOCK_GUARD(); + if (vdev->vq[n].vring.desc) { + vdev->vq[n].used_idx = vring_used_idx(&vdev->vq[n]); + } +} + +void virtio_queue_update_used_idx(VirtIODevice *vdev, int n) +{ + if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) { + return virtio_queue_packed_update_used_idx(vdev, n); + } else { + return virtio_split_packed_update_used_idx(vdev, n); + } +} + +void virtio_queue_invalidate_signalled_used(VirtIODevice *vdev, int n) +{ + vdev->vq[n].signalled_used_valid = false; +} + +VirtQueue *virtio_get_queue(VirtIODevice *vdev, int n) +{ + return vdev->vq + n; +} + +uint16_t virtio_get_queue_index(VirtQueue *vq) +{ + return vq->queue_index; +} + +static void virtio_queue_guest_notifier_read(EventNotifier *n) +{ + VirtQueue *vq = container_of(n, VirtQueue, guest_notifier); + if (event_notifier_test_and_clear(n)) { + virtio_irq(vq); + } +} + +void virtio_queue_set_guest_notifier_fd_handler(VirtQueue *vq, bool assign, + bool with_irqfd) +{ + if (assign && !with_irqfd) { + event_notifier_set_handler(&vq->guest_notifier, + virtio_queue_guest_notifier_read); + } else { + event_notifier_set_handler(&vq->guest_notifier, NULL); + } + if (!assign) { + /* Test and clear notifier before closing it, + * in case poll callback didn't have time to run. */ + virtio_queue_guest_notifier_read(&vq->guest_notifier); + } +} + +EventNotifier *virtio_queue_get_guest_notifier(VirtQueue *vq) +{ + return &vq->guest_notifier; +} + +static void virtio_queue_host_notifier_aio_poll_begin(EventNotifier *n) +{ + VirtQueue *vq = container_of(n, VirtQueue, host_notifier); + + virtio_queue_set_notification(vq, 0); +} + +static bool virtio_queue_host_notifier_aio_poll(void *opaque) +{ + EventNotifier *n = opaque; + VirtQueue *vq = container_of(n, VirtQueue, host_notifier); + + return vq->vring.desc && !virtio_queue_empty(vq); +} + +static void virtio_queue_host_notifier_aio_poll_ready(EventNotifier *n) +{ + VirtQueue *vq = container_of(n, VirtQueue, host_notifier); + + virtio_queue_notify_vq(vq); +} + +static void virtio_queue_host_notifier_aio_poll_end(EventNotifier *n) +{ + VirtQueue *vq = container_of(n, VirtQueue, host_notifier); + + /* Caller polls once more after this to catch requests that race with us */ + virtio_queue_set_notification(vq, 1); +} + +void virtio_queue_aio_attach_host_notifier(VirtQueue *vq, AioContext *ctx) +{ + aio_set_event_notifier(ctx, &vq->host_notifier, true, + virtio_queue_host_notifier_read, + virtio_queue_host_notifier_aio_poll, + virtio_queue_host_notifier_aio_poll_ready); + aio_set_event_notifier_poll(ctx, &vq->host_notifier, + virtio_queue_host_notifier_aio_poll_begin, + virtio_queue_host_notifier_aio_poll_end); +} + +/* + * Same as virtio_queue_aio_attach_host_notifier() but without polling. Use + * this for rx virtqueues and similar cases where the virtqueue handler + * function does not pop all elements. When the virtqueue is left non-empty + * polling consumes CPU cycles and should not be used. + */ +void virtio_queue_aio_attach_host_notifier_no_poll(VirtQueue *vq, AioContext *ctx) +{ + aio_set_event_notifier(ctx, &vq->host_notifier, true, + virtio_queue_host_notifier_read, + NULL, NULL); +} + +void virtio_queue_aio_detach_host_notifier(VirtQueue *vq, AioContext *ctx) +{ + aio_set_event_notifier(ctx, &vq->host_notifier, true, NULL, NULL, NULL); + /* Test and clear notifier before after disabling event, + * in case poll callback didn't have time to run. */ + virtio_queue_host_notifier_read(&vq->host_notifier); +} + +void virtio_queue_host_notifier_read(EventNotifier *n) +{ + VirtQueue *vq = container_of(n, VirtQueue, host_notifier); + if (event_notifier_test_and_clear(n)) { + virtio_queue_notify_vq(vq); + } +} + +EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq) +{ + return &vq->host_notifier; +} + +void virtio_queue_set_host_notifier_enabled(VirtQueue *vq, bool enabled) +{ + vq->host_notifier_enabled = enabled; +} + +int virtio_queue_set_host_notifier_mr(VirtIODevice *vdev, int n, + MemoryRegion *mr, bool assign) +{ + BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + + if (k->set_host_notifier_mr) { + return k->set_host_notifier_mr(qbus->parent, n, mr, assign); + } + + return -1; +} + +void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name) +{ + g_free(vdev->bus_name); + vdev->bus_name = g_strdup(bus_name); +} + +void G_GNUC_PRINTF(2, 3) virtio_error(VirtIODevice *vdev, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + error_vreport(fmt, ap); + va_end(ap); + + if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { + vdev->status = vdev->status | VIRTIO_CONFIG_S_NEEDS_RESET; + virtio_notify_config(vdev); + } + + vdev->broken = true; +} + +static void virtio_memory_listener_commit(MemoryListener *listener) +{ + VirtIODevice *vdev = container_of(listener, VirtIODevice, listener); + int i; + + for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { + if (vdev->vq[i].vring.num == 0) { + break; + } + virtio_init_region_cache(vdev, i); + } +} + +static void virtio_device_realize(DeviceState *dev, Error **errp) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(dev); + Error *err = NULL; + + /* Devices should either use vmsd or the load/save methods */ + assert(!vdc->vmsd || !vdc->load); + + if (vdc->realize != NULL) { + vdc->realize(dev, &err); + if (err != NULL) { + error_propagate(errp, err); + return; + } + } + + virtio_bus_device_plugged(vdev, &err); + if (err != NULL) { + error_propagate(errp, err); + vdc->unrealize(dev); + return; + } + + vdev->listener.commit = virtio_memory_listener_commit; + vdev->listener.name = "virtio"; + memory_listener_register(&vdev->listener, vdev->dma_as); + QTAILQ_INSERT_TAIL(&virtio_list, vdev, next); +} + +static void virtio_device_unrealize(DeviceState *dev) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(dev); + + memory_listener_unregister(&vdev->listener); + virtio_bus_device_unplugged(vdev); + + if (vdc->unrealize != NULL) { + vdc->unrealize(dev); + } + + QTAILQ_REMOVE(&virtio_list, vdev, next); + g_free(vdev->bus_name); + vdev->bus_name = NULL; +} + +static void virtio_device_free_virtqueues(VirtIODevice *vdev) +{ + int i; + if (!vdev->vq) { + return; + } + + for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { + if (vdev->vq[i].vring.num == 0) { + break; + } + virtio_virtqueue_reset_region_cache(&vdev->vq[i]); + } + g_free(vdev->vq); +} + +static void virtio_device_instance_finalize(Object *obj) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(obj); + + virtio_device_free_virtqueues(vdev); + + g_free(vdev->config); + g_free(vdev->vector_queues); +} + +static Property virtio_properties[] = { + DEFINE_VIRTIO_COMMON_FEATURES(VirtIODevice, host_features), + DEFINE_PROP_BOOL("use-started", VirtIODevice, use_started, true), + DEFINE_PROP_BOOL("use-disabled-flag", VirtIODevice, use_disabled_flag, true), + DEFINE_PROP_BOOL("x-disable-legacy-check", VirtIODevice, + disable_legacy_check, false), + DEFINE_PROP_END_OF_LIST(), +}; + +static int virtio_device_start_ioeventfd_impl(VirtIODevice *vdev) +{ + VirtioBusState *qbus = VIRTIO_BUS(qdev_get_parent_bus(DEVICE(vdev))); + int i, n, r, err; + + /* + * Batch all the host notifiers in a single transaction to avoid + * quadratic time complexity in address_space_update_ioeventfds(). + */ + memory_region_transaction_begin(); + for (n = 0; n < VIRTIO_QUEUE_MAX; n++) { + VirtQueue *vq = &vdev->vq[n]; + if (!virtio_queue_get_num(vdev, n)) { + continue; + } + r = virtio_bus_set_host_notifier(qbus, n, true); + if (r < 0) { + err = r; + goto assign_error; + } + event_notifier_set_handler(&vq->host_notifier, + virtio_queue_host_notifier_read); + } + + for (n = 0; n < VIRTIO_QUEUE_MAX; n++) { + /* Kick right away to begin processing requests already in vring */ + VirtQueue *vq = &vdev->vq[n]; + if (!vq->vring.num) { + continue; + } + event_notifier_set(&vq->host_notifier); + } + memory_region_transaction_commit(); + return 0; + +assign_error: + i = n; /* save n for a second iteration after transaction is committed. */ + while (--n >= 0) { + VirtQueue *vq = &vdev->vq[n]; + if (!virtio_queue_get_num(vdev, n)) { + continue; + } + + event_notifier_set_handler(&vq->host_notifier, NULL); + r = virtio_bus_set_host_notifier(qbus, n, false); + assert(r >= 0); + } + /* + * The transaction expects the ioeventfds to be open when it + * commits. Do it now, before the cleanup loop. + */ + memory_region_transaction_commit(); + + while (--i >= 0) { + if (!virtio_queue_get_num(vdev, i)) { + continue; + } + virtio_bus_cleanup_host_notifier(qbus, i); + } + return err; +} + +int virtio_device_start_ioeventfd(VirtIODevice *vdev) +{ + BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); + VirtioBusState *vbus = VIRTIO_BUS(qbus); + + return virtio_bus_start_ioeventfd(vbus); +} + +static void virtio_device_stop_ioeventfd_impl(VirtIODevice *vdev) +{ + VirtioBusState *qbus = VIRTIO_BUS(qdev_get_parent_bus(DEVICE(vdev))); + int n, r; + + /* + * Batch all the host notifiers in a single transaction to avoid + * quadratic time complexity in address_space_update_ioeventfds(). + */ + memory_region_transaction_begin(); + for (n = 0; n < VIRTIO_QUEUE_MAX; n++) { + VirtQueue *vq = &vdev->vq[n]; + + if (!virtio_queue_get_num(vdev, n)) { + continue; + } + event_notifier_set_handler(&vq->host_notifier, NULL); + r = virtio_bus_set_host_notifier(qbus, n, false); + assert(r >= 0); + } + /* + * The transaction expects the ioeventfds to be open when it + * commits. Do it now, before the cleanup loop. + */ + memory_region_transaction_commit(); + + for (n = 0; n < VIRTIO_QUEUE_MAX; n++) { + if (!virtio_queue_get_num(vdev, n)) { + continue; + } + virtio_bus_cleanup_host_notifier(qbus, n); + } +} + +int virtio_device_grab_ioeventfd(VirtIODevice *vdev) +{ + BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); + VirtioBusState *vbus = VIRTIO_BUS(qbus); + + return virtio_bus_grab_ioeventfd(vbus); +} + +void virtio_device_release_ioeventfd(VirtIODevice *vdev) +{ + BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); + VirtioBusState *vbus = VIRTIO_BUS(qbus); + + virtio_bus_release_ioeventfd(vbus); +} + +static void virtio_device_class_init(ObjectClass *klass, void *data) +{ + /* Set the default value here. */ + VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->realize = virtio_device_realize; + dc->unrealize = virtio_device_unrealize; + dc->bus_type = TYPE_VIRTIO_BUS; + device_class_set_props(dc, virtio_properties); + vdc->start_ioeventfd = virtio_device_start_ioeventfd_impl; + vdc->stop_ioeventfd = virtio_device_stop_ioeventfd_impl; + + vdc->legacy_features |= VIRTIO_LEGACY_FEATURES; + + QTAILQ_INIT(&virtio_list); +} + +bool virtio_device_ioeventfd_enabled(VirtIODevice *vdev) +{ + BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); + VirtioBusState *vbus = VIRTIO_BUS(qbus); + + return virtio_bus_ioeventfd_enabled(vbus); +} + +VirtioInfoList *qmp_x_query_virtio(Error **errp) +{ + VirtioInfoList *list = NULL; + VirtioInfoList *node; + VirtIODevice *vdev; + + QTAILQ_FOREACH(vdev, &virtio_list, next) { + DeviceState *dev = DEVICE(vdev); + Error *err = NULL; + QObject *obj = qmp_qom_get(dev->canonical_path, "realized", &err); + + if (err == NULL) { + GString *is_realized = qobject_to_json_pretty(obj, true); + /* virtio device is NOT realized, remove it from list */ + if (!strncmp(is_realized->str, "false", 4)) { + QTAILQ_REMOVE(&virtio_list, vdev, next); + } else { + node = g_new0(VirtioInfoList, 1); + node->value = g_new(VirtioInfo, 1); + node->value->path = g_strdup(dev->canonical_path); + node->value->name = g_strdup(vdev->name); + QAPI_LIST_PREPEND(list, node->value); + } + g_string_free(is_realized, true); + } + qobject_unref(obj); + } + + return list; +} + +static VirtIODevice *virtio_device_find(const char *path) +{ + VirtIODevice *vdev; + + QTAILQ_FOREACH(vdev, &virtio_list, next) { + DeviceState *dev = DEVICE(vdev); + + if (strcmp(dev->canonical_path, path) != 0) { + continue; + } + + Error *err = NULL; + QObject *obj = qmp_qom_get(dev->canonical_path, "realized", &err); + if (err == NULL) { + GString *is_realized = qobject_to_json_pretty(obj, true); + /* virtio device is NOT realized, remove it from list */ + if (!strncmp(is_realized->str, "false", 4)) { + g_string_free(is_realized, true); + qobject_unref(obj); + QTAILQ_REMOVE(&virtio_list, vdev, next); + return NULL; + } + g_string_free(is_realized, true); + } else { + /* virtio device doesn't exist in QOM tree */ + QTAILQ_REMOVE(&virtio_list, vdev, next); + qobject_unref(obj); + return NULL; + } + /* device exists in QOM tree & is realized */ + qobject_unref(obj); + return vdev; + } + return NULL; +} + +#define CONVERT_FEATURES(type, map, is_status, bitmap) \ + ({ \ + type *list = NULL; \ + type *node; \ + for (i = 0; map[i].virtio_bit != -1; i++) { \ + if (is_status) { \ + bit = map[i].virtio_bit; \ + } \ + else { \ + bit = 1ULL << map[i].virtio_bit; \ + } \ + if ((bitmap & bit) == 0) { \ + continue; \ + } \ + node = g_new0(type, 1); \ + node->value = g_strdup(map[i].feature_desc); \ + node->next = list; \ + list = node; \ + bitmap ^= bit; \ + } \ + list; \ + }) + +static VirtioDeviceStatus *qmp_decode_status(uint8_t bitmap) +{ + VirtioDeviceStatus *status; + uint8_t bit; + int i; + + status = g_new0(VirtioDeviceStatus, 1); + status->statuses = CONVERT_FEATURES(strList, virtio_config_status_map, + 1, bitmap); + status->has_unknown_statuses = bitmap != 0; + if (status->has_unknown_statuses) { + status->unknown_statuses = bitmap; + } + + return status; +} + +static VhostDeviceProtocols *qmp_decode_protocols(uint64_t bitmap) +{ + VhostDeviceProtocols *vhu_protocols; + uint64_t bit; + int i; + + vhu_protocols = g_new0(VhostDeviceProtocols, 1); + vhu_protocols->protocols = + CONVERT_FEATURES(strList, + vhost_user_protocol_map, 0, bitmap); + vhu_protocols->has_unknown_protocols = bitmap != 0; + if (vhu_protocols->has_unknown_protocols) { + vhu_protocols->unknown_protocols = bitmap; + } + + return vhu_protocols; +} + +static VirtioDeviceFeatures *qmp_decode_features(uint16_t device_id, + uint64_t bitmap) +{ + VirtioDeviceFeatures *features; + uint64_t bit; + int i; + + features = g_new0(VirtioDeviceFeatures, 1); + features->has_dev_features = true; + + /* transport features */ + features->transports = CONVERT_FEATURES(strList, virtio_transport_map, 0, + bitmap); + + /* device features */ + switch (device_id) { +#ifdef CONFIG_VIRTIO_SERIAL + case VIRTIO_ID_CONSOLE: + features->dev_features = + CONVERT_FEATURES(strList, virtio_serial_feature_map, 0, bitmap); + break; +#endif +#ifdef CONFIG_VIRTIO_BLK + case VIRTIO_ID_BLOCK: + features->dev_features = + CONVERT_FEATURES(strList, virtio_blk_feature_map, 0, bitmap); + break; +#endif +#ifdef CONFIG_VIRTIO_GPU + case VIRTIO_ID_GPU: + features->dev_features = + CONVERT_FEATURES(strList, virtio_gpu_feature_map, 0, bitmap); + break; +#endif +#ifdef CONFIG_VIRTIO_NET + case VIRTIO_ID_NET: + features->dev_features = + CONVERT_FEATURES(strList, virtio_net_feature_map, 0, bitmap); + break; +#endif +#ifdef CONFIG_VIRTIO_SCSI + case VIRTIO_ID_SCSI: + features->dev_features = + CONVERT_FEATURES(strList, virtio_scsi_feature_map, 0, bitmap); + break; +#endif +#ifdef CONFIG_VIRTIO_BALLOON + case VIRTIO_ID_BALLOON: + features->dev_features = + CONVERT_FEATURES(strList, virtio_balloon_feature_map, 0, bitmap); + break; +#endif +#ifdef CONFIG_VIRTIO_IOMMU + case VIRTIO_ID_IOMMU: + features->dev_features = + CONVERT_FEATURES(strList, virtio_iommu_feature_map, 0, bitmap); + break; +#endif +#ifdef CONFIG_VIRTIO_INPUT + case VIRTIO_ID_INPUT: + features->dev_features = + CONVERT_FEATURES(strList, virtio_input_feature_map, 0, bitmap); + break; +#endif +#ifdef CONFIG_VHOST_USER_FS + case VIRTIO_ID_FS: + features->dev_features = + CONVERT_FEATURES(strList, virtio_fs_feature_map, 0, bitmap); + break; +#endif +#ifdef CONFIG_VHOST_VSOCK + case VIRTIO_ID_VSOCK: + features->dev_features = + CONVERT_FEATURES(strList, virtio_vsock_feature_map, 0, bitmap); + break; +#endif +#ifdef CONFIG_VIRTIO_CRYPTO + case VIRTIO_ID_CRYPTO: + features->dev_features = + CONVERT_FEATURES(strList, virtio_crypto_feature_map, 0, bitmap); + break; +#endif +#ifdef CONFIG_VIRTIO_MEM + case VIRTIO_ID_MEM: + features->dev_features = + CONVERT_FEATURES(strList, virtio_mem_feature_map, 0, bitmap); + break; +#endif +#ifdef CONFIG_VIRTIO_I2C_ADAPTER + case VIRTIO_ID_I2C_ADAPTER: + features->dev_features = + CONVERT_FEATURES(strList, virtio_i2c_feature_map, 0, bitmap); + break; +#endif +#ifdef CONFIG_VIRTIO_RNG + case VIRTIO_ID_RNG: + features->dev_features = + CONVERT_FEATURES(strList, virtio_rng_feature_map, 0, bitmap); + break; +#endif + /* No features */ + case VIRTIO_ID_9P: + case VIRTIO_ID_PMEM: + case VIRTIO_ID_IOMEM: + case VIRTIO_ID_RPMSG: + case VIRTIO_ID_CLOCK: + case VIRTIO_ID_MAC80211_WLAN: + case VIRTIO_ID_MAC80211_HWSIM: + case VIRTIO_ID_RPROC_SERIAL: + case VIRTIO_ID_MEMORY_BALLOON: + case VIRTIO_ID_CAIF: + case VIRTIO_ID_SIGNAL_DIST: + case VIRTIO_ID_PSTORE: + case VIRTIO_ID_SOUND: + case VIRTIO_ID_BT: + case VIRTIO_ID_RPMB: + case VIRTIO_ID_VIDEO_ENCODER: + case VIRTIO_ID_VIDEO_DECODER: + case VIRTIO_ID_SCMI: + case VIRTIO_ID_NITRO_SEC_MOD: + case VIRTIO_ID_WATCHDOG: + case VIRTIO_ID_CAN: + case VIRTIO_ID_DMABUF: + case VIRTIO_ID_PARAM_SERV: + case VIRTIO_ID_AUDIO_POLICY: + case VIRTIO_ID_GPIO: + break; + default: + g_assert_not_reached(); + } + + features->has_unknown_dev_features = bitmap != 0; + if (features->has_unknown_dev_features) { + features->unknown_dev_features = bitmap; + } + + return features; +} + +VirtioStatus *qmp_x_query_virtio_status(const char *path, Error **errp) +{ + VirtIODevice *vdev; + VirtioStatus *status; + + vdev = virtio_device_find(path); + if (vdev == NULL) { + error_setg(errp, "Path %s is not a VirtIODevice", path); + return NULL; + } + + status = g_new0(VirtioStatus, 1); + status->name = g_strdup(vdev->name); + status->device_id = vdev->device_id; + status->vhost_started = vdev->vhost_started; + status->guest_features = qmp_decode_features(vdev->device_id, + vdev->guest_features); + status->host_features = qmp_decode_features(vdev->device_id, + vdev->host_features); + status->backend_features = qmp_decode_features(vdev->device_id, + vdev->backend_features); + + switch (vdev->device_endian) { + case VIRTIO_DEVICE_ENDIAN_LITTLE: + status->device_endian = g_strdup("little"); + break; + case VIRTIO_DEVICE_ENDIAN_BIG: + status->device_endian = g_strdup("big"); + break; + default: + status->device_endian = g_strdup("unknown"); + break; + } + + status->num_vqs = virtio_get_num_queues(vdev); + status->status = qmp_decode_status(vdev->status); + status->isr = vdev->isr; + status->queue_sel = vdev->queue_sel; + status->vm_running = vdev->vm_running; + status->broken = vdev->broken; + status->disabled = vdev->disabled; + status->use_started = vdev->use_started; + status->started = vdev->started; + status->start_on_kick = vdev->start_on_kick; + status->disable_legacy_check = vdev->disable_legacy_check; + status->bus_name = g_strdup(vdev->bus_name); + status->use_guest_notifier_mask = vdev->use_guest_notifier_mask; + status->has_vhost_dev = vdev->vhost_started; + + if (vdev->vhost_started) { + VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev); + struct vhost_dev *hdev = vdc->get_vhost(vdev); + + status->vhost_dev = g_new0(VhostStatus, 1); + status->vhost_dev->n_mem_sections = hdev->n_mem_sections; + status->vhost_dev->n_tmp_sections = hdev->n_tmp_sections; + status->vhost_dev->nvqs = hdev->nvqs; + status->vhost_dev->vq_index = hdev->vq_index; + status->vhost_dev->features = + qmp_decode_features(vdev->device_id, hdev->features); + status->vhost_dev->acked_features = + qmp_decode_features(vdev->device_id, hdev->acked_features); + status->vhost_dev->backend_features = + qmp_decode_features(vdev->device_id, hdev->backend_features); + status->vhost_dev->protocol_features = + qmp_decode_protocols(hdev->protocol_features); + status->vhost_dev->max_queues = hdev->max_queues; + status->vhost_dev->backend_cap = hdev->backend_cap; + status->vhost_dev->log_enabled = hdev->log_enabled; + status->vhost_dev->log_size = hdev->log_size; + } + + return status; +} + +VirtVhostQueueStatus *qmp_x_query_virtio_vhost_queue_status(const char *path, + uint16_t queue, + Error **errp) +{ + VirtIODevice *vdev; + VirtVhostQueueStatus *status; + + vdev = virtio_device_find(path); + if (vdev == NULL) { + error_setg(errp, "Path %s is not a VirtIODevice", path); + return NULL; + } + + if (!vdev->vhost_started) { + error_setg(errp, "Error: vhost device has not started yet"); + return NULL; + } + + VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev); + struct vhost_dev *hdev = vdc->get_vhost(vdev); + + if (queue < hdev->vq_index || queue >= hdev->vq_index + hdev->nvqs) { + error_setg(errp, "Invalid vhost virtqueue number %d", queue); + return NULL; + } + + status = g_new0(VirtVhostQueueStatus, 1); + status->name = g_strdup(vdev->name); + status->kick = hdev->vqs[queue].kick; + status->call = hdev->vqs[queue].call; + status->desc = (uintptr_t)hdev->vqs[queue].desc; + status->avail = (uintptr_t)hdev->vqs[queue].avail; + status->used = (uintptr_t)hdev->vqs[queue].used; + status->num = hdev->vqs[queue].num; + status->desc_phys = hdev->vqs[queue].desc_phys; + status->desc_size = hdev->vqs[queue].desc_size; + status->avail_phys = hdev->vqs[queue].avail_phys; + status->avail_size = hdev->vqs[queue].avail_size; + status->used_phys = hdev->vqs[queue].used_phys; + status->used_size = hdev->vqs[queue].used_size; + + return status; +} + +VirtQueueStatus *qmp_x_query_virtio_queue_status(const char *path, + uint16_t queue, + Error **errp) +{ + VirtIODevice *vdev; + VirtQueueStatus *status; + + vdev = virtio_device_find(path); + if (vdev == NULL) { + error_setg(errp, "Path %s is not a VirtIODevice", path); + return NULL; + } + + if (queue >= VIRTIO_QUEUE_MAX || !virtio_queue_get_num(vdev, queue)) { + error_setg(errp, "Invalid virtqueue number %d", queue); + return NULL; + } + + status = g_new0(VirtQueueStatus, 1); + status->name = g_strdup(vdev->name); + status->queue_index = vdev->vq[queue].queue_index; + status->inuse = vdev->vq[queue].inuse; + status->vring_num = vdev->vq[queue].vring.num; + status->vring_num_default = vdev->vq[queue].vring.num_default; + status->vring_align = vdev->vq[queue].vring.align; + status->vring_desc = vdev->vq[queue].vring.desc; + status->vring_avail = vdev->vq[queue].vring.avail; + status->vring_used = vdev->vq[queue].vring.used; + status->used_idx = vdev->vq[queue].used_idx; + status->signalled_used = vdev->vq[queue].signalled_used; + status->signalled_used_valid = vdev->vq[queue].signalled_used_valid; + + if (vdev->vhost_started) { + VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev); + struct vhost_dev *hdev = vdc->get_vhost(vdev); + + /* check if vq index exists for vhost as well */ + if (queue >= hdev->vq_index && queue < hdev->vq_index + hdev->nvqs) { + status->has_last_avail_idx = true; + + int vhost_vq_index = + hdev->vhost_ops->vhost_get_vq_index(hdev, queue); + struct vhost_vring_state state = { + .index = vhost_vq_index, + }; + + status->last_avail_idx = + hdev->vhost_ops->vhost_get_vring_base(hdev, &state); + } + } else { + status->has_shadow_avail_idx = true; + status->has_last_avail_idx = true; + status->last_avail_idx = vdev->vq[queue].last_avail_idx; + status->shadow_avail_idx = vdev->vq[queue].shadow_avail_idx; + } + + return status; +} + +static strList *qmp_decode_vring_desc_flags(uint16_t flags) +{ + strList *list = NULL; + strList *node; + int i; + + struct { + uint16_t flag; + const char *value; + } map[] = { + { VRING_DESC_F_NEXT, "next" }, + { VRING_DESC_F_WRITE, "write" }, + { VRING_DESC_F_INDIRECT, "indirect" }, + { 1 << VRING_PACKED_DESC_F_AVAIL, "avail" }, + { 1 << VRING_PACKED_DESC_F_USED, "used" }, + { 0, "" } + }; + + for (i = 0; map[i].flag; i++) { + if ((map[i].flag & flags) == 0) { + continue; + } + node = g_malloc0(sizeof(strList)); + node->value = g_strdup(map[i].value); + node->next = list; + list = node; + } + + return list; +} + +VirtioQueueElement *qmp_x_query_virtio_queue_element(const char *path, + uint16_t queue, + bool has_index, + uint16_t index, + Error **errp) +{ + VirtIODevice *vdev; + VirtQueue *vq; + VirtioQueueElement *element = NULL; + + vdev = virtio_device_find(path); + if (vdev == NULL) { + error_setg(errp, "Path %s is not a VirtIO device", path); + return NULL; + } + + if (queue >= VIRTIO_QUEUE_MAX || !virtio_queue_get_num(vdev, queue)) { + error_setg(errp, "Invalid virtqueue number %d", queue); + return NULL; + } + vq = &vdev->vq[queue]; + + if (virtio_vdev_has_feature(vdev, VIRTIO_F_RING_PACKED)) { + error_setg(errp, "Packed ring not supported"); + return NULL; + } else { + unsigned int head, i, max; + VRingMemoryRegionCaches *caches; + MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID; + MemoryRegionCache *desc_cache; + VRingDesc desc; + VirtioRingDescList *list = NULL; + VirtioRingDescList *node; + int rc; int ndescs; + + RCU_READ_LOCK_GUARD(); + + max = vq->vring.num; + + if (!has_index) { + head = vring_avail_ring(vq, vq->last_avail_idx % vq->vring.num); + } else { + head = vring_avail_ring(vq, index % vq->vring.num); + } + i = head; + + caches = vring_get_region_caches(vq); + if (!caches) { + error_setg(errp, "Region caches not initialized"); + return NULL; + } + if (caches->desc.len < max * sizeof(VRingDesc)) { + error_setg(errp, "Cannot map descriptor ring"); + return NULL; + } + + desc_cache = &caches->desc; + vring_split_desc_read(vdev, &desc, desc_cache, i); + if (desc.flags & VRING_DESC_F_INDIRECT) { + int64_t len; + len = address_space_cache_init(&indirect_desc_cache, vdev->dma_as, + desc.addr, desc.len, false); + desc_cache = &indirect_desc_cache; + if (len < desc.len) { + error_setg(errp, "Cannot map indirect buffer"); + goto done; + } + + max = desc.len / sizeof(VRingDesc); + i = 0; + vring_split_desc_read(vdev, &desc, desc_cache, i); + } + + element = g_new0(VirtioQueueElement, 1); + element->avail = g_new0(VirtioRingAvail, 1); + element->used = g_new0(VirtioRingUsed, 1); + element->name = g_strdup(vdev->name); + element->index = head; + element->avail->flags = vring_avail_flags(vq); + element->avail->idx = vring_avail_idx(vq); + element->avail->ring = head; + element->used->flags = vring_used_flags(vq); + element->used->idx = vring_used_idx(vq); + ndescs = 0; + + do { + /* A buggy driver may produce an infinite loop */ + if (ndescs >= max) { + break; + } + node = g_new0(VirtioRingDescList, 1); + node->value = g_new0(VirtioRingDesc, 1); + node->value->addr = desc.addr; + node->value->len = desc.len; + node->value->flags = qmp_decode_vring_desc_flags(desc.flags); + node->next = list; + list = node; + + ndescs++; + rc = virtqueue_split_read_next_desc(vdev, &desc, desc_cache, + max, &i); + } while (rc == VIRTQUEUE_READ_DESC_MORE); + element->descs = list; +done: + address_space_cache_destroy(&indirect_desc_cache); + } + + return element; +} + +static const TypeInfo virtio_device_info = { + .name = TYPE_VIRTIO_DEVICE, + .parent = TYPE_DEVICE, + .instance_size = sizeof(VirtIODevice), + .class_init = virtio_device_class_init, + .instance_finalize = virtio_device_instance_finalize, + .abstract = true, + .class_size = sizeof(VirtioDeviceClass), +}; + +static void virtio_register_types(void) +{ + type_register_static(&virtio_device_info); +} + +type_init(virtio_register_types) |