summaryrefslogtreecommitdiffstats
path: root/hw/remote
diff options
context:
space:
mode:
Diffstat (limited to 'hw/remote')
-rw-r--r-- hw/remote/Kconfig 8
-rw-r--r-- hw/remote/iohub.c 117
-rw-r--r-- hw/remote/iommu.c 131
-rw-r--r-- hw/remote/machine.c 160
-rw-r--r-- hw/remote/memory.c 62
-rw-r--r-- hw/remote/meson.build 17
-rw-r--r-- hw/remote/message.c 229
-rw-r--r-- hw/remote/mpqemu-link.c 263
-rw-r--r-- hw/remote/proxy-memory-listener.c 226
-rw-r--r-- hw/remote/proxy.c 386
-rw-r--r-- hw/remote/remote-obj.c 202
-rw-r--r-- hw/remote/trace-events 15
-rw-r--r-- hw/remote/trace.h 1
-rw-r--r-- hw/remote/vfio-user-obj.c 951
14 files changed, 2768 insertions, 0 deletions
diff --git a/hw/remote/Kconfig b/hw/remote/Kconfig
new file mode 100644
index 00000000..2d6b4f4c
--- /dev/null
+++ b/hw/remote/Kconfig
@@ -0,0 +1,8 @@
+# Multi-process QEMU support (the hw/remote sources).
+config MULTIPROCESS
+ bool
+ depends on PCI && PCI_EXPRESS && KVM
+ select REMOTE_PCIHOST
+
+# vfio-user server object; only selectable when MULTIPROCESS is enabled.
+config VFIO_USER_SERVER
+ bool
+ depends on MULTIPROCESS
diff --git a/hw/remote/iohub.c b/hw/remote/iohub.c
new file mode 100644
index 00000000..40dfee4b
--- /dev/null
+++ b/hw/remote/iohub.c
@@ -0,0 +1,117 @@
+/*
+ * Remote IO Hub
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/pci/pci.h"
+#include "hw/pci/pci_ids.h"
+#include "hw/pci/pci_bus.h"
+#include "qemu/thread.h"
+#include "hw/remote/machine.h"
+#include "hw/remote/iohub.h"
+#include "qemu/main-loop.h"
+
+/*
+ * Bring every PIRQ slot of @iohub to its idle state: notifier arrays
+ * zeroed, per-line lock initialised, level counter at zero and both
+ * notifiers bound to the invalid descriptor -1.
+ */
+void remote_iohub_init(RemoteIOHubState *iohub)
+{
+    memset(&iohub->irqfds, 0, sizeof(iohub->irqfds));
+    memset(&iohub->resamplefds, 0, sizeof(iohub->resamplefds));
+
+    for (int line = 0; line < REMOTE_IOHUB_NB_PIRQS; line++) {
+        qemu_mutex_init(&iohub->irq_level_lock[line]);
+        iohub->irq_level[line] = 0;
+
+        event_notifier_init_fd(&iohub->irqfds[line], -1);
+        event_notifier_init_fd(&iohub->resamplefds[line], -1);
+    }
+}
+
+/*
+ * Undo remote_iohub_init(): for each PIRQ, detach the resample fd handler,
+ * clean up both notifiers and destroy the per-line lock.
+ */
+void remote_iohub_finalize(RemoteIOHubState *iohub)
+{
+    for (int line = 0; line < REMOTE_IOHUB_NB_PIRQS; line++) {
+        int resample_fd = event_notifier_get_fd(&iohub->resamplefds[line]);
+
+        qemu_set_fd_handler(resample_fd, NULL, NULL, NULL);
+
+        event_notifier_cleanup(&iohub->irqfds[line]);
+        event_notifier_cleanup(&iohub->resamplefds[line]);
+
+        qemu_mutex_destroy(&iohub->irq_level_lock[line]);
+    }
+}
+
+/*
+ * Map a device interrupt to a PIRQ number.  The mapping is simply the
+ * device's devfn; @intx is not consulted.
+ */
+int remote_iohub_map_irq(PCIDevice *pci_dev, int intx)
+{
+    int pirq = pci_dev->devfn;
+
+    return pirq;
+}
+
+/*
+ * Raise or lower interrupt line @pirq on the IO hub passed as @opaque.
+ *
+ * The hub keeps a per-line counter.  The irqfd is signalled only on the
+ * 0 -> 1 transition; lowering decrements the counter but never below zero.
+ */
+void remote_iohub_set_irq(void *opaque, int pirq, int level)
+{
+    RemoteIOHubState *iohub = opaque;
+
+    assert(pirq >= 0);
+    assert(pirq < PCI_DEVFN_MAX);
+
+    /* Held until the function returns, on every path. */
+    QEMU_LOCK_GUARD(&iohub->irq_level_lock[pirq]);
+
+    if (!level) {
+        if (iohub->irq_level[pirq] > 0) {
+            iohub->irq_level[pirq]--;
+        }
+        return;
+    }
+
+    iohub->irq_level[pirq]++;
+    if (iohub->irq_level[pirq] == 1) {
+        event_notifier_set(&iohub->irqfds[pirq]);
+    }
+}
+
+/*
+ * fd handler for a PIRQ's resample notifier.  @opaque is the ResampleToken
+ * registered for that line.  After consuming the resample event, the irqfd
+ * is signalled again if the line's level counter is still non-zero.
+ */
+static void intr_resample_handler(void *opaque)
+{
+    ResampleToken *token = opaque;
+    RemoteIOHubState *iohub = token->iohub;
+    int pirq = token->pirq;
+    int cleared;
+
+    cleared = event_notifier_test_and_clear(&iohub->resamplefds[pirq]);
+    assert(cleared >= 0);
+
+    QEMU_LOCK_GUARD(&iohub->irq_level_lock[pirq]);
+
+    if (iohub->irq_level[pirq] != 0) {
+        event_notifier_set(&iohub->irqfds[pirq]);
+    }
+}
+
+/*
+ * Handle a SET_IRQFD message for @pci_dev: bind msg->fds[0] as the irqfd
+ * and msg->fds[1] as the resample fd of the device's PIRQ, replacing any
+ * pair that was bound before, and register the resample handler.
+ */
+void process_set_irqfd_msg(PCIDevice *pci_dev, MPQemuMsg *msg)
+{
+    RemoteMachineState *machine = REMOTE_MACHINE(current_machine);
+    RemoteIOHubState *iohub = &machine->iohub;
+    int intx = pci_get_byte(pci_dev->config + PCI_INTERRUPT_PIN) - 1;
+    int pirq = remote_iohub_map_irq(pci_dev, intx);
+    ResampleToken *token = &iohub->token[pirq];
+
+    /* A descriptor other than -1 means this line was configured before. */
+    if (event_notifier_get_fd(&iohub->irqfds[pirq]) != -1) {
+        int old_resample_fd = event_notifier_get_fd(&iohub->resamplefds[pirq]);
+
+        qemu_set_fd_handler(old_resample_fd, NULL, NULL, NULL);
+        event_notifier_cleanup(&iohub->irqfds[pirq]);
+        event_notifier_cleanup(&iohub->resamplefds[pirq]);
+        memset(token, 0, sizeof(ResampleToken));
+    }
+
+    event_notifier_init_fd(&iohub->irqfds[pirq], msg->fds[0]);
+    event_notifier_init_fd(&iohub->resamplefds[pirq], msg->fds[1]);
+
+    token->iohub = iohub;
+    token->pirq = pirq;
+
+    qemu_set_fd_handler(msg->fds[1], intr_resample_handler, NULL, token);
+}
diff --git a/hw/remote/iommu.c b/hw/remote/iommu.c
new file mode 100644
index 00000000..1391dd71
--- /dev/null
+++ b/hw/remote/iommu.c
@@ -0,0 +1,131 @@
+/**
+ * IOMMU for remote device
+ *
+ * Copyright © 2022 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/remote/iommu.h"
+#include "hw/pci/pci_bus.h"
+#include "hw/pci/pci.h"
+#include "exec/memory.h"
+#include "exec/address-spaces.h"
+#include "trace.h"
+
+/**
+ * IOMMU for TYPE_REMOTE_MACHINE - manages DMA address space isolation
+ * for remote machine. It is used by TYPE_VFIO_USER_SERVER.
+ *
+ * - Each TYPE_VFIO_USER_SERVER instance handles one PCIDevice on a PCIBus.
+ * There is one RemoteIommu per PCIBus, so the RemoteIommu tracks multiple
+ * PCIDevices by maintaining a ->elem_by_devfn mapping.
+ *
+ * - memory_region_init_iommu() is not used because vfio-user MemoryRegions
+ * will be added to the elem->mr container instead. This is more natural
+ * than implementing the IOMMUMemoryRegionClass APIs since vfio-user
+ * provides something that is close to a full-fledged MemoryRegion and
+ * not like an IOMMU mapping.
+ *
+ * - When a device is hot unplugged, the elem->mr reference is dropped so
+ * all vfio-user MemoryRegions associated with this vfio-user server are
+ * destroyed.
+ */
+
+/*
+ * IOMMU callback installed via pci_setup_iommu(): return the address space
+ * for @devfn, creating the per-device element and/or its container memory
+ * region on demand.  @opaque is the RemoteIommu; @pci_bus is unused.
+ */
+static AddressSpace *remote_iommu_find_add_as(PCIBus *pci_bus,
+                                              void *opaque, int devfn)
+{
+    RemoteIommu *iommu = opaque;
+    void *key = INT2VOIDP(devfn);
+    RemoteIommuElem *entry;
+
+    qemu_mutex_lock(&iommu->lock);
+
+    entry = g_hash_table_lookup(iommu->elem_by_devfn, key);
+    if (entry == NULL) {
+        entry = g_new0(RemoteIommuElem, 1);
+        g_hash_table_insert(iommu->elem_by_devfn, key, entry);
+    }
+
+    /* The region is also missing after remote_iommu_unplug_dev() ran. */
+    if (entry->mr == NULL) {
+        entry->mr = MEMORY_REGION(object_new(TYPE_MEMORY_REGION));
+        memory_region_set_size(entry->mr, UINT64_MAX);
+        address_space_init(&entry->as, entry->mr, NULL);
+    }
+
+    qemu_mutex_unlock(&iommu->lock);
+
+    return &entry->as;
+}
+
+/*
+ * Tear down the per-device address space of @pci_dev and drop the
+ * reference on its container memory region.  Does nothing when the device
+ * uses the global address_space_memory.
+ */
+void remote_iommu_unplug_dev(PCIDevice *pci_dev)
+{
+    AddressSpace *dev_as = pci_device_iommu_address_space(pci_dev);
+    RemoteIommuElem *entry;
+
+    if (dev_as != &address_space_memory) {
+        entry = container_of(dev_as, RemoteIommuElem, as);
+
+        address_space_destroy(&entry->as);
+        object_unref(entry->mr);
+
+        /* Lets remote_iommu_find_add_as() recreate the region later. */
+        entry->mr = NULL;
+    }
+}
+
+/*
+ * Instance initialiser: create the devfn -> element table (values are
+ * released with g_free) and the lock that guards it.
+ */
+static void remote_iommu_init(Object *obj)
+{
+    RemoteIommu *self = REMOTE_IOMMU(obj);
+    GHashTable *table = g_hash_table_new_full(NULL, NULL, NULL, g_free);
+
+    self->elem_by_devfn = table;
+    qemu_mutex_init(&self->lock);
+}
+
+/*
+ * Instance finaliser: destroy the lock, then the element table, and clear
+ * the table pointer.
+ */
+static void remote_iommu_finalize(Object *obj)
+{
+    RemoteIommu *self = REMOTE_IOMMU(obj);
+
+    qemu_mutex_destroy(&self->lock);
+    g_hash_table_destroy(self->elem_by_devfn);
+    self->elem_by_devfn = NULL;
+}
+
+/*
+ * Create a RemoteIommu, install it as the IOMMU of @pci_bus and parent it
+ * to the bus as the "remote-iommu" child.  The local reference is dropped
+ * once the child property has been added.
+ */
+void remote_iommu_setup(PCIBus *pci_bus)
+{
+    RemoteIommu *iommu;
+
+    g_assert(pci_bus);
+
+    iommu = REMOTE_IOMMU(object_new(TYPE_REMOTE_IOMMU));
+    pci_setup_iommu(pci_bus, remote_iommu_find_add_as, iommu);
+
+    object_property_add_child(OBJECT(pci_bus), "remote-iommu", OBJECT(iommu));
+    object_unref(OBJECT(iommu));
+}
+
+/* QOM description of TYPE_REMOTE_IOMMU, a plain object type. */
+static const TypeInfo remote_iommu_info = {
+    .name              = TYPE_REMOTE_IOMMU,
+    .parent            = TYPE_OBJECT,
+    .instance_size     = sizeof(RemoteIommu),
+    .instance_init     = remote_iommu_init,
+    .instance_finalize = remote_iommu_finalize,
+};
+
+/* Register the type with QOM. */
+static void remote_iommu_register_types(void)
+{
+    type_register_static(&remote_iommu_info);
+}
+
+type_init(remote_iommu_register_types)
diff --git a/hw/remote/machine.c b/hw/remote/machine.c
new file mode 100644
index 00000000..75d550da
--- /dev/null
+++ b/hw/remote/machine.c
@@ -0,0 +1,160 @@
+/*
+ * Machine for remote device
+ *
+ * This machine type is used by the remote device process in multi-process
+ * QEMU. QEMU device models depend on parent busses, interrupt controllers,
+ * memory regions, etc. The remote machine type offers this environment so
+ * that QEMU device models can be used as remote devices.
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/remote/machine.h"
+#include "exec/memory.h"
+#include "qapi/error.h"
+#include "hw/pci/pci_host.h"
+#include "hw/remote/iohub.h"
+#include "hw/remote/iommu.h"
+#include "hw/qdev-core.h"
+#include "hw/remote/iommu.h"
+#include "hw/remote/vfio-user-obj.h"
+#include "hw/pci/msi.h"
+
+/*
+ * MachineClass::init for the remote machine.
+ *
+ * Creates the "pci" container region and the remote PCI host, wires the
+ * host to the system memory/IO regions, realizes it, then sets up
+ * interrupts: vfio-user mode uses the remote IOMMU and the vfio-user IRQ
+ * hooks, otherwise the IO hub is used.  The machine becomes the hotplug
+ * handler of the PCI bus.
+ */
+static void remote_machine_init(MachineState *machine)
+{
+    RemoteMachineState *s = REMOTE_MACHINE(machine);
+    MemoryRegion *sysmem = get_system_memory();
+    MemoryRegion *sysio = get_system_io();
+    MemoryRegion *pcimem = g_new(MemoryRegion, 1);
+    RemotePCIHost *host;
+    PCIHostState *bridge;
+
+    memory_region_init(pcimem, NULL, "pci", UINT64_MAX);
+
+    host = REMOTE_PCIHOST(qdev_new(TYPE_REMOTE_PCIHOST));
+    host->mr_pci_mem = pcimem;
+    host->mr_sys_mem = sysmem;
+    host->mr_sys_io = sysio;
+    s->host = host;
+
+    object_property_add_child(OBJECT(s), "remote-pcihost", OBJECT(host));
+    memory_region_add_subregion_overlap(sysmem, 0x0, pcimem, -1);
+
+    qdev_realize(DEVICE(host), sysbus_get_default(), &error_fatal);
+
+    bridge = PCI_HOST_BRIDGE(host);
+
+    if (!s->vfio_user) {
+        remote_iohub_init(&s->iohub);
+
+        pci_bus_irqs(bridge->bus, remote_iohub_set_irq, remote_iohub_map_irq,
+                     &s->iohub, REMOTE_IOHUB_NB_PIRQS);
+    } else {
+        remote_iommu_setup(bridge->bus);
+
+        msi_nonbroken = true;
+
+        vfu_object_set_bus_irq(bridge->bus);
+    }
+
+    qbus_set_hotplug_handler(BUS(bridge->bus), OBJECT(s));
+}
+
+/* Getter for the "vfio-user" property; @errp is not used. */
+static bool remote_machine_get_vfio_user(Object *obj, Error **errp)
+{
+    return REMOTE_MACHINE(obj)->vfio_user;
+}
+
+/*
+ * Setter for the "vfio-user" property.  The value is refused with an
+ * error once the PHASE_MACHINE_CREATED phase has been reached.
+ */
+static void remote_machine_set_vfio_user(Object *obj, bool value, Error **errp)
+{
+    RemoteMachineState *rms = REMOTE_MACHINE(obj);
+
+    if (!phase_check(PHASE_MACHINE_CREATED)) {
+        rms->vfio_user = value;
+        return;
+    }
+
+    error_setg(errp, "Error enabling vfio-user - machine already created");
+}
+
+/* Getter for the "auto-shutdown" property; @errp is not used. */
+static bool remote_machine_get_auto_shutdown(Object *obj, Error **errp)
+{
+    return REMOTE_MACHINE(obj)->auto_shutdown;
+}
+
+/* Setter for the "auto-shutdown" property; always succeeds. */
+static void remote_machine_set_auto_shutdown(Object *obj, bool value,
+                                             Error **errp)
+{
+    REMOTE_MACHINE(obj)->auto_shutdown = value;
+}
+
+/* Instance initialiser: "auto-shutdown" defaults to enabled. */
+static void remote_machine_instance_init(Object *obj)
+{
+    REMOTE_MACHINE(obj)->auto_shutdown = true;
+}
+
+/*
+ * HotplugHandler::unplug: unrealize @dev and, when it is a PCI device,
+ * release its remote IOMMU state.  @hotplug_dev and @errp are unused.
+ */
+static void remote_machine_dev_unplug_cb(HotplugHandler *hotplug_dev,
+                                         DeviceState *dev, Error **errp)
+{
+    qdev_unrealize(dev);
+
+    if (!object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
+        return;
+    }
+
+    remote_iommu_unplug_dev(PCI_DEVICE(dev));
+}
+
+/*
+ * Class initialiser: hook up machine init and the unplug handler, and
+ * expose the boolean "vfio-user" and "auto-shutdown" properties.
+ */
+static void remote_machine_class_init(ObjectClass *oc, void *data)
+{
+    MachineClass *machine_class = MACHINE_CLASS(oc);
+    HotplugHandlerClass *hotplug_class = HOTPLUG_HANDLER_CLASS(oc);
+
+    machine_class->init = remote_machine_init;
+    machine_class->desc = "Experimental remote machine";
+
+    hotplug_class->unplug = remote_machine_dev_unplug_cb;
+
+    object_class_property_add_bool(oc, "vfio-user",
+                                   remote_machine_get_vfio_user,
+                                   remote_machine_set_vfio_user);
+    object_class_property_add_bool(oc, "auto-shutdown",
+                                   remote_machine_get_auto_shutdown,
+                                   remote_machine_set_auto_shutdown);
+}
+
+/* QOM description of TYPE_REMOTE_MACHINE; it implements HotplugHandler. */
+static const TypeInfo remote_machine = {
+    .name          = TYPE_REMOTE_MACHINE,
+    .parent        = TYPE_MACHINE,
+    .instance_size = sizeof(RemoteMachineState),
+    .instance_init = remote_machine_instance_init,
+    .class_init    = remote_machine_class_init,
+    .interfaces    = (InterfaceInfo[]) {
+        { TYPE_HOTPLUG_HANDLER },
+        { }
+    }
+};
+
+/* Register the type with QOM. */
+static void remote_machine_register_types(void)
+{
+    type_register_static(&remote_machine);
+}
+
+type_init(remote_machine_register_types);
diff --git a/hw/remote/memory.c b/hw/remote/memory.c
new file mode 100644
index 00000000..6d60da91
--- /dev/null
+++ b/hw/remote/memory.c
@@ -0,0 +1,62 @@
+/*
+ * Memory manager for remote device
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/remote/memory.h"
+#include "exec/ram_addr.h"
+#include "qapi/error.h"
+
+/*
+ * Detach and unparent every RAM subregion currently attached to the
+ * system memory region.  Non-RAM subregions are left alone.
+ */
+static void remote_sysmem_reset(void)
+{
+    MemoryRegion *root = get_system_memory();
+    MemoryRegion *child, *next_child;
+
+    /* _SAFE variant: entries are removed while the list is walked. */
+    QTAILQ_FOREACH_SAFE(child, &root->subregions, subregions_link,
+                        next_child) {
+        if (!child->ram) {
+            continue;
+        }
+
+        memory_region_del_subregion(root, child);
+        object_unparent(OBJECT(child));
+    }
+}
+
+/*
+ * Rebuild system RAM from a SYNC_SYSMEM message.
+ *
+ * Existing RAM subregions are dropped, then one fd-backed shared RAM
+ * region is created per descriptor in @msg and mapped at the guest
+ * physical address given in the message.  If a region cannot be created,
+ * @errp is set and everything added so far is removed again.
+ */
+void remote_sysmem_reconfig(MPQemuMsg *msg, Error **errp)
+{
+    ERRP_GUARD();
+    /* Keeps region names unique across calls. */
+    static unsigned int suffix;
+    SyncSysmemMsg *info = &msg->data.sync_sysmem;
+    MemoryRegion *root = get_system_memory();
+    int idx;
+
+    remote_sysmem_reset();
+
+    for (idx = 0; idx < msg->num_fds; idx++, suffix++) {
+        g_autofree char *name = g_strdup_printf("remote-mem-%u", suffix);
+        MemoryRegion *ram = g_new(MemoryRegion, 1);
+
+        memory_region_init_ram_from_fd(ram, NULL, name, info->sizes[idx],
+                                       RAM_SHARED, msg->fds[idx],
+                                       info->offsets[idx], errp);
+        if (*errp) {
+            g_free(ram);
+            remote_sysmem_reset();
+            return;
+        }
+
+        memory_region_add_subregion(root, info->gpas[idx], ram);
+    }
+}
diff --git a/hw/remote/meson.build b/hw/remote/meson.build
new file mode 100644
index 00000000..ab25c049
--- /dev/null
+++ b/hw/remote/meson.build
@@ -0,0 +1,17 @@
+# Sources of the multi-process (remote device) support.
+remote_ss = ss.source_set()
+
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files(
+  'machine.c',
+  'mpqemu-link.c',
+  'message.c',
+  'remote-obj.c',
+  'proxy.c',
+  'iohub.c',
+  'iommu.c',
+))
+
+# vfio-user server object together with its library dependency.
+remote_ss.add(when: 'CONFIG_VFIO_USER_SERVER',
+              if_true: [files('vfio-user-obj.c'), libvfio_user_dep])
+
+specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files(
+  'memory.c',
+  'proxy-memory-listener.c',
+))
+
+softmmu_ss.add_all(when: 'CONFIG_MULTIPROCESS', if_true: remote_ss)
diff --git a/hw/remote/message.c b/hw/remote/message.c
new file mode 100644
index 00000000..50f6bf2d
--- /dev/null
+++ b/hw/remote/message.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright © 2020, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL-v2, version 2 or later.
+ *
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/remote/machine.h"
+#include "io/channel.h"
+#include "hw/remote/mpqemu-link.h"
+#include "qapi/error.h"
+#include "sysemu/runstate.h"
+#include "hw/pci/pci.h"
+#include "exec/memattrs.h"
+#include "hw/remote/memory.h"
+#include "hw/remote/iohub.h"
+#include "sysemu/reset.h"
+
+static void process_config_write(QIOChannel *ioc, PCIDevice *dev,
+ MPQemuMsg *msg, Error **errp);
+static void process_config_read(QIOChannel *ioc, PCIDevice *dev,
+ MPQemuMsg *msg, Error **errp);
+static void process_bar_write(QIOChannel *ioc, MPQemuMsg *msg, Error **errp);
+static void process_bar_read(QIOChannel *ioc, MPQemuMsg *msg, Error **errp);
+static void process_device_reset_msg(QIOChannel *ioc, PCIDevice *dev,
+ Error **errp);
+
+/*
+ * Coroutine body of the remote process: receive messages from the proxy
+ * on com->ioc and dispatch them to the per-command handlers until a
+ * receive fails, a message is invalid, or a handler reports an error.
+ *
+ * @data is a RemoteCommDev that this function takes ownership of (it is
+ * freed on return).  When the loop ends a shutdown is requested: with
+ * cause HOST_ERROR if an error was recorded, GUEST_SHUTDOWN otherwise.
+ */
+void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
+{
+    g_autofree RemoteCommDev *com = (RemoteCommDev *)data;
+    PCIDevice *pci_dev = NULL;
+    Error *local_err = NULL;
+
+    assert(com->ioc);
+
+    pci_dev = com->dev;
+    while (!local_err) {
+        MPQemuMsg msg = {0};
+
+        if (!mpqemu_msg_recv(&msg, com->ioc, &local_err)) {
+            break;
+        }
+
+        if (!mpqemu_msg_valid(&msg)) {
+            /*
+             * The two literals are concatenated, so the trailing space is
+             * needed to keep "proxy" and "in" apart.
+             */
+            error_setg(&local_err, "Received invalid message from proxy "
+                                   "in remote process pid="FMT_pid"",
+                                   getpid());
+            break;
+        }
+
+        switch (msg.cmd) {
+        case MPQEMU_CMD_PCI_CFGWRITE:
+            process_config_write(com->ioc, pci_dev, &msg, &local_err);
+            break;
+        case MPQEMU_CMD_PCI_CFGREAD:
+            process_config_read(com->ioc, pci_dev, &msg, &local_err);
+            break;
+        case MPQEMU_CMD_BAR_WRITE:
+            process_bar_write(com->ioc, &msg, &local_err);
+            break;
+        case MPQEMU_CMD_BAR_READ:
+            process_bar_read(com->ioc, &msg, &local_err);
+            break;
+        case MPQEMU_CMD_SYNC_SYSMEM:
+            remote_sysmem_reconfig(&msg, &local_err);
+            break;
+        case MPQEMU_CMD_SET_IRQFD:
+            process_set_irqfd_msg(pci_dev, &msg);
+            break;
+        case MPQEMU_CMD_DEVICE_RESET:
+            process_device_reset_msg(com->ioc, pci_dev, &local_err);
+            break;
+        default:
+            /* Setting local_err ends the loop on the next check. */
+            error_setg(&local_err,
+                       "Unknown command (%d) received for device %s"
+                       " (pid="FMT_pid")",
+                       msg.cmd, DEVICE(pci_dev)->id, getpid());
+            break;
+        }
+    }
+
+    if (local_err) {
+        error_report_err(local_err);
+        qemu_system_shutdown_request(SHUTDOWN_CAUSE_HOST_ERROR);
+    } else {
+        qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
+    }
+}
+
+/*
+ * Handle a PCI config-space write request for @dev and send the reply
+ * on @ioc.
+ *
+ * A request whose address window or length is out of range is not
+ * executed: @errp is set and the reply carries UINT64_MAX.  If the reply
+ * cannot be sent, @errp describes that as well (prefixed to an earlier
+ * error when there is one).
+ */
+static void process_config_write(QIOChannel *ioc, PCIDevice *dev,
+                                 MPQemuMsg *msg, Error **errp)
+{
+    ERRP_GUARD();
+    PciConfDataMsg *conf = (PciConfDataMsg *)&msg->data.pci_conf_data;
+    MPQemuMsg ret = { 0 };
+
+    if ((conf->addr + sizeof(conf->val)) > pci_config_size(dev)) {
+        error_setg(errp, "Bad address for PCI config write, pid "FMT_pid".",
+                   getpid());
+        ret.data.u64 = UINT64_MAX;
+    } else if (conf->len > sizeof(conf->val)) {
+        /*
+         * The window checked above is sizeof(conf->val) bytes wide, so a
+         * longer access would not be covered by that check.
+         */
+        error_setg(errp, "Bad length for PCI config write, pid "FMT_pid".",
+                   getpid());
+        ret.data.u64 = UINT64_MAX;
+    } else {
+        pci_default_write_config(dev, conf->addr, conf->val, conf->len);
+    }
+
+    ret.cmd = MPQEMU_CMD_RET;
+    ret.size = sizeof(ret.data.u64);
+
+    if (!mpqemu_msg_send(&ret, ioc, NULL)) {
+        /* error_prepend() needs an existing error; create one if none. */
+        if (*errp) {
+            error_prepend(errp,
+                          "Error returning code to proxy, pid "FMT_pid": ",
+                          getpid());
+        } else {
+            error_setg(errp, "Error returning code to proxy, pid "FMT_pid,
+                       getpid());
+        }
+    }
+}
+
+/*
+ * Handle a PCI config-space read request for @dev and send the value
+ * read back on @ioc.
+ *
+ * A request whose address window or length is out of range is not
+ * executed: @errp is set and the reply carries UINT64_MAX.  If the reply
+ * cannot be sent, @errp describes that as well (prefixed to an earlier
+ * error when there is one).
+ */
+static void process_config_read(QIOChannel *ioc, PCIDevice *dev,
+                                MPQemuMsg *msg, Error **errp)
+{
+    ERRP_GUARD();
+    PciConfDataMsg *conf = (PciConfDataMsg *)&msg->data.pci_conf_data;
+    MPQemuMsg ret = { 0 };
+
+    if ((conf->addr + sizeof(conf->val)) > pci_config_size(dev)) {
+        error_setg(errp, "Bad address for PCI config read, pid "FMT_pid".",
+                   getpid());
+        ret.data.u64 = UINT64_MAX;
+    } else if (conf->len > sizeof(conf->val)) {
+        /*
+         * The window checked above is sizeof(conf->val) bytes wide, so a
+         * longer access would not be covered by that check.
+         */
+        error_setg(errp, "Bad length for PCI config read, pid "FMT_pid".",
+                   getpid());
+        ret.data.u64 = UINT64_MAX;
+    } else {
+        ret.data.u64 = pci_default_read_config(dev, conf->addr, conf->len);
+    }
+
+    ret.cmd = MPQEMU_CMD_RET;
+    ret.size = sizeof(ret.data.u64);
+
+    if (!mpqemu_msg_send(&ret, ioc, NULL)) {
+        /* error_prepend() needs an existing error; create one if none. */
+        if (*errp) {
+            error_prepend(errp,
+                          "Error returning code to proxy, pid "FMT_pid": ",
+                          getpid());
+        } else {
+            error_setg(errp, "Error returning code to proxy, pid "FMT_pid,
+                       getpid());
+        }
+    }
+}
+
+/*
+ * Handle a BAR write request: store bar_access->val (as little-endian
+ * bytes) at bar_access->addr in the memory or I/O address space, then
+ * send the reply on @ioc.
+ *
+ * An access size that is not a power of two or exceeds 8 bytes is
+ * answered with UINT64_MAX without touching memory and without setting
+ * @errp.  A failed memory transaction sets @errp and also answers with
+ * all-ones.  If the reply cannot be sent, @errp describes that (prefixed
+ * to an earlier error when there is one).
+ */
+static void process_bar_write(QIOChannel *ioc, MPQemuMsg *msg, Error **errp)
+{
+    ERRP_GUARD();
+    BarAccessMsg *bar_access = &msg->data.bar_access;
+    AddressSpace *as =
+        bar_access->memory ? &address_space_memory : &address_space_io;
+    MPQemuMsg ret = { 0 };
+    MemTxResult res;
+    uint64_t val;
+
+    if (!is_power_of_2(bar_access->size) ||
+       (bar_access->size > sizeof(uint64_t))) {
+        ret.data.u64 = UINT64_MAX;
+        goto fail;
+    }
+
+    val = cpu_to_le64(bar_access->val);
+
+    res = address_space_rw(as, bar_access->addr, MEMTXATTRS_UNSPECIFIED,
+                           (void *)&val, bar_access->size, true);
+
+    if (res != MEMTX_OK) {
+        error_setg(errp, "Bad address %"PRIx64" for mem write, pid "FMT_pid".",
+                   bar_access->addr, getpid());
+        ret.data.u64 = -1;
+    }
+
+fail:
+    ret.cmd = MPQEMU_CMD_RET;
+    ret.size = sizeof(ret.data.u64);
+
+    if (!mpqemu_msg_send(&ret, ioc, NULL)) {
+        /*
+         * No error is set on the invalid-size path, and error_prepend()
+         * needs an existing error; create one if none.
+         */
+        if (*errp) {
+            error_prepend(errp,
+                          "Error returning code to proxy, pid "FMT_pid": ",
+                          getpid());
+        } else {
+            error_setg(errp, "Error returning code to proxy, pid "FMT_pid,
+                       getpid());
+        }
+    }
+}
+
+/*
+ * Handle a BAR read request: load bar_access->size bytes from
+ * bar_access->addr in the memory or I/O address space and send the value
+ * (converted from little-endian) back on @ioc.
+ *
+ * An access size that is not a power of two or exceeds 8 bytes is
+ * answered with UINT64_MAX without touching memory and without setting
+ * @errp.  A failed memory transaction sets @errp and also answers with
+ * UINT64_MAX.  If the reply cannot be sent, @errp describes that
+ * (prefixed to an earlier error when there is one).
+ */
+static void process_bar_read(QIOChannel *ioc, MPQemuMsg *msg, Error **errp)
+{
+    ERRP_GUARD();
+    BarAccessMsg *bar_access = &msg->data.bar_access;
+    MPQemuMsg ret = { 0 };
+    AddressSpace *as;
+    MemTxResult res;
+    uint64_t val = 0;
+
+    as = bar_access->memory ? &address_space_memory : &address_space_io;
+
+    if (!is_power_of_2(bar_access->size) ||
+       (bar_access->size > sizeof(uint64_t))) {
+        val = UINT64_MAX;
+        goto fail;
+    }
+
+    res = address_space_rw(as, bar_access->addr, MEMTXATTRS_UNSPECIFIED,
+                           (void *)&val, bar_access->size, false);
+
+    if (res != MEMTX_OK) {
+        error_setg(errp, "Bad address %"PRIx64" for mem read, pid "FMT_pid".",
+                   bar_access->addr, getpid());
+        val = UINT64_MAX;
+    }
+
+fail:
+    ret.cmd = MPQEMU_CMD_RET;
+    ret.data.u64 = le64_to_cpu(val);
+    ret.size = sizeof(ret.data.u64);
+
+    if (!mpqemu_msg_send(&ret, ioc, NULL)) {
+        /*
+         * No error is set on the invalid-size path, and error_prepend()
+         * needs an existing error; create one if none.
+         */
+        if (*errp) {
+            error_prepend(errp,
+                          "Error returning code to proxy, pid "FMT_pid": ",
+                          getpid());
+        } else {
+            error_setg(errp, "Error returning code to proxy, pid "FMT_pid,
+                       getpid());
+        }
+    }
+}
+
+/*
+ * Handle a DEVICE_RESET request: invoke the device class reset hook when
+ * one is set, then acknowledge with an empty MPQEMU_CMD_RET message.
+ * A failure to send the acknowledgement is reported through @errp.
+ */
+static void process_device_reset_msg(QIOChannel *ioc, PCIDevice *dev,
+                                     Error **errp)
+{
+    DeviceClass *klass = DEVICE_GET_CLASS(dev);
+    DeviceState *qdev = DEVICE(dev);
+    MPQemuMsg ack = { 0 };
+
+    if (klass->reset != NULL) {
+        klass->reset(qdev);
+    }
+
+    ack.cmd = MPQEMU_CMD_RET;
+    mpqemu_msg_send(&ack, ioc, errp);
+}
diff --git a/hw/remote/mpqemu-link.c b/hw/remote/mpqemu-link.c
new file mode 100644
index 00000000..9bd98e82
--- /dev/null
+++ b/hw/remote/mpqemu-link.c
@@ -0,0 +1,263 @@
+/*
+ * Communication channel between QEMU and remote device process
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+
+#include "qemu/module.h"
+#include "hw/remote/mpqemu-link.h"
+#include "qapi/error.h"
+#include "qemu/iov.h"
+#include "qemu/error-report.h"
+#include "qemu/main-loop.h"
+#include "io/channel.h"
+#include "sysemu/iothread.h"
+#include "trace.h"
+
+/*
+ * Send @msg (header, payload and any attached fds) over @ioc.
+ *
+ * Safe to call from:
+ * - the main loop, in coroutine context (outside a coroutine it blocks
+ *   the main loop);
+ * - a vCPU thread outside coroutine context, provided the channel is not
+ *   handled by the main loop;
+ * - an IOThread, in coroutine context only (asserted below).
+ *
+ * Returns true if no errors were encountered, false otherwise.
+ */
+bool mpqemu_msg_send(MPQemuMsg *msg, QIOChannel *ioc, Error **errp)
+{
+    bool iolock = qemu_mutex_iothread_locked();
+    bool iothread = qemu_in_iothread();
+    struct iovec send[2] = {
+        { .iov_base = msg, .iov_len = MPQEMU_MSG_HDR_SIZE },
+        { .iov_base = (void *)&msg->data, .iov_len = msg->size },
+    };
+    int *fds = msg->num_fds ? msg->fds : NULL;
+    size_t nfds = msg->num_fds;
+    bool drop_lock;
+    bool ret;
+
+    /* An IOThread must not block, so it may only send from a coroutine. */
+    assert(qemu_in_coroutine() || !iothread);
+
+    /*
+     * The iothread lock is released around the write only when it is held
+     * by a non-IOThread caller running outside coroutine context.
+     */
+    drop_lock = iolock && !iothread && !qemu_in_coroutine();
+
+    if (drop_lock) {
+        qemu_mutex_unlock_iothread();
+    }
+
+    ret = qio_channel_writev_full_all(ioc, send, G_N_ELEMENTS(send),
+                                      fds, nfds, 0, errp) == 0;
+    if (!ret) {
+        trace_mpqemu_send_io_error(msg->cmd, msg->size, nfds);
+    }
+
+    if (drop_lock) {
+        qemu_mutex_lock_iothread();
+    }
+
+    return ret;
+}
+
+/*
+ * Read exactly @len bytes from @ioc into @buf, collecting any received
+ * file descriptors in @fds / @nfds.
+ *
+ * Safe to call from:
+ * - the main loop, in coroutine context (outside a coroutine it blocks
+ *   the main loop);
+ * - a vCPU thread outside coroutine context, provided the channel is not
+ *   handled by the main loop;
+ * - an IOThread, in coroutine context only (asserted below).
+ *
+ * Returns the channel's result unchanged when it is <= 0, otherwise @len.
+ */
+static ssize_t mpqemu_read(QIOChannel *ioc, void *buf, size_t len, int **fds,
+                           size_t *nfds, Error **errp)
+{
+    struct iovec iov = { .iov_base = buf, .iov_len = len };
+    bool iolock = qemu_mutex_iothread_locked();
+    bool iothread = qemu_in_iothread();
+    bool drop_lock;
+    int rc = -1;
+
+    /* An IOThread must not block, so it may only read from a coroutine. */
+    assert(qemu_in_coroutine() || !iothread);
+
+    drop_lock = iolock && !iothread && !qemu_in_coroutine();
+
+    if (drop_lock) {
+        qemu_mutex_unlock_iothread();
+    }
+
+    rc = qio_channel_readv_full_all_eof(ioc, &iov, 1, fds, nfds, errp);
+
+    if (drop_lock) {
+        qemu_mutex_lock_iothread();
+    }
+
+    if (rc <= 0) {
+        return rc;
+    }
+    return iov.iov_len;
+}
+
+bool mpqemu_msg_recv(MPQemuMsg *msg, QIOChannel *ioc, Error **errp)
+{ /*
+ * Receive one message from @ioc into @msg: the fixed-size header (with
+ * any file descriptors), then msg->size bytes of payload when the size
+ * is non-zero.  Received descriptors are copied into msg->fds.
+ * Returns true on success.  Returns false when a read yields a
+ * non-positive result or the message is malformed; if @errp was set,
+ * every descriptor received is closed.
+ */
+ ERRP_GUARD();
+ g_autofree int *fds = NULL; /* fd array filled in by mpqemu_read(); auto-freed */
+ size_t nfds = 0;
+ ssize_t len;
+ bool ret = false;
+
+ len = mpqemu_read(ioc, msg, MPQEMU_MSG_HDR_SIZE, &fds, &nfds, errp); /* header + fds */
+ if (len <= 0) {
+ goto fail;
+ } else if (len != MPQEMU_MSG_HDR_SIZE) {
+ error_setg(errp, "Message header corrupted");
+ goto fail;
+ }
+
+ if (msg->size > sizeof(msg->data)) { /* payload must fit the data member */
+ error_setg(errp, "Invalid size for message");
+ goto fail;
+ }
+
+ if (!msg->size) { /* header-only message: no payload read */
+ goto copy_fds;
+ }
+
+ len = mpqemu_read(ioc, &msg->data, msg->size, NULL, NULL, errp); /* payload, no fds */
+ if (len <= 0) {
+ goto fail;
+ }
+ if (len != msg->size) {
+ error_setg(errp, "Unable to read full message");
+ goto fail;
+ }
+
+copy_fds:
+ msg->num_fds = nfds;
+ if (nfds > G_N_ELEMENTS(msg->fds)) { /* more fds than msg->fds can hold */
+ error_setg(errp,
+ "Overflow error: received %zu fds, more than max of %d fds",
+ nfds, REMOTE_MAX_FDS);
+ goto fail;
+ }
+ if (nfds) {
+ memcpy(msg->fds, fds, nfds * sizeof(int));
+ }
+
+ ret = true;
+
+fail: /* also reached on success; the steps below act only when *errp is set */
+ if (*errp) {
+ trace_mpqemu_recv_io_error(msg->cmd, msg->size, nfds);
+ }
+ while (*errp && nfds) { /* close received fds, last to first */
+ close(fds[nfds - 1]);
+ nfds--;
+ }
+
+ return ret;
+}
+
+/*
+ * Send @msg to the remote process and wait for its reply, which must be
+ * a valid message with command code MPQEMU_CMD_RET.
+ *
+ * Returns the u64 value carried by the reply, or UINT64_MAX on error.
+ * Must not be called from coroutine context (asserted).  The exchange is
+ * serialised with pdev->io_mutex.
+ * Used by the Proxy object to communicate with remote processes.
+ */
+uint64_t mpqemu_msg_send_and_await_reply(MPQemuMsg *msg, PCIProxyDev *pdev,
+                                         Error **errp)
+{
+    MPQemuMsg reply = {0};
+
+    assert(!qemu_in_coroutine());
+
+    QEMU_LOCK_GUARD(&pdev->io_mutex);
+
+    if (!mpqemu_msg_send(msg, pdev->ioc, errp) ||
+        !mpqemu_msg_recv(&reply, pdev->ioc, errp)) {
+        return UINT64_MAX;
+    }
+
+    if (mpqemu_msg_valid(&reply) && reply.cmd == MPQEMU_CMD_RET) {
+        return reply.data.u64;
+    }
+
+    error_setg(errp, "ERROR: Invalid reply received for command %d",
+               msg->cmd);
+    return UINT64_MAX;
+}
+
+/*
+ * Sanity-check a received message.
+ *
+ * Checks that the command code is in range, that the number of attached
+ * descriptors does not exceed REMOTE_MAX_FDS, that every attached
+ * descriptor is open, and that payload size and descriptor count match
+ * what the command expects.  Returns true when all checks pass.
+ */
+bool mpqemu_msg_valid(MPQemuMsg *msg)
+{
+    if (msg->cmd >= MPQEMU_CMD_MAX || msg->cmd < 0) {
+        return false;
+    }
+
+    /*
+     * Verify FDs.  Exactly REMOTE_MAX_FDS descriptors is a legal message:
+     * mpqemu_msg_recv() accepts that many and the proxy memory listener
+     * may send that many, so only larger counts are rejected.
+     */
+    if (msg->num_fds > REMOTE_MAX_FDS) {
+        return false;
+    }
+
+    for (int i = 0; i < msg->num_fds; i++) {
+        /* F_GETFL fails on a descriptor that is not open. */
+        if (fcntl(msg->fds[i], F_GETFL) == -1) {
+            return false;
+        }
+    }
+
+    /* Verify message specific fields. */
+    switch (msg->cmd) {
+    case MPQEMU_CMD_SYNC_SYSMEM:
+        if (msg->num_fds == 0 || msg->size != sizeof(SyncSysmemMsg)) {
+            return false;
+        }
+        break;
+    case MPQEMU_CMD_PCI_CFGWRITE:
+    case MPQEMU_CMD_PCI_CFGREAD:
+        if (msg->size != sizeof(PciConfDataMsg)) {
+            return false;
+        }
+        break;
+    case MPQEMU_CMD_BAR_WRITE:
+    case MPQEMU_CMD_BAR_READ:
+        if ((msg->size != sizeof(BarAccessMsg)) || (msg->num_fds != 0)) {
+            return false;
+        }
+        break;
+    case MPQEMU_CMD_SET_IRQFD:
+        if (msg->size || (msg->num_fds != 2)) {
+            return false;
+        }
+        break;
+    default:
+        break;
+    }
+
+    return true;
+}
diff --git a/hw/remote/proxy-memory-listener.c b/hw/remote/proxy-memory-listener.c
new file mode 100644
index 00000000..eb9918fe
--- /dev/null
+++ b/hw/remote/proxy-memory-listener.c
@@ -0,0 +1,226 @@
+/*
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+
+#include "qemu/compiler.h"
+#include "qemu/int128.h"
+#include "qemu/range.h"
+#include "exec/memory.h"
+#include "exec/cpu-common.h"
+#include "exec/ram_addr.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "hw/remote/mpqemu-link.h"
+#include "hw/remote/proxy-memory-listener.h"
+
+/*
+ * TODO: get_fd_from_hostaddr(), proxy_mrs_can_merge() and
+ * proxy_memory_listener_commit() defined below perform tasks similar to the
+ * functions defined in vhost-user.c. These functions are good candidates
+ * for refactoring.
+ *
+ */
+
+/*
+ * Drop all recorded memory sections: release the reference held on each
+ * section's region, free the array and reset the count.  Also installed
+ * as the listener's "begin" callback.
+ */
+static void proxy_memory_listener_reset(MemoryListener *listener)
+{
+    ProxyMemoryListener *pl = container_of(listener, ProxyMemoryListener,
+                                           listener);
+
+    for (int i = 0; i < pl->n_mr_sections; i++) {
+        memory_region_unref(pl->mr_sections[i].mr);
+    }
+
+    g_free(pl->mr_sections);
+
+    pl->mr_sections = NULL;
+    pl->n_mr_sections = 0;
+}
+
+/*
+ * Return the file descriptor of the memory region that contains host
+ * address @host.  When @offset is non-NULL it receives the offset that
+ * memory_region_from_host() reported for that address.
+ */
+static int get_fd_from_hostaddr(uint64_t host, ram_addr_t *offset)
+{
+    void *hva = (void *)(uintptr_t)host;
+    ram_addr_t region_off;
+    MemoryRegion *region;
+
+    /**
+     * The address comes from the MemoryListener system and is assumed to
+     * be valid.  For an invalid address the call below would return the
+     * default subregion of "system_memory" rather than NULL, so a NULL
+     * check is not possible here.
+     */
+    region = memory_region_from_host(hva, &region_off);
+
+    if (offset != NULL) {
+        *offset = region_off;
+    }
+
+    return memory_region_get_fd(region);
+}
+
+/*
+ * Two host ranges can be merged when @host starts exactly @size bytes
+ * after @prev_host and both addresses are backed by the same fd.
+ */
+static bool proxy_mrs_can_merge(uint64_t host, uint64_t prev_host, size_t size)
+{
+    bool contiguous = (prev_host + size) == host;
+
+    /* The fd lookups only happen for contiguous ranges. */
+    return contiguous &&
+           get_fd_from_hostaddr(host, NULL) ==
+           get_fd_from_hostaddr(prev_host, NULL);
+}
+
+static bool try_merge(ProxyMemoryListener *proxy_listener,
+ MemoryRegionSection *section)
+{ /*
+ * Try to fold @section into the most recently recorded section.
+ * Returns true when the caller must not record @section separately:
+ * either it was merged, or it has no backing fd.  Returns false when
+ * nothing is recorded yet or the section cannot be merged.
+ */
+ uint64_t mrs_size, mrs_gpa, mrs_page;
+ MemoryRegionSection *prev_sec;
+ bool merged = false;
+ uintptr_t mrs_host;
+ RAMBlock *mrs_rb;
+
+ if (!proxy_listener->n_mr_sections) { /* nothing recorded to merge with */
+ return false;
+ }
+
+ mrs_rb = section->mr->ram_block;
+ mrs_page = (uint64_t)qemu_ram_pagesize(mrs_rb);
+ mrs_size = int128_get64(section->size);
+ mrs_gpa = section->offset_within_address_space;
+ mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
+ section->offset_within_region;
+
+ if (get_fd_from_hostaddr(mrs_host, NULL) < 0) { /* no fd: reported as handled, so the caller does not record it */
+ return true;
+ }
+
+ mrs_host = mrs_host & ~(mrs_page - 1); /* align host address down to the RAM block page size */
+ mrs_gpa = mrs_gpa & ~(mrs_page - 1); /* align guest address down likewise */
+ mrs_size = ROUND_UP(mrs_size, mrs_page); /* round size up to whole pages */
+
+ prev_sec = proxy_listener->mr_sections +
+ (proxy_listener->n_mr_sections - 1); /* last recorded section */
+ uint64_t prev_gpa_start = prev_sec->offset_within_address_space;
+ uint64_t prev_size = int128_get64(prev_sec->size);
+ uint64_t prev_gpa_end = range_get_last(prev_gpa_start, prev_size);
+ uint64_t prev_host_start =
+ (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) +
+ prev_sec->offset_within_region;
+ uint64_t prev_host_end = range_get_last(prev_host_start, prev_size);
+
+ if (mrs_gpa <= (prev_gpa_end + 1)) { /* starts at or before one past the previous section's end */
+ g_assert(mrs_gpa > prev_gpa_start); /* sections are expected in ascending guest-address order */
+
+ if ((section->mr == prev_sec->mr) &&
+ proxy_mrs_can_merge(mrs_host, prev_host_start,
+ (mrs_gpa - prev_gpa_start))) { /* same region, contiguous host range, same fd */
+ uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size);
+ merged = true;
+ prev_sec->offset_within_address_space =
+ MIN(prev_gpa_start, mrs_gpa);
+ prev_sec->offset_within_region =
+ MIN(prev_host_start, mrs_host) -
+ (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr);
+ prev_sec->size = int128_make64(max_end - MIN(prev_host_start,
+ mrs_host)); /* previous section now spans both host ranges */
+ }
+ }
+
+ return merged;
+}
+
+/*
+ * region_add / region_nop callback: record @section if it is RAM and not
+ * ROM, unless try_merge() already took care of it.  A reference is taken
+ * on the region of every section recorded.
+ */
+static void proxy_memory_listener_region_addnop(MemoryListener *listener,
+                                                MemoryRegionSection *section)
+{
+    ProxyMemoryListener *pl = container_of(listener, ProxyMemoryListener,
+                                           listener);
+    bool writable_ram = memory_region_is_ram(section->mr) &&
+                        !memory_region_is_rom(section->mr);
+    int slot;
+
+    if (!writable_ram) {
+        return;
+    }
+
+    if (try_merge(pl, section)) {
+        return;
+    }
+
+    /* Grow the array by one entry and store a copy of the section. */
+    slot = pl->n_mr_sections++;
+    pl->mr_sections = g_renew(MemoryRegionSection, pl->mr_sections,
+                              pl->n_mr_sections);
+    pl->mr_sections[slot] = *section;
+    pl->mr_sections[slot].fv = NULL;
+
+    memory_region_ref(section->mr);
+}
+
+/*
+ * commit callback: describe every recorded section (guest address, size,
+ * backing fd and offset) in one SYNC_SYSMEM message and send it over the
+ * listener's channel.  Nothing is sent when there are more sections than
+ * REMOTE_MAX_FDS; a send failure is reported.
+ */
+static void proxy_memory_listener_commit(MemoryListener *listener)
+{
+    ProxyMemoryListener *pl = container_of(listener, ProxyMemoryListener,
+                                           listener);
+    Error *local_err = NULL;
+    MPQemuMsg msg;
+    int idx;
+
+    memset(&msg, 0, sizeof(MPQemuMsg));
+
+    msg.cmd = MPQEMU_CMD_SYNC_SYSMEM;
+    msg.num_fds = pl->n_mr_sections;
+    msg.size = sizeof(SyncSysmemMsg);
+    if (msg.num_fds > REMOTE_MAX_FDS) {
+        error_report("Number of fds is more than %d", REMOTE_MAX_FDS);
+        return;
+    }
+
+    for (idx = 0; idx < pl->n_mr_sections; idx++) {
+        MemoryRegionSection *sec = &pl->mr_sections[idx];
+        ram_addr_t fd_offset;
+        uintptr_t hva;
+
+        msg.data.sync_sysmem.gpas[idx] = sec->offset_within_address_space;
+        msg.data.sync_sysmem.sizes[idx] = int128_get64(sec->size);
+
+        hva = (uintptr_t)memory_region_get_ram_ptr(sec->mr) +
+              sec->offset_within_region;
+        msg.fds[idx] = get_fd_from_hostaddr(hva, &fd_offset);
+        msg.data.sync_sysmem.offsets[idx] = fd_offset;
+    }
+
+    if (!mpqemu_msg_send(&msg, pl->ioc, &local_err)) {
+        error_report_err(local_err);
+    }
+}
+
+/* Unregister the listener, then release every section it had recorded. */
+void proxy_memory_listener_deconfigure(ProxyMemoryListener *proxy_listener)
+{
+    MemoryListener *ml = &proxy_listener->listener;
+
+    memory_listener_unregister(ml);
+    proxy_memory_listener_reset(ml);
+}
+
+/*
+ * Initialise @proxy_listener to report over @ioc and register it on
+ * address_space_memory.  It starts with no recorded sections.
+ */
+void proxy_memory_listener_configure(ProxyMemoryListener *proxy_listener,
+                                     QIOChannel *ioc)
+{
+    MemoryListener *ml = &proxy_listener->listener;
+
+    proxy_listener->n_mr_sections = 0;
+    proxy_listener->mr_sections = NULL;
+    proxy_listener->ioc = ioc;
+
+    ml->begin = proxy_memory_listener_reset;
+    ml->commit = proxy_memory_listener_commit;
+    ml->region_add = proxy_memory_listener_region_addnop;
+    ml->region_nop = proxy_memory_listener_region_addnop;
+    ml->priority = 10;
+    ml->name = "proxy";
+
+    memory_listener_register(ml, &address_space_memory);
+}
diff --git a/hw/remote/proxy.c b/hw/remote/proxy.c
new file mode 100644
index 00000000..1c7786b5
--- /dev/null
+++ b/hw/remote/proxy.c
@@ -0,0 +1,386 @@
+/*
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/remote/proxy.h"
+#include "hw/pci/pci.h"
+#include "qapi/error.h"
+#include "io/channel-util.h"
+#include "hw/qdev-properties.h"
+#include "monitor/monitor.h"
+#include "migration/blocker.h"
+#include "qemu/sockets.h"
+#include "hw/remote/mpqemu-link.h"
+#include "qemu/error-report.h"
+#include "hw/remote/proxy-memory-listener.h"
+#include "qom/object.h"
+#include "qemu/event_notifier.h"
+#include "sysemu/kvm.h"
+#include "util/event_notifier-posix.c"
+
+static void probe_pci_info(PCIDevice *dev, Error **errp);
+static void proxy_device_reset(DeviceState *dev);
+
+/*
+ * Re-bind the proxy's interrupt/resample notifiers to the GSI that the
+ * device's INTx pin is currently routed to.  Any previous binding is
+ * removed first.
+ */
+static void proxy_intx_update(PCIDevice *pci_dev)
+{
+    PCIProxyDev *proxy = PCI_PROXY_DEV(pci_dev);
+    int intx_pin = pci_get_byte(pci_dev->config + PCI_INTERRUPT_PIN) - 1;
+    PCIINTxRoute intx_route;
+
+    if (proxy->virq != -1) {
+        kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &proxy->intr,
+                                              proxy->virq);
+        proxy->virq = -1;
+    }
+
+    intx_route = pci_device_route_intx_to_irq(pci_dev, intx_pin);
+    proxy->virq = intx_route.irq;
+
+    if (proxy->virq == -1) {
+        return;
+    }
+
+    kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &proxy->intr,
+                                       &proxy->resample, proxy->virq);
+}
+
+/*
+ * Create the interrupt and resample event notifiers, hand their file
+ * descriptors to the remote process with MPQEMU_CMD_SET_IRQFD, then bind
+ * them to the current INTx route and keep them bound on routing changes.
+ */
+static void setup_irqfd(PCIProxyDev *dev)
+{
+    PCIDevice *pci_dev = PCI_DEVICE(dev);
+    Error *send_err = NULL;
+    MPQemuMsg irqfd_msg;
+
+    event_notifier_init(&dev->intr, 0);
+    event_notifier_init(&dev->resample, 0);
+
+    memset(&irqfd_msg, 0, sizeof(MPQemuMsg));
+    irqfd_msg.cmd = MPQEMU_CMD_SET_IRQFD;
+    irqfd_msg.size = 0;
+    irqfd_msg.num_fds = 2;
+    irqfd_msg.fds[0] = event_notifier_get_fd(&dev->intr);
+    irqfd_msg.fds[1] = event_notifier_get_fd(&dev->resample);
+
+    /* A failed send is reported but does not stop the local setup. */
+    if (!mpqemu_msg_send(&irqfd_msg, dev->ioc, &send_err)) {
+        error_report_err(send_err);
+    }
+
+    /* No GSI bound yet; proxy_intx_update() establishes the binding. */
+    dev->virq = -1;
+    proxy_intx_update(pci_dev);
+
+    pci_device_set_intx_routing_notifier(pci_dev, proxy_intx_update);
+}
+
+/*
+ * Realize the proxy PCI device.
+ *
+ * Resolves the "fd" property to a socket descriptor, wraps it in a
+ * QIOChannel, registers a migration blocker, then sets up the memory
+ * listener, the irqfd plumbing and probes the remote device's PCI
+ * configuration.  On failure @errp is set and every resource acquired so
+ * far is released; pointers to released resources are cleared so that no
+ * dangling reference remains in the device state.
+ */
+static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)
+{
+    ERRP_GUARD();
+    PCIProxyDev *dev = PCI_PROXY_DEV(device);
+    uint8_t *pci_conf = device->config;
+    int fd;
+
+    if (!dev->fd) {
+        error_setg(errp, "fd parameter not specified for %s",
+                   DEVICE(device)->id);
+        return;
+    }
+
+    fd = monitor_fd_param(monitor_cur(), dev->fd, errp);
+    if (fd == -1) {
+        error_prepend(errp, "proxy: unable to parse fd %s: ", dev->fd);
+        return;
+    }
+
+    if (!fd_is_socket(fd)) {
+        error_setg(errp, "proxy: fd %d is not a socket", fd);
+        close(fd);
+        return;
+    }
+
+    dev->ioc = qio_channel_new_fd(fd, errp);
+    if (!dev->ioc) {
+        close(fd);
+        return;
+    }
+
+    error_setg(&dev->migration_blocker, "%s does not support migration",
+               TYPE_PCI_PROXY_DEV);
+    if (migrate_add_blocker(dev->migration_blocker, errp) < 0) {
+        /* Release and forget both objects; nothing may reuse them. */
+        error_free(dev->migration_blocker);
+        dev->migration_blocker = NULL;
+        object_unref(dev->ioc);
+        dev->ioc = NULL;
+        return;
+    }
+
+    qemu_mutex_init(&dev->io_mutex);
+    qio_channel_set_blocking(dev->ioc, true, NULL);
+
+    pci_conf[PCI_LATENCY_TIMER] = 0xff;
+    pci_conf[PCI_INTERRUPT_PIN] = 0x01;
+
+    proxy_memory_listener_configure(&dev->proxy_listener, dev->ioc);
+
+    setup_irqfd(dev);
+
+    probe_pci_info(PCI_DEVICE(dev), errp);
+}
+
+/*
+ * PCI exit hook for the proxy device: releases what
+ * pci_proxy_dev_realize() and setup_irqfd() set up.
+ */
+static void pci_proxy_dev_exit(PCIDevice *pdev)
+{
+ PCIProxyDev *dev = PCI_PROXY_DEV(pdev);
+
+ /* Close the channel to the remote process, if one was created */
+ if (dev->ioc) {
+ qio_channel_close(dev->ioc, NULL);
+ }
+
+ /* Unregister the blocker before freeing the Error that backs it */
+ migrate_del_blocker(dev->migration_blocker);
+
+ error_free(dev->migration_blocker);
+
+ proxy_memory_listener_deconfigure(&dev->proxy_listener);
+
+ /* Notifiers created in setup_irqfd() */
+ event_notifier_cleanup(&dev->intr);
+ event_notifier_cleanup(&dev->resample);
+}
+
+/*
+ * Forward a PCI config space access of @len bytes at @addr to the remote
+ * device.  @op selects MPQEMU_CMD_PCI_CFGREAD or MPQEMU_CMD_PCI_CFGWRITE.
+ * For writes *@val is the value to store; for reads *@val receives the
+ * low 32 bits of the reply.  Failures are reported, not returned.
+ */
+static void config_op_send(PCIProxyDev *pdev, uint32_t addr, uint32_t *val,
+                           int len, unsigned int op)
+{
+    const bool is_read = (op == MPQEMU_CMD_PCI_CFGREAD);
+    Error *err = NULL;
+    MPQemuMsg msg = { 0 };
+    uint64_t reply;
+
+    msg.cmd = op;
+    msg.size = sizeof(PciConfDataMsg);
+    msg.data.pci_conf_data.addr = addr;
+    msg.data.pci_conf_data.len = len;
+    if (op == MPQEMU_CMD_PCI_CFGWRITE) {
+        msg.data.pci_conf_data.val = *val;
+    } else {
+        msg.data.pci_conf_data.val = 0;
+    }
+
+    reply = mpqemu_msg_send_and_await_reply(&msg, pdev, &err);
+    if (err) {
+        error_report_err(err);
+    }
+
+    /* UINT64_MAX is the reply value treated as failure here. */
+    if (reply == UINT64_MAX) {
+        error_report("Failed to perform PCI config %s operation",
+                     is_read ? "READ" : "WRITE");
+    }
+
+    if (is_read) {
+        *val = (uint32_t)reply;
+    }
+}
+
+/* Config-read hook: fetch @len bytes at @addr from the remote device. */
+static uint32_t pci_proxy_read_config(PCIDevice *d, uint32_t addr, int len)
+{
+    PCIProxyDev *proxy = PCI_PROXY_DEV(d);
+    uint32_t result;
+
+    config_op_send(proxy, addr, &result, len, MPQEMU_CMD_PCI_CFGREAD);
+
+    return result;
+}
+
+/*
+ * Config-write hook.  The proxy keeps a local copy of the remote device's
+ * config space because some code paths read that copy directly, so the
+ * write is applied locally first and then forwarded to the remote device.
+ */
+static void pci_proxy_write_config(PCIDevice *d, uint32_t addr, uint32_t val,
+                                   int len)
+{
+    PCIProxyDev *proxy = PCI_PROXY_DEV(d);
+
+    pci_default_write_config(d, addr, val, len);
+
+    config_op_send(proxy, addr, &val, len, MPQEMU_CMD_PCI_CFGWRITE);
+}
+
+/*
+ * User-settable properties of the proxy device.  "fd" is the string that
+ * pci_proxy_dev_realize() resolves to the socket descriptor used to reach
+ * the remote process.
+ */
+static Property proxy_properties[] = {
+ DEFINE_PROP_STRING("fd", PCIProxyDev, fd),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+/* Class initialiser: install the proxy's PCI hooks, reset and properties. */
+static void pci_proxy_dev_class_init(ObjectClass *klass, void *data)
+{
+    PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
+    DeviceClass *devc = DEVICE_CLASS(klass);
+
+    /* Config space accesses are forwarded to the remote device. */
+    pdc->config_read = pci_proxy_read_config;
+    pdc->config_write = pci_proxy_write_config;
+    pdc->realize = pci_proxy_dev_realize;
+    pdc->exit = pci_proxy_dev_exit;
+
+    devc->reset = proxy_device_reset;
+    device_class_set_props(devc, proxy_properties);
+}
+
+/* QOM type description: a conventional PCI device backed by PCIProxyDev. */
+static const TypeInfo pci_proxy_dev_type_info = {
+ .name = TYPE_PCI_PROXY_DEV,
+ .parent = TYPE_PCI_DEVICE,
+ .instance_size = sizeof(PCIProxyDev),
+ .class_init = pci_proxy_dev_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { INTERFACE_CONVENTIONAL_PCI_DEVICE },
+ { },
+ },
+};
+
+/* Registers the proxy device type; hooked in through type_init() below. */
+static void pci_proxy_dev_register_types(void)
+{
+ type_register_static(&pci_proxy_dev_type_info);
+}
+
+type_init(pci_proxy_dev_register_types)
+
+/*
+ * Forward a BAR access to the remote device and wait for its reply.
+ *
+ * @pdev:   proxy device whose channel is used
+ * @mr:     memory region backing the BAR; its address is added to @addr
+ * @write:  true for a write (*@val is sent), false for a read
+ * @addr:   offset of the access within @mr
+ * @val:    value to write, or storage for the value read
+ * @size:   access width in bytes
+ * @memory: forwarded verbatim in the message's "memory" field
+ *
+ * Transport errors are reported; on a read *@val is set to whatever the
+ * reply helper returned.
+ */
+static void send_bar_access_msg(PCIProxyDev *pdev, MemoryRegion *mr,
+                                bool write, hwaddr addr, uint64_t *val,
+                                unsigned size, bool memory)
+{
+    MPQemuMsg msg = { 0 };
+    /*
+     * Keep the reply in a 64-bit type, as config_op_send() does: "long"
+     * is only 32 bits wide on some hosts and would truncate 8-byte reads.
+     */
+    uint64_t ret = -EINVAL;
+    Error *local_err = NULL;
+
+    msg.size = sizeof(BarAccessMsg);
+    msg.data.bar_access.addr = mr->addr + addr;
+    msg.data.bar_access.size = size;
+    msg.data.bar_access.memory = memory;
+
+    if (write) {
+        msg.cmd = MPQEMU_CMD_BAR_WRITE;
+        msg.data.bar_access.val = *val;
+    } else {
+        msg.cmd = MPQEMU_CMD_BAR_READ;
+    }
+
+    ret = mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err);
+    if (local_err) {
+        error_report_err(local_err);
+    }
+
+    if (!write) {
+        *val = ret;
+    }
+}
+
+/* MemoryRegionOps write hook: relay the store to the remote device. */
+static void proxy_bar_write(void *opaque, hwaddr addr, uint64_t val,
+                            unsigned size)
+{
+    ProxyMemoryRegion *region = opaque;
+    const bool is_write = true;
+
+    send_bar_access_msg(region->dev, &region->mr, is_write, addr, &val,
+                        size, region->memory);
+}
+
+/* MemoryRegionOps read hook: fetch the value from the remote device. */
+static uint64_t proxy_bar_read(void *opaque, hwaddr addr, unsigned size)
+{
+    ProxyMemoryRegion *region = opaque;
+    const bool is_write = false;
+    uint64_t data;
+
+    send_bar_access_msg(region->dev, &region->mr, is_write, addr, &data,
+                        size, region->memory);
+
+    return data;
+}
+
+/*
+ * Access callbacks installed on every proxied BAR region by
+ * probe_pci_info(); both directions are relayed to the remote device.
+ * The implementation accepts accesses of 1 to 8 bytes.
+ */
+const MemoryRegionOps proxy_mr_ops = {
+ .read = proxy_bar_read,
+ .write = proxy_bar_write,
+ .endianness = DEVICE_NATIVE_ENDIAN,
+ .impl = {
+ .min_access_size = 1,
+ .max_access_size = 8,
+ },
+};
+
+/*
+ * Discover the remote device's identity and BARs through config space
+ * accesses and mirror them on the proxy.
+ *
+ * Vendor/device/class/subsystem IDs are copied into the device class, a
+ * device category is derived from the PCI base class, and every BAR that
+ * reports a non-zero size gets an I/O memory region backed by
+ * proxy_mr_ops.
+ *
+ * @errp is accepted for the caller's convenience but is not set here;
+ * config access failures are reported by config_op_send().
+ */
+static void probe_pci_info(PCIDevice *dev, Error **errp)
+{
+    PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev);
+    uint32_t orig_val, new_val, base_class, val;
+    PCIProxyDev *pdev = PCI_PROXY_DEV(dev);
+    DeviceClass *dc = DEVICE_CLASS(pc);
+    uint32_t size;
+    uint8_t type;
+    int i;
+
+    config_op_send(pdev, PCI_VENDOR_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD);
+    pc->vendor_id = (uint16_t)val;
+
+    config_op_send(pdev, PCI_DEVICE_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD);
+    pc->device_id = (uint16_t)val;
+
+    config_op_send(pdev, PCI_CLASS_DEVICE, &val, 2, MPQEMU_CMD_PCI_CFGREAD);
+    pc->class_id = (uint16_t)val;
+
+    config_op_send(pdev, PCI_SUBSYSTEM_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD);
+    pc->subsystem_id = (uint16_t)val;
+
+    /*
+     * The 16-bit class word carries the sub-class in bits 7:0 and the
+     * base class in bits 15:8, so the base class is the upper byte.
+     */
+    base_class = pc->class_id >> 8;
+    switch (base_class) {
+    case PCI_BASE_CLASS_BRIDGE:
+        set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
+        break;
+    case PCI_BASE_CLASS_STORAGE:
+        set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+        break;
+    case PCI_BASE_CLASS_NETWORK:
+    case PCI_BASE_CLASS_WIRELESS:
+        set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
+        break;
+    case PCI_BASE_CLASS_INPUT:
+        set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
+        break;
+    case PCI_BASE_CLASS_DISPLAY:
+        set_bit(DEVICE_CATEGORY_DISPLAY, dc->categories);
+        break;
+    case PCI_BASE_CLASS_PROCESSOR:
+        set_bit(DEVICE_CATEGORY_CPU, dc->categories);
+        break;
+    default:
+        set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+        break;
+    }
+
+    /*
+     * Standard BAR sizing: save the BAR, write all ones, read back the
+     * size mask, then restore the saved value.
+     *
+     * NOTE(review): registers are addressed as PCI_BASE_ADDRESS_0 + 4 * i
+     * for every index below PCI_NUM_REGIONS; if the last index is meant
+     * to be the expansion ROM, that register may live at a different
+     * offset - confirm.
+     */
+    for (i = 0; i < PCI_NUM_REGIONS; i++) {
+        config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &orig_val, 4,
+                       MPQEMU_CMD_PCI_CFGREAD);
+        new_val = 0xffffffff;
+        config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &new_val, 4,
+                       MPQEMU_CMD_PCI_CFGWRITE);
+        config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &new_val, 4,
+                       MPQEMU_CMD_PCI_CFGREAD);
+        /* Unsigned arithmetic keeps sizes of 2 GiB and above positive. */
+        size = (~(new_val & 0xFFFFFFF0)) + 1;
+        config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &orig_val, 4,
+                       MPQEMU_CMD_PCI_CFGWRITE);
+        type = (new_val & 0x1) ?
+                   PCI_BASE_ADDRESS_SPACE_IO : PCI_BASE_ADDRESS_SPACE_MEMORY;
+
+        if (size) {
+            g_autofree char *name = g_strdup_printf("bar-region-%d", i);
+            pdev->region[i].dev = pdev;
+            pdev->region[i].present = true;
+            if (type == PCI_BASE_ADDRESS_SPACE_MEMORY) {
+                pdev->region[i].memory = true;
+            }
+            memory_region_init_io(&pdev->region[i].mr, OBJECT(pdev),
+                                  &proxy_mr_ops, &pdev->region[i],
+                                  name, size);
+            pci_register_bar(dev, i, type, &pdev->region[i].mr);
+        }
+    }
+}
+
+/* Device reset hook: ask the remote process to reset its device. */
+static void proxy_device_reset(DeviceState *dev)
+{
+    PCIProxyDev *proxy = PCI_PROXY_DEV(dev);
+    Error *reset_err = NULL;
+    MPQemuMsg reset_msg = { 0 };
+
+    reset_msg.size = 0;
+    reset_msg.cmd = MPQEMU_CMD_DEVICE_RESET;
+
+    mpqemu_msg_send_and_await_reply(&reset_msg, proxy, &reset_err);
+    if (!reset_err) {
+        return;
+    }
+
+    error_report_err(reset_err);
+}
diff --git a/hw/remote/remote-obj.c b/hw/remote/remote-obj.c
new file mode 100644
index 00000000..333e5ac4
--- /dev/null
+++ b/hw/remote/remote-obj.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright © 2020, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL-v2, version 2 or later.
+ *
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+
+#include "qemu/error-report.h"
+#include "qemu/notify.h"
+#include "qom/object_interfaces.h"
+#include "hw/qdev-core.h"
+#include "io/channel.h"
+#include "hw/qdev-core.h"
+#include "hw/remote/machine.h"
+#include "io/channel-util.h"
+#include "qapi/error.h"
+#include "sysemu/sysemu.h"
+#include "hw/pci/pci.h"
+#include "qemu/sockets.h"
+#include "monitor/monitor.h"
+
+#define TYPE_REMOTE_OBJECT "x-remote-object"
+OBJECT_DECLARE_TYPE(RemoteObject, RemoteObjectClass, REMOTE_OBJECT)
+
+/* Class-level state shared by all x-remote-object instances. */
+struct RemoteObjectClass {
+ ObjectClass parent_class;
+
+ /* Live instance count: bumped in instance init, dropped in finalize */
+ unsigned int nr_devs;
+ /* Limit checked against nr_devs in instance init; set in class init */
+ unsigned int max_devs;
+};
+
+/* Per-instance state of an x-remote-object. */
+struct RemoteObject {
+ /* private */
+ Object parent;
+
+ /* Fires remote_object_machine_done() once machine init has completed */
+ Notifier machine_done;
+
+ /* Socket descriptor from the "fd" property; -1 until it is set */
+ int32_t fd;
+ /* Copy of the "devid" property, owned by this object */
+ char *devid;
+
+ /* Initialised to NULL; shut down, closed and unreffed in finalize if set */
+ QIOChannel *ioc;
+
+ /* Device looked up from devid in remote_object_machine_done() */
+ DeviceState *dev;
+ /* Unrealize listener used to react when that device goes away */
+ DeviceListener listener;
+};
+
+/*
+ * Setter for the "fd" property: resolve @str to a file descriptor and
+ * store it, provided it refers to a socket.  Sets @errp otherwise.
+ */
+static void remote_object_set_fd(Object *obj, const char *str, Error **errp)
+{
+    RemoteObject *o = REMOTE_OBJECT(obj);
+    int sock_fd;
+
+    sock_fd = monitor_fd_param(monitor_cur(), str, errp);
+    if (sock_fd == -1) {
+        error_prepend(errp, "Could not parse remote object fd %s:", str);
+        return;
+    }
+
+    if (fd_is_socket(sock_fd)) {
+        o->fd = sock_fd;
+        return;
+    }
+
+    error_setg(errp, "File descriptor '%s' is not a socket", str);
+    close(sock_fd);
+}
+
+/* Setter for the "devid" property: replace the stored id with a copy. */
+static void remote_object_set_devid(Object *obj, const char *str, Error **errp)
+{
+    RemoteObject *self = REMOTE_OBJECT(obj);
+    char *previous = self->devid;
+
+    g_free(previous);
+    self->devid = g_strdup(str);
+}
+
+/*
+ * Device unrealize callback: drop a reference on the remote object when
+ * the device it serves is the one being unrealized.
+ */
+static void remote_object_unrealize_listener(DeviceListener *listener,
+                                             DeviceState *dev)
+{
+    RemoteObject *self = container_of(listener, RemoteObject, listener);
+
+    if (self->dev != dev) {
+        return;
+    }
+
+    object_unref(OBJECT(self));
+}
+
+/*
+ * Machine-init-done callback: locate the PCI device named by "devid",
+ * wrap the configured fd in a non-blocking channel, start listening for
+ * the device's unrealize, and enter the message loop coroutine.
+ */
+static void remote_object_machine_done(Notifier *notifier, void *data)
+{
+    RemoteObject *o = container_of(notifier, RemoteObject, machine_done);
+    DeviceState *target;
+    RemoteCommDev *comdev;
+    QIOChannel *chan;
+    Error *err = NULL;
+
+    target = qdev_find_recursive(sysbus_get_default(), o->devid);
+    if (!target || !object_dynamic_cast(OBJECT(target), TYPE_PCI_DEVICE)) {
+        error_report("%s is not a PCI device", o->devid);
+        return;
+    }
+
+    chan = qio_channel_new_fd(o->fd, &err);
+    if (!chan) {
+        error_report_err(err);
+        return;
+    }
+    qio_channel_set_blocking(chan, false, NULL);
+
+    o->dev = target;
+
+    o->listener.unrealize = remote_object_unrealize_listener;
+    device_listener_register(&o->listener);
+
+    /* Ownership of comdev passes to the coroutine, which frees it. */
+    comdev = g_new0(RemoteCommDev, 1);
+    comdev->ioc = chan;
+    comdev->dev = PCI_DEVICE(target);
+
+    qemu_coroutine_enter(qemu_coroutine_create(mpqemu_remote_msg_loop_co,
+                                               comdev));
+}
+
+/*
+ * Instance initialiser: enforce the class-wide device limit, set default
+ * field values and arrange to be called back when machine init is done.
+ */
+static void remote_object_init(Object *obj)
+{
+    RemoteObjectClass *klass = REMOTE_OBJECT_GET_CLASS(obj);
+    RemoteObject *self = REMOTE_OBJECT(obj);
+
+    if (klass->nr_devs >= klass->max_devs) {
+        error_report("Reached maximum number of devices: %u",
+                     klass->max_devs);
+        return;
+    }
+    klass->nr_devs++;
+
+    self->fd = -1;
+    self->devid = NULL;
+    self->ioc = NULL;
+
+    self->machine_done.notify = remote_object_machine_done;
+    qemu_add_machine_init_done_notifier(&self->machine_done);
+}
+
+/*
+ * Instance finaliser: stop listening for device unrealize, tear down the
+ * channel if one is attached, and give back the class-wide device slot.
+ */
+static void remote_object_finalize(Object *obj)
+{
+ RemoteObjectClass *k = REMOTE_OBJECT_GET_CLASS(obj);
+ RemoteObject *o = REMOTE_OBJECT(obj);
+
+ device_listener_unregister(&o->listener);
+
+ /* Shut down both directions before closing the channel */
+ if (o->ioc) {
+ qio_channel_shutdown(o->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
+ qio_channel_close(o->ioc, NULL);
+ }
+
+ object_unref(OBJECT(o->ioc));
+
+ /* NOTE(review): decremented even if init bailed out before counting */
+ k->nr_devs--;
+ g_free(o->devid);
+}
+
+/* Class initialiser: set the device limit and register the properties. */
+static void remote_object_class_init(ObjectClass *klass, void *data)
+{
+    RemoteObjectClass *roc = REMOTE_OBJECT_CLASS(klass);
+
+    roc->nr_devs = 0;
+
+    /*
+     * Only one device is supported for now.  Until each device gets its
+     * own address space, this keeps devices belonging to one VM from
+     * reaching the RAM of another VM.
+     */
+    roc->max_devs = 1;
+
+    object_class_property_add_str(klass, "fd", NULL, remote_object_set_fd);
+    object_class_property_add_str(klass, "devid", NULL,
+                                  remote_object_set_devid);
+}
+
+/* QOM type description for x-remote-object, a user-creatable object. */
+static const TypeInfo remote_object_info = {
+ .name = TYPE_REMOTE_OBJECT,
+ .parent = TYPE_OBJECT,
+ .instance_size = sizeof(RemoteObject),
+ .instance_init = remote_object_init,
+ .instance_finalize = remote_object_finalize,
+ .class_size = sizeof(RemoteObjectClass),
+ .class_init = remote_object_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { TYPE_USER_CREATABLE },
+ { }
+ }
+};
+
+/* Registers the x-remote-object type; hooked in through type_init(). */
+static void register_types(void)
+{
+ type_register_static(&remote_object_info);
+}
+
+type_init(register_types);
diff --git a/hw/remote/trace-events b/hw/remote/trace-events
new file mode 100644
index 00000000..c167b3c7
--- /dev/null
+++ b/hw/remote/trace-events
@@ -0,0 +1,15 @@
+# multi-process trace events
+
+mpqemu_send_io_error(int cmd, int size, int nfds) "send command %d size %d, %d file descriptors to remote process"
+mpqemu_recv_io_error(int cmd, int size, int nfds) "failed to receive %d size %d, %d file descriptors to remote process"
+
+# vfio-user-obj.c
+vfu_prop(const char *prop, const char *val) "vfu: setting %s as %s"
+vfu_cfg_read(uint32_t offset, uint32_t val) "vfu: cfg: 0x%x -> 0x%x"
+vfu_cfg_write(uint32_t offset, uint32_t val) "vfu: cfg: 0x%x <- 0x%x"
+vfu_dma_register(uint64_t gpa, size_t len) "vfu: registering GPA 0x%"PRIx64", %zu bytes"
+vfu_dma_unregister(uint64_t gpa) "vfu: unregistering GPA 0x%"PRIx64""
+vfu_bar_register(int i, uint64_t addr, uint64_t size) "vfu: BAR %d: addr 0x%"PRIx64" size 0x%"PRIx64""
+vfu_bar_rw_enter(const char *op, uint64_t addr) "vfu: %s request for BAR address 0x%"PRIx64""
+vfu_bar_rw_exit(const char *op, uint64_t addr) "vfu: Finished %s of BAR address 0x%"PRIx64""
+vfu_interrupt(int pirq) "vfu: sending interrupt to device - PIRQ %d"
diff --git a/hw/remote/trace.h b/hw/remote/trace.h
new file mode 100644
index 00000000..5d5e3ac7
--- /dev/null
+++ b/hw/remote/trace.h
@@ -0,0 +1 @@
+#include "trace/trace-hw_remote.h"
diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
new file mode 100644
index 00000000..4e36bb8b
--- /dev/null
+++ b/hw/remote/vfio-user-obj.c
@@ -0,0 +1,951 @@
+/**
+ * QEMU vfio-user-server server object
+ *
+ * Copyright © 2022 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL-v2, version 2 or later.
+ *
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+/**
+ * Usage: add options:
+ * -machine x-remote,vfio-user=on,auto-shutdown=on
+ * -device <PCI-device>,id=<pci-dev-id>
+ * -object x-vfio-user-server,id=<id>,type=unix,path=<socket-path>,
+ * device=<pci-dev-id>
+ *
+ * Note that x-vfio-user-server object must be used with x-remote machine only.
+ * This server could only support PCI devices for now.
+ *
+ * type - SocketAddress type - presently "unix" alone is supported. Required
+ * option
+ *
+ * path - named unix socket, it will be created by the server. It is
+ * a required option
+ *
+ * device - id of a device on the server, a required option. PCI devices
+ * alone are supported presently.
+ *
+ * notes - x-vfio-user-server could block IO and monitor during the
+ * initialization phase.
+ */
+
+#include "qemu/osdep.h"
+
+#include "qom/object.h"
+#include "qom/object_interfaces.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+#include "sysemu/runstate.h"
+#include "hw/boards.h"
+#include "hw/remote/machine.h"
+#include "qapi/error.h"
+#include "qapi/qapi-visit-sockets.h"
+#include "qapi/qapi-events-misc.h"
+#include "qemu/notify.h"
+#include "qemu/thread.h"
+#include "qemu/main-loop.h"
+#include "sysemu/sysemu.h"
+#include "libvfio-user.h"
+#include "hw/qdev-core.h"
+#include "hw/pci/pci.h"
+#include "qemu/timer.h"
+#include "exec/memory.h"
+#include "hw/pci/msi.h"
+#include "hw/pci/msix.h"
+#include "hw/remote/vfio-user-obj.h"
+
+#define TYPE_VFU_OBJECT "x-vfio-user-server"
+OBJECT_DECLARE_TYPE(VfuObject, VfuObjectClass, VFU_OBJECT)
+
+/**
+ * VFU_OBJECT_ERROR - reports an error message. If auto_shutdown
+ * is set, it aborts the machine on error. Otherwise, it logs an
+ * error message without aborting.
+ */
+#define VFU_OBJECT_ERROR(o, fmt, ...) \
+ { \
+ if (vfu_object_auto_shutdown()) { \
+ error_setg(&error_abort, (fmt), ## __VA_ARGS__); \
+ } else { \
+ error_report((fmt), ## __VA_ARGS__); \
+ } \
+ } \
+
+/* Class-level state shared by all x-vfio-user-server instances. */
+struct VfuObjectClass {
+ ObjectClass parent_class;
+
+ /* presumably the number of live server objects - TODO confirm */
+ unsigned int nr_devs;
+};
+
+/* Per-instance state of an x-vfio-user-server object. */
+struct VfuObject {
+ /* private */
+ Object parent;
+
+ /* Value of the "socket" property; only UNIX sockets are accepted */
+ SocketAddress *socket;
+
+ /* Copy of the "device" property: id of the PCI device being served */
+ char *device;
+
+ /* NOTE(review): not referenced in the visible code - confirm purpose */
+ Error *err;
+
+ Notifier machine_done;
+
+ /* libvfio-user context; the property setters refuse changes once set */
+ vfu_ctx_t *vfu_ctx;
+
+ /* PCI device that the server callbacks operate on */
+ PCIDevice *pci_dev;
+
+ /* NOTE(review): not referenced in the visible code - confirm purpose */
+ Error *unplug_blocker;
+
+ /* fd watched by the main loop for vfio-user activity; -1 after hangup */
+ int vfu_poll_fd;
+
+ /* Device's own MSI/MSI-X hooks, saved so they can be restored later */
+ MSITriggerFunc *default_msi_trigger;
+ MSIPrepareMessageFunc *default_msi_prepare_message;
+ MSIxPrepareMessageFunc *default_msix_prepare_message;
+};
+
+static void vfu_object_init_ctx(VfuObject *o, Error **errp);
+
+/*
+ * Report whether the machine's "auto-shutdown" property is enabled.
+ * Defaults to true when there is no current machine or when the property
+ * cannot be read.
+ */
+static bool vfu_object_auto_shutdown(void)
+{
+    Error *prop_err = NULL;
+    bool enabled;
+
+    if (!current_machine) {
+        return true;
+    }
+
+    enabled = object_property_get_bool(OBJECT(current_machine),
+                                       "auto-shutdown", &prop_err);
+    if (!prop_err) {
+        return enabled;
+    }
+
+    /*
+     * A lookup error means the property does not exist.  That should not
+     * happen, since TYPE_REMOTE_MACHINE always defines auto-shutdown and
+     * TYPE_VFU_OBJECT only works with that machine, so it is safe to
+     * ignore the error and fall back to the default.
+     */
+    error_free(prop_err);
+    return true;
+}
+
+/*
+ * Setter for the "socket" property.
+ *
+ * Rejects the change while a vfio-user context exists, replaces any
+ * previously stored address with the visited one, accepts only UNIX
+ * socket addresses, and then attempts to initialise the server context.
+ * On any failure @errp is set and o->socket is left NULL.
+ */
+static void vfu_object_set_socket(Object *obj, Visitor *v, const char *name,
+                                  void *opaque, Error **errp)
+{
+    VfuObject *o = VFU_OBJECT(obj);
+
+    if (o->vfu_ctx) {
+        error_setg(errp, "vfu: Unable to set socket property - server busy");
+        return;
+    }
+
+    qapi_free_SocketAddress(o->socket);
+
+    o->socket = NULL;
+
+    /* On visitor failure o->socket stays NULL and must not be touched. */
+    if (!visit_type_SocketAddress(v, name, &o->socket, errp)) {
+        return;
+    }
+
+    if (o->socket->type != SOCKET_ADDRESS_TYPE_UNIX) {
+        error_setg(errp, "vfu: Unsupported socket type - %s",
+                   SocketAddressType_str(o->socket->type));
+        qapi_free_SocketAddress(o->socket);
+        o->socket = NULL;
+        return;
+    }
+
+    trace_vfu_prop("socket", o->socket->u.q_unix.path);
+
+    vfu_object_init_ctx(o, errp);
+}
+
+/*
+ * Setter for the "device" property: store a copy of the device id unless
+ * a vfio-user context already exists, then attempt to initialise the
+ * server context.
+ */
+static void vfu_object_set_device(Object *obj, const char *str, Error **errp)
+{
+    VfuObject *server = VFU_OBJECT(obj);
+
+    if (server->vfu_ctx != NULL) {
+        error_setg(errp, "vfu: Unable to set device property - server busy");
+        return;
+    }
+
+    g_free(server->device);
+    server->device = g_strdup(str);
+
+    trace_vfu_prop("device", str);
+
+    vfu_object_init_ctx(server, errp);
+}
+
+/*
+ * fd handler for the vfio-user poll fd: keep running the context until it
+ * returns 0.  EINTR is retried.  On ENOTCONN a client-hangup event is
+ * emitted, the fd handler is removed and the object is unparented.  Any
+ * other failure is reported through VFU_OBJECT_ERROR.
+ */
+static void vfu_object_ctx_run(void *opaque)
+{
+    VfuObject *o = opaque;
+    int ret;
+
+    do {
+        ret = vfu_run_ctx(o->vfu_ctx);
+        if (ret >= 0) {
+            continue;
+        }
+
+        if (errno == EINTR) {
+            continue;
+        }
+
+        if (errno == ENOTCONN) {
+            const char *vfu_id;
+            char *vfu_path, *pci_dev_path;
+
+            vfu_id = object_get_canonical_path_component(OBJECT(o));
+            vfu_path = object_get_canonical_path(OBJECT(o));
+            g_assert(o->pci_dev);
+            pci_dev_path = object_get_canonical_path(OBJECT(o->pci_dev));
+            /* o->device is a required property and is non-NULL here */
+            g_assert(o->device);
+            qapi_event_send_vfu_client_hangup(vfu_id, vfu_path,
+                                              o->device, pci_dev_path);
+            qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
+            o->vfu_poll_fd = -1;
+            object_unparent(OBJECT(o));
+            g_free(vfu_path);
+            g_free(pci_dev_path);
+        } else {
+            VFU_OBJECT_ERROR(o, "vfu: Failed to run device %s - %s",
+                             o->device, strerror(errno));
+        }
+        break;
+    } while (ret != 0);
+}
+
+/*
+ * fd handler run when a client connects: attach the vfio-user context,
+ * retrying while the attach would block, then switch the main loop to
+ * watching the context's poll fd with vfu_object_ctx_run().
+ */
+static void vfu_object_attach_ctx(void *opaque)
+{
+    VfuObject *o = opaque;
+    GPollFD pfds[1];
+
+    qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
+
+    pfds[0].fd = o->vfu_poll_fd;
+    pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
+
+    for (;;) {
+        if (vfu_attach_ctx(o->vfu_ctx) >= 0) {
+            break;
+        }
+
+        if (errno != EAGAIN && errno != EWOULDBLOCK) {
+            VFU_OBJECT_ERROR(o,
+                             "vfu: Failed to attach device %s to context - %s",
+                             o->device, strerror(errno));
+            return;
+        }
+
+        /**
+         * vfu_object_attach_ctx can block QEMU's main loop
+         * during attach - the monitor and other IO
+         * could be unresponsive during this time.
+         */
+        (void)qemu_poll_ns(pfds, 1, 500 * (int64_t)SCALE_MS);
+    }
+
+    o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx);
+    if (o->vfu_poll_fd < 0) {
+        VFU_OBJECT_ERROR(o, "vfu: Failed to get poll fd %s", o->device);
+        return;
+    }
+
+    qemu_set_fd_handler(o->vfu_poll_fd, vfu_object_ctx_run, NULL, o);
+}
+
+/*
+ * vfio-user callback for PCI config space.  The request is split into
+ * accesses of at most four bytes, each forwarded to the PCI host config
+ * helpers.  Always returns @count.
+ */
+static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf,
+                                     size_t count, loff_t offset,
+                                     const bool is_write)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+    const uint32_t max_width = sizeof(uint32_t);
+    char *cursor = buf;
+    size_t remaining;
+    uint32_t val = 0;
+    int chunk;
+
+    /*
+     * Writes to the BAR registers would trigger an update to the
+     * global Memory and IO AddressSpaces. But the remote device
+     * never uses the global AddressSpaces, therefore overlapping
+     * memory regions are not a problem
+     */
+    for (remaining = count; remaining > 0; remaining -= chunk) {
+        if (remaining > max_width) {
+            chunk = max_width;
+        } else {
+            chunk = remaining;
+        }
+
+        if (!is_write) {
+            val = pci_host_config_read_common(o->pci_dev, offset,
+                                              pci_config_size(o->pci_dev),
+                                              chunk);
+            memcpy(cursor, &val, chunk);
+            trace_vfu_cfg_read(offset, val);
+        } else {
+            memcpy(&val, cursor, chunk);
+            pci_host_config_write_common(o->pci_dev, offset,
+                                         pci_config_size(o->pci_dev),
+                                         val, chunk);
+            trace_vfu_cfg_write(offset, val);
+        }
+
+        offset += chunk;
+        cursor += chunk;
+    }
+
+    return count;
+}
+
+/*
+ * vfio-user DMA-register callback: expose the mapped client memory as a
+ * RAM subregion of the device's DMA address space at the given IOVA.
+ * Regions without a local mapping (vaddr == NULL) are ignored.
+ */
+static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+    struct iovec *iova = &info->iova;
+    g_autofree char *mr_name = NULL;
+    MemoryRegion *ram_mr;
+    AddressSpace *as;
+
+    if (info->vaddr == NULL) {
+        return;
+    }
+
+    mr_name = g_strdup_printf("mem-%s-%"PRIx64"", o->device,
+                              (uint64_t)info->vaddr);
+
+    ram_mr = g_new0(MemoryRegion, 1);
+    memory_region_init_ram_ptr(ram_mr, NULL, mr_name,
+                               iova->iov_len, info->vaddr);
+
+    as = pci_device_iommu_address_space(o->pci_dev);
+    memory_region_add_subregion(as->root, (hwaddr)iova->iov_base, ram_mr);
+
+    trace_vfu_dma_register((uint64_t)iova->iov_base, iova->iov_len);
+}
+
+/*
+ * vfio-user DMA-unregister callback: find the memory region backing the
+ * mapping's host address, remove it from the device's DMA address space
+ * and unparent it.  Does nothing if no region matches.
+ */
+static void dma_unregister(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+    ram_addr_t unused_offset;
+    MemoryRegion *ram_mr;
+    AddressSpace *as;
+
+    ram_mr = memory_region_from_host(info->vaddr, &unused_offset);
+    if (ram_mr == NULL) {
+        return;
+    }
+
+    as = pci_device_iommu_address_space(o->pci_dev);
+    memory_region_del_subregion(as->root, ram_mr);
+
+    object_unparent((OBJECT(ram_mr)));
+
+    trace_vfu_dma_unregister((uint64_t)info->iova.iov_base);
+}
+
+/*
+ * Read or write @size bytes at @offset within @mr, using @buf as the
+ * destination (read) or source (write).
+ *
+ * Directly accessible regions are copied with memcpy(); all others are
+ * dispatched piecewise through the region's access callbacks.
+ *
+ * Returns 0 on success, -1 if a dispatched access did not return
+ * MEMTX_OK.
+ */
+static int vfu_object_mr_rw(MemoryRegion *mr, uint8_t *buf, hwaddr offset,
+ hwaddr size, const bool is_write)
+{
+ uint8_t *ptr = buf;
+ bool release_lock = false;
+ uint8_t *ram_ptr = NULL;
+ MemTxResult result;
+ int access_size;
+ uint64_t val;
+
+ if (memory_access_is_direct(mr, is_write)) {
+ /**
+ * Some devices expose a PCI expansion ROM, which could be buffer
+ * based as compared to other regions which are primarily based on
+ * MemoryRegionOps. memory_region_find() would already check
+ * for buffer overflow, we don't need to repeat it here.
+ */
+ ram_ptr = memory_region_get_ram_ptr(mr);
+
+ if (is_write) {
+ memcpy((ram_ptr + offset), buf, size);
+ } else {
+ memcpy(buf, (ram_ptr + offset), size);
+ }
+
+ return 0;
+ }
+
+ while (size) {
+ /**
+ * The read/write logic used below is similar to the ones in
+ * flatview_read/write_continue()
+ */
+ release_lock = prepare_mmio_access(mr);
+
+ /* Width of this piece, as chosen by memory_access_size() */
+ access_size = memory_access_size(mr, size, offset);
+
+ if (is_write) {
+ val = ldn_he_p(ptr, access_size);
+
+ result = memory_region_dispatch_write(mr, offset, val,
+ size_memop(access_size),
+ MEMTXATTRS_UNSPECIFIED);
+ } else {
+ result = memory_region_dispatch_read(mr, offset, &val,
+ size_memop(access_size),
+ MEMTXATTRS_UNSPECIFIED);
+
+ stn_he_p(ptr, access_size, val);
+ }
+
+ /* Drop the iothread lock if prepare_mmio_access() asked us to */
+ if (release_lock) {
+ qemu_mutex_unlock_iothread();
+ release_lock = false;
+ }
+
+ if (result != MEMTX_OK) {
+ return -1;
+ }
+
+ size -= access_size;
+ ptr += access_size;
+ offset += access_size;
+ }
+
+ return 0;
+}
+
+/*
+ * Perform a read or write of @len bytes at @bar_offset within BAR
+ * @pci_bar of @pci_dev.  The range is walked one memory region section at
+ * a time.  Returns the number of bytes handled before the first failure
+ * (unmapped address, write to a read-only region, or failed access).
+ */
+static size_t vfu_object_bar_rw(PCIDevice *pci_dev, int pci_bar,
+                                hwaddr bar_offset, char * const buf,
+                                hwaddr len, const bool is_write)
+{
+    uint8_t *cursor = (uint8_t *)buf;
+    hwaddr done = 0;
+
+    while (len) {
+        MemoryRegionSection sec;
+        uint64_t chunk;
+        bool failed = false;
+
+        sec = memory_region_find(pci_dev->io_regions[pci_bar].memory,
+                                 bar_offset, len);
+        if (!sec.mr) {
+            warn_report("vfu: invalid address 0x%"PRIx64"", bar_offset);
+            return done;
+        }
+
+        chunk = int128_get64(sec.size);
+
+        if (is_write && sec.mr->readonly) {
+            warn_report("vfu: attempting to write to readonly region in "
+                        "bar %d - [0x%"PRIx64" - 0x%"PRIx64"]",
+                        pci_bar, bar_offset,
+                        (bar_offset + chunk));
+            failed = true;
+        } else if (vfu_object_mr_rw(sec.mr, cursor, sec.offset_within_region,
+                                    chunk, is_write)) {
+            warn_report("vfu: failed to %s "
+                        "[0x%"PRIx64" - 0x%"PRIx64"] in bar %d",
+                        is_write ? "write to" : "read from", bar_offset,
+                        (bar_offset + chunk), pci_bar);
+            failed = true;
+        }
+
+        /* memory_region_find() took a reference on the region. */
+        memory_region_unref(sec.mr);
+
+        if (failed) {
+            return done;
+        }
+
+        done += chunk;
+        bar_offset += chunk;
+        cursor += chunk;
+        len -= chunk;
+    }
+
+    return done;
+}
+
+/**
+ * VFU_OBJECT_BAR_HANDLER - macro for defining handlers for PCI BARs.
+ *
+ * To create handler for BAR number 2, VFU_OBJECT_BAR_HANDLER(2) would
+ * define vfu_object_bar2_handler
+ *
+ * Each generated handler looks up the VfuObject attached to the context
+ * and forwards the access to vfu_object_bar_rw() with BAR_NO as the BAR
+ * index, returning that function's result.
+ */
+#define VFU_OBJECT_BAR_HANDLER(BAR_NO) \
+ static ssize_t vfu_object_bar##BAR_NO##_handler(vfu_ctx_t *vfu_ctx, \
+ char * const buf, size_t count, \
+ loff_t offset, const bool is_write) \
+ { \
+ VfuObject *o = vfu_get_private(vfu_ctx); \
+ PCIDevice *pci_dev = o->pci_dev; \
+ \
+ return vfu_object_bar_rw(pci_dev, BAR_NO, offset, \
+ buf, count, is_write); \
+ } \
+
+VFU_OBJECT_BAR_HANDLER(0)
+VFU_OBJECT_BAR_HANDLER(1)
+VFU_OBJECT_BAR_HANDLER(2)
+VFU_OBJECT_BAR_HANDLER(3)
+VFU_OBJECT_BAR_HANDLER(4)
+VFU_OBJECT_BAR_HANDLER(5)
+VFU_OBJECT_BAR_HANDLER(6)
+
+/*
+ * Access callbacks indexed by BAR number; entry i is the handler
+ * generated by VFU_OBJECT_BAR_HANDLER(i).
+ */
+static vfu_region_access_cb_t *vfu_object_bar_handlers[PCI_NUM_REGIONS] = {
+ &vfu_object_bar0_handler,
+ &vfu_object_bar1_handler,
+ &vfu_object_bar2_handler,
+ &vfu_object_bar3_handler,
+ &vfu_object_bar4_handler,
+ &vfu_object_bar5_handler,
+ &vfu_object_bar6_handler,
+};
+
+/**
+ * vfu_object_register_bars - Identify active BAR regions of pdev and setup
+ * callbacks to handle read/write accesses
+ *
+ * BARs with a zero size are skipped. The ROM region and any BAR whose
+ * backing memory region is read-only are registered without the write
+ * flag. The flags are computed afresh for every BAR so that one read-only
+ * BAR does not strip write access from the BARs that follow it.
+ */
+static void vfu_object_register_bars(vfu_ctx_t *vfu_ctx, PCIDevice *pdev)
+{
+    int i;
+
+    for (i = 0; i < PCI_NUM_REGIONS; i++) {
+        int flags = VFU_REGION_FLAG_RW;
+
+        if (!pdev->io_regions[i].size) {
+            continue;
+        }
+
+        if ((i == VFU_PCI_DEV_ROM_REGION_IDX) ||
+            pdev->io_regions[i].memory->readonly) {
+            flags &= ~VFU_REGION_FLAG_WRITE;
+        }
+
+        vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX + i,
+                         (size_t)pdev->io_regions[i].size,
+                         vfu_object_bar_handlers[i],
+                         flags, NULL, 0, -1, 0);
+
+        trace_vfu_bar_register(i, pdev->io_regions[i].addr,
+                               pdev->io_regions[i].size);
+    }
+}
+
+/*
+ * PCI map_irq hook: the "IRQ number" is the device's bus/devfn (BDF)
+ * value, so vfu_object_set_irq() can find the device again.  @intx is
+ * not used.
+ */
+static int vfu_object_map_irq(PCIDevice *pci_dev, int intx)
+{
+    int bus_nr = pci_bus_num(pci_get_bus(pci_dev));
+
+    return PCI_BUILD_BDF(bus_nr, pci_dev->devfn);
+}
+
+/*
+ * PCI set_irq hook.  @pirq is the BDF produced by vfu_object_map_irq().
+ * When the line is asserted, the owning device is looked up and IRQ 0 is
+ * triggered on its vfio-user context.  De-assertion is ignored.
+ */
+static void vfu_object_set_irq(void *opaque, int pirq, int level)
+{
+    PCIBus *root_bus = opaque;
+    PCIDevice *target;
+    vfu_ctx_t *ctx;
+
+    if (!level) {
+        return;
+    }
+
+    /*
+     * pci_find_device() performs at O(1) if the device is attached
+     * to the root PCI bus. Whereas, if the device is attached to a
+     * secondary PCI bus (such as when a root port is involved),
+     * finding the parent PCI bus could take O(n)
+     */
+    target = pci_find_device(root_bus, PCI_BUS_NUM(pirq),
+                             PCI_BDF_TO_DEVFN(pirq));
+
+    ctx = target->irq_opaque;
+    g_assert(ctx);
+
+    vfu_irq_trigger(ctx, 0);
+}
+
+/*
+ * MSI/MSI-X prepare-message hook: encode only the vector number, in the
+ * message data, with a zero address.
+ */
+static MSIMessage vfu_object_msi_prepare_msg(PCIDevice *pci_dev,
+                                             unsigned int vector)
+{
+    MSIMessage vec_msg = {
+        .address = 0,
+        .data = vector,
+    };
+
+    return vec_msg;
+}
+
+/*
+ * MSI trigger hook: raise the vector carried in @msg.data on the
+ * device's vfio-user context.
+ */
+static void vfu_object_msi_trigger(PCIDevice *pci_dev, MSIMessage msg)
+{
+    vfu_ctx_t *ctx = pci_dev->irq_opaque;
+    uint32_t vector = msg.data;
+
+    vfu_irq_trigger(ctx, vector);
+}
+
+/*
+ * Save the device's MSI/MSI-X hooks in @o and replace them with the
+ * vfio-user versions.
+ */
+static void vfu_object_setup_msi_cbs(VfuObject *o)
+{
+    PCIDevice *pdev = o->pci_dev;
+
+    o->default_msi_trigger = pdev->msi_trigger;
+    pdev->msi_trigger = vfu_object_msi_trigger;
+
+    o->default_msi_prepare_message = pdev->msi_prepare_message;
+    pdev->msi_prepare_message = vfu_object_msi_prepare_msg;
+
+    o->default_msix_prepare_message = pdev->msix_prepare_message;
+    pdev->msix_prepare_message = vfu_object_msi_prepare_msg;
+}
+
+/* Put back the MSI/MSI-X hooks saved by vfu_object_setup_msi_cbs(). */
+static void vfu_object_restore_msi_cbs(VfuObject *o)
+{
+    PCIDevice *pdev = o->pci_dev;
+
+    pdev->msi_trigger = o->default_msi_trigger;
+    pdev->msi_prepare_message = o->default_msi_prepare_message;
+    pdev->msix_prepare_message = o->default_msix_prepare_message;
+}
+
+/*
+ * vfio-user IRQ state callback for MSI-X: apply @mask to the @count
+ * vectors beginning at @start, i.e. vectors [@start, @start + @count).
+ */
+static void vfu_msix_irq_state(vfu_ctx_t *vfu_ctx, uint32_t start,
+                               uint32_t count, bool mask)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+    uint32_t end = start + count;
+    uint32_t vector;
+
+    /* @count is a length, not an end index. */
+    for (vector = start; vector < end; vector++) {
+        msix_set_mask(o->pci_dev, vector, mask);
+    }
+}
+
+/*
+ * vfio-user IRQ state callback for MSI: apply @mask to the @count
+ * vectors beginning at @start, i.e. vectors [@start, @start + @count).
+ * A failure on one vector is reported and the remaining vectors are
+ * still processed.
+ */
+static void vfu_msi_irq_state(vfu_ctx_t *vfu_ctx, uint32_t start,
+                              uint32_t count, bool mask)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+    uint32_t end = start + count;
+    Error *err = NULL;
+    uint32_t vector;
+
+    /* @count is a length, not an end index. */
+    for (vector = start; vector < end; vector++) {
+        msi_set_mask(o->pci_dev, vector, mask, &err);
+        if (err) {
+            VFU_OBJECT_ERROR(o, "vfu: %s: %s", o->device,
+                             error_get_pretty(err));
+            error_free(err);
+            err = NULL;
+        }
+    }
+}
+
+/*
+ * Describe the device's interrupts to libvfio-user and route the device's
+ * MSI/MSI-X delivery through the vfio-user context.
+ *
+ * One INTx interrupt is always registered.  If the device has MSI-X vectors
+ * allocated those are registered, otherwise any allocated MSI vectors are,
+ * together with the matching mask/unmask state callback.
+ *
+ * Returns 0 on success or the negative value returned by the failing
+ * libvfio-user call.  The device's callbacks and irq_opaque are only
+ * modified on success.
+ */
+static int vfu_object_setup_irqs(VfuObject *o, PCIDevice *pci_dev)
+{
+    vfu_ctx_t *vfu_ctx = o->vfu_ctx;
+    int ret;
+
+    ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (msix_nr_vectors_allocated(pci_dev)) {
+        ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ,
+                                       msix_nr_vectors_allocated(pci_dev));
+        if (ret < 0) {
+            return ret;
+        }
+
+        /* Do not silently run without mask/unmask notifications */
+        ret = vfu_setup_irq_state_callback(vfu_ctx, VFU_DEV_MSIX_IRQ,
+                                           &vfu_msix_irq_state);
+        if (ret < 0) {
+            return ret;
+        }
+    } else if (msi_nr_vectors_allocated(pci_dev)) {
+        ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSI_IRQ,
+                                       msi_nr_vectors_allocated(pci_dev));
+        if (ret < 0) {
+            return ret;
+        }
+
+        ret = vfu_setup_irq_state_callback(vfu_ctx, VFU_DEV_MSI_IRQ,
+                                           &vfu_msi_irq_state);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    vfu_object_setup_msi_cbs(o);
+
+    /* The interrupt callbacks look the context up through irq_opaque */
+    pci_dev->irq_opaque = vfu_ctx;
+
+    return 0;
+}
+
+/*
+ * Install the vfio-user INTx handlers (vfu_object_set_irq and
+ * vfu_object_map_irq) on @pci_bus.  The bus itself is passed as the opaque
+ * pointer, and the IRQ count is the BDF built from the bus number and the
+ * last devfn.
+ */
+void vfu_object_set_bus_irq(PCIBus *pci_bus)
+{
+    int last_bdf = PCI_BUILD_BDF(pci_bus_num(pci_bus), PCI_DEVFN_MAX - 1);
+
+    pci_bus_irqs(pci_bus, vfu_object_set_irq, vfu_object_map_irq, pci_bus,
+                 last_bdf);
+}
+
+/*
+ * libvfio-user reset callback.  Resets the backing PCI device for every
+ * reset type except VFU_RESET_LOST_CONN, which is left to
+ * vfu_object_ctx_run().  Always returns 0.
+ */
+static int vfu_object_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type)
+{
+    VfuObject *o = vfu_get_private(vfu_ctx);
+
+    /* vfu_object_ctx_run() handles lost connection */
+    if (type != VFU_RESET_LOST_CONN) {
+        qdev_reset_all(DEVICE(o->pci_dev));
+    }
+
+    return 0;
+}
+
+/*
+ * Machine-init-done notifier for TYPE_VFU_OBJECT.
+ *
+ * The server needs the 'socket' and 'device' properties as well as the
+ * devices instantiated in QEMU, none of which are available during
+ * instance_init.  Initialization of the context is therefore attempted
+ * here, once the machine has been set up.  Any error from that attempt is
+ * propagated to error_abort.
+ */
+static void vfu_object_machine_done(Notifier *notifier, void *data)
+{
+    Error *local_err = NULL;
+    VfuObject *o = container_of(notifier, VfuObject, machine_done);
+
+    vfu_object_init_ctx(o, &local_err);
+    if (local_err != NULL) {
+        error_propagate(&error_abort, local_err);
+    }
+}
+
+/**
+ * vfu_object_init_ctx: Create and initialize libvfio-user context. Add
+ * an unplug blocker for the associated PCI device. Setup a FD handler
+ * to process incoming messages in the context's socket.
+ *
+ * The socket and device properties are mandatory, and this function
+ * will not create the context without them - the setters for these
+ * properties should call this function when the property is set. The
+ * machine should also be ready when this function is invoked - it is
+ * because QEMU objects are initialized before devices, and the
+ * associated PCI device wouldn't be available at the object
+ * initialization time. Until these conditions are satisfied, this
+ * function would return early without performing any task.
+ *
+ * It also returns early, without error, if a context already exists.
+ *
+ * On failure @errp is set and everything acquired so far is released:
+ * the context is destroyed, the unplug blocker is removed, the MSI
+ * callbacks and irq_opaque of the device are reset, the reference on
+ * the device is dropped, and o->vfu_ctx / o->pci_dev are left NULL.
+ */
+static void vfu_object_init_ctx(VfuObject *o, Error **errp)
+{
+ ERRP_GUARD();
+ DeviceState *dev = NULL;
+ vfu_pci_type_t pci_type = VFU_PCI_TYPE_CONVENTIONAL;
+ int ret;
+
+ /* Already initialized, or prerequisites not yet available: do nothing */
+ if (o->vfu_ctx || !o->socket || !o->device ||
+ !phase_check(PHASE_MACHINE_READY)) {
+ return;
+ }
+
+ /* Report an error that vfu_object_init() recorded but could not return */
+ if (o->err) {
+ error_propagate(errp, o->err);
+ o->err = NULL;
+ return;
+ }
+
+ /* @o becomes the context's private pointer (see vfu_get_private() users) */
+ o->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, o->socket->u.q_unix.path,
+ LIBVFIO_USER_FLAG_ATTACH_NB,
+ o, VFU_DEV_TYPE_PCI);
+ if (o->vfu_ctx == NULL) {
+ error_setg(errp, "vfu: Failed to create context - %s", strerror(errno));
+ return;
+ }
+
+ /* Resolve the 'device' property to a PCI device */
+ dev = qdev_find_recursive(sysbus_get_default(), o->device);
+ if (dev == NULL) {
+ error_setg(errp, "vfu: Device %s not found", o->device);
+ goto fail;
+ }
+
+ if (!object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
+ error_setg(errp, "vfu: %s not a PCI device", o->device);
+ goto fail;
+ }
+
+ o->pci_dev = PCI_DEVICE(dev);
+
+ /* Reference is dropped on the fail path below and in finalize */
+ object_ref(OBJECT(o->pci_dev));
+
+ if (pci_is_express(o->pci_dev)) {
+ pci_type = VFU_PCI_TYPE_EXPRESS;
+ }
+
+ ret = vfu_pci_init(o->vfu_ctx, pci_type, PCI_HEADER_TYPE_NORMAL, 0);
+ if (ret < 0) {
+ error_setg(errp,
+ "vfu: Failed to attach PCI device %s to context - %s",
+ o->device, strerror(errno));
+ goto fail;
+ }
+
+ /* Block unplug of the device for as long as this object serves it */
+ error_setg(&o->unplug_blocker,
+ "vfu: %s for %s must be deleted before unplugging",
+ TYPE_VFU_OBJECT, o->device);
+ qdev_add_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
+
+ /* Config space accesses are routed to vfu_object_cfg_access */
+ ret = vfu_setup_region(o->vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX,
+ pci_config_size(o->pci_dev), &vfu_object_cfg_access,
+ VFU_REGION_FLAG_RW | VFU_REGION_FLAG_ALWAYS_CB,
+ NULL, 0, -1, 0);
+ if (ret < 0) {
+ error_setg(errp,
+ "vfu: Failed to setup config space handlers for %s- %s",
+ o->device, strerror(errno));
+ goto fail;
+ }
+
+ ret = vfu_setup_device_dma(o->vfu_ctx, &dma_register, &dma_unregister);
+ if (ret < 0) {
+ error_setg(errp, "vfu: Failed to setup DMA handlers for %s",
+ o->device);
+ goto fail;
+ }
+
+ vfu_object_register_bars(o->vfu_ctx, o->pci_dev);
+
+ /* On success this also installs the vfio-user MSI callbacks on the device */
+ ret = vfu_object_setup_irqs(o, o->pci_dev);
+ if (ret < 0) {
+ error_setg(errp, "vfu: Failed to setup interrupts for %s",
+ o->device);
+ goto fail;
+ }
+
+ ret = vfu_setup_device_reset_cb(o->vfu_ctx, &vfu_object_device_reset);
+ if (ret < 0) {
+ error_setg(errp, "vfu: Failed to setup reset callback");
+ goto fail;
+ }
+
+ ret = vfu_realize_ctx(o->vfu_ctx);
+ if (ret < 0) {
+ error_setg(errp, "vfu: Failed to realize device %s- %s",
+ o->device, strerror(errno));
+ goto fail;
+ }
+
+ o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx);
+ if (o->vfu_poll_fd < 0) {
+ error_setg(errp, "vfu: Failed to get poll fd %s", o->device);
+ goto fail;
+ }
+
+ /* vfu_object_attach_ctx is invoked with @o when the poll fd has activity */
+ qemu_set_fd_handler(o->vfu_poll_fd, vfu_object_attach_ctx, NULL, o);
+
+ return;
+
+fail:
+ /* Common cleanup: undo whatever part of the setup above was completed */
+ vfu_destroy_ctx(o->vfu_ctx);
+ if (o->unplug_blocker && o->pci_dev) {
+ qdev_del_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
+ error_free(o->unplug_blocker);
+ o->unplug_blocker = NULL;
+ }
+ if (o->pci_dev) {
+ /*
+ * NOTE(review): this can run before vfu_object_setup_irqs() has saved
+ * the default callbacks - confirm restoring is harmless in that case.
+ */
+ vfu_object_restore_msi_cbs(o);
+ o->pci_dev->irq_opaque = NULL;
+ object_unref(OBJECT(o->pci_dev));
+ o->pci_dev = NULL;
+ }
+ o->vfu_ctx = NULL;
+}
+
+/*
+ * Instance initializer for TYPE_VFU_OBJECT.
+ *
+ * Counts the new instance in the class, and records an error in o->err
+ * (reported later by vfu_object_init_ctx()) if the current machine is not
+ * TYPE_REMOTE_MACHINE.  If the machine is not ready yet, registers a
+ * machine-init-done notifier so that context setup is attempted once it is.
+ */
+static void vfu_object_init(Object *obj)
+{
+    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(obj);
+    VfuObject *o = VFU_OBJECT(obj);
+
+    k->nr_devs++;
+
+    /*
+     * Mark the poll fd as unset before any early return.  Otherwise
+     * vfu_object_finalize() sees a poll fd that is not -1 and removes the
+     * fd handler of an unrelated descriptor.
+     */
+    o->vfu_poll_fd = -1;
+
+    if (!object_dynamic_cast(OBJECT(current_machine), TYPE_REMOTE_MACHINE)) {
+        error_setg(&o->err, "vfu: %s only compatible with %s machine",
+                   TYPE_VFU_OBJECT, TYPE_REMOTE_MACHINE);
+        return;
+    }
+
+    if (!phase_check(PHASE_MACHINE_READY)) {
+        o->machine_done.notify = vfu_object_machine_done;
+        qemu_add_machine_init_done_notifier(&o->machine_done);
+    }
+}
+
+/*
+ * Instance finalizer for TYPE_VFU_OBJECT.
+ *
+ * Releases everything the object may hold: the socket address, the poll fd
+ * handler, the libvfio-user context, the device name, any error recorded by
+ * vfu_object_init() that was never reported, the unplug blocker, the
+ * reference on the PCI device (after restoring its MSI callbacks), and the
+ * machine-init-done notifier.  When the last instance goes away and
+ * vfu_object_auto_shutdown() is true, a shutdown is requested.
+ */
+static void vfu_object_finalize(Object *obj)
+{
+    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(obj);
+    VfuObject *o = VFU_OBJECT(obj);
+
+    k->nr_devs--;
+
+    qapi_free_SocketAddress(o->socket);
+
+    o->socket = NULL;
+
+    /* Stop servicing the context before it is destroyed */
+    if (o->vfu_poll_fd != -1) {
+        qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
+        o->vfu_poll_fd = -1;
+    }
+
+    if (o->vfu_ctx) {
+        vfu_destroy_ctx(o->vfu_ctx);
+        o->vfu_ctx = NULL;
+    }
+
+    g_free(o->device);
+
+    o->device = NULL;
+
+    /*
+     * The deferred error is only consumed by vfu_object_init_ctx(); free it
+     * here if the object is destroyed before that happened.
+     */
+    error_free(o->err);
+    o->err = NULL;
+
+    if (o->unplug_blocker && o->pci_dev) {
+        qdev_del_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
+        error_free(o->unplug_blocker);
+        o->unplug_blocker = NULL;
+    }
+
+    if (o->pci_dev) {
+        vfu_object_restore_msi_cbs(o);
+        o->pci_dev->irq_opaque = NULL;
+        object_unref(OBJECT(o->pci_dev));
+        o->pci_dev = NULL;
+    }
+
+    if (!k->nr_devs && vfu_object_auto_shutdown()) {
+        qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
+    }
+
+    /* notify is only set when the notifier was registered in init */
+    if (o->machine_done.notify) {
+        qemu_remove_machine_init_done_notifier(&o->machine_done);
+        o->machine_done.notify = NULL;
+    }
+}
+
+/*
+ * Class initializer for TYPE_VFU_OBJECT: zeroes the instance counter and
+ * registers the write-only 'socket' and 'device' properties together with
+ * their descriptions.
+ */
+static void vfu_object_class_init(ObjectClass *klass, void *data)
+{
+    VFU_OBJECT_CLASS(klass)->nr_devs = 0;
+
+    /* 'socket': only a setter is provided */
+    object_class_property_add(klass, "socket", "SocketAddress", NULL,
+                              vfu_object_set_socket, NULL, NULL);
+    object_class_property_set_description(klass, "socket",
+                                          "SocketAddress "
+                                          "(ex: type=unix,path=/tmp/sock). "
+                                          "Only UNIX is presently supported");
+
+    /* 'device': string property, only a setter is provided */
+    object_class_property_add_str(klass, "device", NULL,
+                                  vfu_object_set_device);
+    object_class_property_set_description(klass, "device",
+                                          "device ID - only PCI devices "
+                                          "are presently supported");
+}
+
+/*
+ * QOM type description for TYPE_VFU_OBJECT: a plain object that implements
+ * the user-creatable interface.
+ */
+static const TypeInfo vfu_object_info = {
+    .name              = TYPE_VFU_OBJECT,
+    .parent            = TYPE_OBJECT,
+    /* class */
+    .class_size        = sizeof(VfuObjectClass),
+    .class_init        = vfu_object_class_init,
+    /* instance */
+    .instance_size     = sizeof(VfuObject),
+    .instance_init     = vfu_object_init,
+    .instance_finalize = vfu_object_finalize,
+    .interfaces        = (InterfaceInfo[]) {
+        { TYPE_USER_CREATABLE },
+        { }
+    }
+};
+
+/* Register TYPE_VFU_OBJECT with the type system. */
+static void vfu_register_types(void)
+{
+    const TypeInfo *info = &vfu_object_info;
+
+    type_register_static(info);
+}
+
+type_init(vfu_register_types);