diff options
Diffstat (limited to 'hw/remote')
-rw-r--r-- | hw/remote/Kconfig | 8 | ||||
-rw-r--r-- | hw/remote/iohub.c | 117 | ||||
-rw-r--r-- | hw/remote/iommu.c | 131 | ||||
-rw-r--r-- | hw/remote/machine.c | 160 | ||||
-rw-r--r-- | hw/remote/memory.c | 62 | ||||
-rw-r--r-- | hw/remote/meson.build | 17 | ||||
-rw-r--r-- | hw/remote/message.c | 229 | ||||
-rw-r--r-- | hw/remote/mpqemu-link.c | 263 | ||||
-rw-r--r-- | hw/remote/proxy-memory-listener.c | 226 | ||||
-rw-r--r-- | hw/remote/proxy.c | 386 | ||||
-rw-r--r-- | hw/remote/remote-obj.c | 202 | ||||
-rw-r--r-- | hw/remote/trace-events | 15 | ||||
-rw-r--r-- | hw/remote/trace.h | 1 | ||||
-rw-r--r-- | hw/remote/vfio-user-obj.c | 951 |
14 files changed, 2768 insertions, 0 deletions
diff --git a/hw/remote/Kconfig b/hw/remote/Kconfig new file mode 100644 index 00000000..2d6b4f4c --- /dev/null +++ b/hw/remote/Kconfig @@ -0,0 +1,8 @@ +config MULTIPROCESS + bool + depends on PCI && PCI_EXPRESS && KVM + select REMOTE_PCIHOST + +config VFIO_USER_SERVER + bool + depends on MULTIPROCESS diff --git a/hw/remote/iohub.c b/hw/remote/iohub.c new file mode 100644 index 00000000..40dfee4b --- /dev/null +++ b/hw/remote/iohub.c @@ -0,0 +1,117 @@ +/* + * Remote IO Hub + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" + +#include "hw/pci/pci.h" +#include "hw/pci/pci_ids.h" +#include "hw/pci/pci_bus.h" +#include "qemu/thread.h" +#include "hw/remote/machine.h" +#include "hw/remote/iohub.h" +#include "qemu/main-loop.h" + +void remote_iohub_init(RemoteIOHubState *iohub) +{ + int pirq; + + memset(&iohub->irqfds, 0, sizeof(iohub->irqfds)); + memset(&iohub->resamplefds, 0, sizeof(iohub->resamplefds)); + + for (pirq = 0; pirq < REMOTE_IOHUB_NB_PIRQS; pirq++) { + qemu_mutex_init(&iohub->irq_level_lock[pirq]); + iohub->irq_level[pirq] = 0; + event_notifier_init_fd(&iohub->irqfds[pirq], -1); + event_notifier_init_fd(&iohub->resamplefds[pirq], -1); + } +} + +void remote_iohub_finalize(RemoteIOHubState *iohub) +{ + int pirq; + + for (pirq = 0; pirq < REMOTE_IOHUB_NB_PIRQS; pirq++) { + qemu_set_fd_handler(event_notifier_get_fd(&iohub->resamplefds[pirq]), + NULL, NULL, NULL); + event_notifier_cleanup(&iohub->irqfds[pirq]); + event_notifier_cleanup(&iohub->resamplefds[pirq]); + qemu_mutex_destroy(&iohub->irq_level_lock[pirq]); + } +} + +int remote_iohub_map_irq(PCIDevice *pci_dev, int intx) +{ + return pci_dev->devfn; +} + +void remote_iohub_set_irq(void *opaque, int pirq, int level) +{ + RemoteIOHubState *iohub = opaque; + + assert(pirq >= 0); + assert(pirq < PCI_DEVFN_MAX); + + QEMU_LOCK_GUARD(&iohub->irq_level_lock[pirq]); + + if (level) { + if (++iohub->irq_level[pirq] == 1) { + event_notifier_set(&iohub->irqfds[pirq]); + } + } else if (iohub->irq_level[pirq] > 0) { + iohub->irq_level[pirq]--; + } +} + +static void intr_resample_handler(void *opaque) +{ + ResampleToken *token = opaque; + RemoteIOHubState *iohub = token->iohub; + int pirq, s; + + pirq = token->pirq; + + s = event_notifier_test_and_clear(&iohub->resamplefds[pirq]); + + assert(s >= 0); + + QEMU_LOCK_GUARD(&iohub->irq_level_lock[pirq]); + + if (iohub->irq_level[pirq]) { + event_notifier_set(&iohub->irqfds[pirq]); + } +} + +void process_set_irqfd_msg(PCIDevice *pci_dev, MPQemuMsg *msg) +{ + RemoteMachineState *machine = REMOTE_MACHINE(current_machine); + RemoteIOHubState *iohub = &machine->iohub; + int pirq, intx; + + intx = pci_get_byte(pci_dev->config + PCI_INTERRUPT_PIN) - 1; + + pirq = remote_iohub_map_irq(pci_dev, intx); + + if (event_notifier_get_fd(&iohub->irqfds[pirq]) != -1) { + qemu_set_fd_handler(event_notifier_get_fd(&iohub->resamplefds[pirq]), + NULL, NULL, NULL); + event_notifier_cleanup(&iohub->irqfds[pirq]); + event_notifier_cleanup(&iohub->resamplefds[pirq]); + memset(&iohub->token[pirq], 0, sizeof(ResampleToken)); + } + + event_notifier_init_fd(&iohub->irqfds[pirq], msg->fds[0]); + event_notifier_init_fd(&iohub->resamplefds[pirq], msg->fds[1]); + + iohub->token[pirq].iohub = iohub; + iohub->token[pirq].pirq = pirq; + + qemu_set_fd_handler(msg->fds[1], intr_resample_handler, NULL, + &iohub->token[pirq]); +} diff --git a/hw/remote/iommu.c b/hw/remote/iommu.c new file mode 100644 index 00000000..1391dd71 --- /dev/null +++ b/hw/remote/iommu.c @@ -0,0 +1,131 @@ +/** + * IOMMU for remote device + * + * Copyright © 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" + +#include "hw/remote/iommu.h" +#include "hw/pci/pci_bus.h" +#include "hw/pci/pci.h" +#include "exec/memory.h" +#include "exec/address-spaces.h" +#include "trace.h" + +/** + * IOMMU for TYPE_REMOTE_MACHINE - manages DMA address space isolation + * for remote machine. It is used by TYPE_VFIO_USER_SERVER. + * + * - Each TYPE_VFIO_USER_SERVER instance handles one PCIDevice on a PCIBus. + * There is one RemoteIommu per PCIBus, so the RemoteIommu tracks multiple + * PCIDevices by maintaining a ->elem_by_devfn mapping. + * + * - memory_region_init_iommu() is not used because vfio-user MemoryRegions + * will be added to the elem->mr container instead. This is more natural + * than implementing the IOMMUMemoryRegionClass APIs since vfio-user + * provides something that is close to a full-fledged MemoryRegion and + * not like an IOMMU mapping. + * + * - When a device is hot unplugged, the elem->mr reference is dropped so + * all vfio-user MemoryRegions associated with this vfio-user server are + * destroyed. + */ + +static AddressSpace *remote_iommu_find_add_as(PCIBus *pci_bus, + void *opaque, int devfn) +{ + RemoteIommu *iommu = opaque; + RemoteIommuElem *elem = NULL; + + qemu_mutex_lock(&iommu->lock); + + elem = g_hash_table_lookup(iommu->elem_by_devfn, INT2VOIDP(devfn)); + + if (!elem) { + elem = g_new0(RemoteIommuElem, 1); + g_hash_table_insert(iommu->elem_by_devfn, INT2VOIDP(devfn), elem); + } + + if (!elem->mr) { + elem->mr = MEMORY_REGION(object_new(TYPE_MEMORY_REGION)); + memory_region_set_size(elem->mr, UINT64_MAX); + address_space_init(&elem->as, elem->mr, NULL); + } + + qemu_mutex_unlock(&iommu->lock); + + return &elem->as; +} + +void remote_iommu_unplug_dev(PCIDevice *pci_dev) +{ + AddressSpace *as = pci_device_iommu_address_space(pci_dev); + RemoteIommuElem *elem = NULL; + + if (as == &address_space_memory) { + return; + } + + elem = container_of(as, RemoteIommuElem, as); + + address_space_destroy(&elem->as); + + object_unref(elem->mr); + + elem->mr = NULL; +} + +static void remote_iommu_init(Object *obj) +{ + RemoteIommu *iommu = REMOTE_IOMMU(obj); + + iommu->elem_by_devfn = g_hash_table_new_full(NULL, NULL, NULL, g_free); + + qemu_mutex_init(&iommu->lock); +} + +static void remote_iommu_finalize(Object *obj) +{ + RemoteIommu *iommu = REMOTE_IOMMU(obj); + + qemu_mutex_destroy(&iommu->lock); + + g_hash_table_destroy(iommu->elem_by_devfn); + + iommu->elem_by_devfn = NULL; +} + +void remote_iommu_setup(PCIBus *pci_bus) +{ + RemoteIommu *iommu = NULL; + + g_assert(pci_bus); + + iommu = REMOTE_IOMMU(object_new(TYPE_REMOTE_IOMMU)); + + pci_setup_iommu(pci_bus, remote_iommu_find_add_as, iommu); + + object_property_add_child(OBJECT(pci_bus), "remote-iommu", OBJECT(iommu)); + + object_unref(OBJECT(iommu)); +} + +static const TypeInfo remote_iommu_info = { + .name = TYPE_REMOTE_IOMMU, + .parent = TYPE_OBJECT, + .instance_size = sizeof(RemoteIommu), + .instance_init = remote_iommu_init, + .instance_finalize = remote_iommu_finalize, +}; + +static void remote_iommu_register_types(void) +{ + type_register_static(&remote_iommu_info); +} + +type_init(remote_iommu_register_types) diff --git a/hw/remote/machine.c b/hw/remote/machine.c new file mode 100644 index 00000000..75d550da --- /dev/null +++ b/hw/remote/machine.c @@ -0,0 +1,160 @@ +/* + * Machine for remote device + * + * This machine type is used by the remote device process in multi-process + * QEMU. QEMU device models depend on parent busses, interrupt controllers, + * memory regions, etc. The remote machine type offers this environment so + * that QEMU device models can be used as remote devices. + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" + +#include "hw/remote/machine.h" +#include "exec/memory.h" +#include "qapi/error.h" +#include "hw/pci/pci_host.h" +#include "hw/remote/iohub.h" +#include "hw/remote/iommu.h" +#include "hw/qdev-core.h" +#include "hw/remote/iommu.h" +#include "hw/remote/vfio-user-obj.h" +#include "hw/pci/msi.h" + +static void remote_machine_init(MachineState *machine) +{ + MemoryRegion *system_memory, *system_io, *pci_memory; + RemoteMachineState *s = REMOTE_MACHINE(machine); + RemotePCIHost *rem_host; + PCIHostState *pci_host; + + system_memory = get_system_memory(); + system_io = get_system_io(); + + pci_memory = g_new(MemoryRegion, 1); + memory_region_init(pci_memory, NULL, "pci", UINT64_MAX); + + rem_host = REMOTE_PCIHOST(qdev_new(TYPE_REMOTE_PCIHOST)); + + rem_host->mr_pci_mem = pci_memory; + rem_host->mr_sys_mem = system_memory; + rem_host->mr_sys_io = system_io; + + s->host = rem_host; + + object_property_add_child(OBJECT(s), "remote-pcihost", OBJECT(rem_host)); + memory_region_add_subregion_overlap(system_memory, 0x0, pci_memory, -1); + + qdev_realize(DEVICE(rem_host), sysbus_get_default(), &error_fatal); + + pci_host = PCI_HOST_BRIDGE(rem_host); + + if (s->vfio_user) { + remote_iommu_setup(pci_host->bus); + + msi_nonbroken = true; + + vfu_object_set_bus_irq(pci_host->bus); + } else { + remote_iohub_init(&s->iohub); + + pci_bus_irqs(pci_host->bus, remote_iohub_set_irq, remote_iohub_map_irq, + &s->iohub, REMOTE_IOHUB_NB_PIRQS); + } + + qbus_set_hotplug_handler(BUS(pci_host->bus), OBJECT(s)); +} + +static bool remote_machine_get_vfio_user(Object *obj, Error **errp) +{ + RemoteMachineState *s = REMOTE_MACHINE(obj); + + return s->vfio_user; +} + +static void remote_machine_set_vfio_user(Object *obj, bool value, Error **errp) +{ + RemoteMachineState *s = REMOTE_MACHINE(obj); + + if (phase_check(PHASE_MACHINE_CREATED)) { + error_setg(errp, "Error enabling vfio-user - machine already created"); + return; + } + + s->vfio_user = value; +} + +static bool remote_machine_get_auto_shutdown(Object *obj, Error **errp) +{ + RemoteMachineState *s = REMOTE_MACHINE(obj); + + return s->auto_shutdown; +} + +static void remote_machine_set_auto_shutdown(Object *obj, bool value, + Error **errp) +{ + RemoteMachineState *s = REMOTE_MACHINE(obj); + + s->auto_shutdown = value; +} + +static void remote_machine_instance_init(Object *obj) +{ + RemoteMachineState *s = REMOTE_MACHINE(obj); + + s->auto_shutdown = true; +} + +static void remote_machine_dev_unplug_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + qdev_unrealize(dev); + + if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { + remote_iommu_unplug_dev(PCI_DEVICE(dev)); + } +} + +static void remote_machine_class_init(ObjectClass *oc, void *data) +{ + MachineClass *mc = MACHINE_CLASS(oc); + HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc); + + mc->init = remote_machine_init; + mc->desc = "Experimental remote machine"; + + hc->unplug = remote_machine_dev_unplug_cb; + + object_class_property_add_bool(oc, "vfio-user", + remote_machine_get_vfio_user, + remote_machine_set_vfio_user); + + object_class_property_add_bool(oc, "auto-shutdown", + remote_machine_get_auto_shutdown, + remote_machine_set_auto_shutdown); +} + +static const TypeInfo remote_machine = { + .name = TYPE_REMOTE_MACHINE, + .parent = TYPE_MACHINE, + .instance_size = sizeof(RemoteMachineState), + .instance_init = remote_machine_instance_init, + .class_init = remote_machine_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_HOTPLUG_HANDLER }, + { } + } +}; + +static void remote_machine_register_types(void) +{ + type_register_static(&remote_machine); +} + +type_init(remote_machine_register_types); diff --git a/hw/remote/memory.c b/hw/remote/memory.c new file mode 100644 index 00000000..6d60da91 --- /dev/null +++ b/hw/remote/memory.c @@ -0,0 +1,62 @@ +/* + * Memory manager for remote device + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" + +#include "hw/remote/memory.h" +#include "exec/ram_addr.h" +#include "qapi/error.h" + +static void remote_sysmem_reset(void) +{ + MemoryRegion *sysmem, *subregion, *next; + + sysmem = get_system_memory(); + + QTAILQ_FOREACH_SAFE(subregion, &sysmem->subregions, subregions_link, next) { + if (subregion->ram) { + memory_region_del_subregion(sysmem, subregion); + object_unparent(OBJECT(subregion)); + } + } +} + +void remote_sysmem_reconfig(MPQemuMsg *msg, Error **errp) +{ + ERRP_GUARD(); + SyncSysmemMsg *sysmem_info = &msg->data.sync_sysmem; + MemoryRegion *sysmem, *subregion; + static unsigned int suffix; + int region; + + sysmem = get_system_memory(); + + remote_sysmem_reset(); + + for (region = 0; region < msg->num_fds; region++, suffix++) { + g_autofree char *name = g_strdup_printf("remote-mem-%u", suffix); + subregion = g_new(MemoryRegion, 1); + memory_region_init_ram_from_fd(subregion, NULL, + name, sysmem_info->sizes[region], + RAM_SHARED, msg->fds[region], + sysmem_info->offsets[region], + errp); + + if (*errp) { + g_free(subregion); + remote_sysmem_reset(); + return; + } + + memory_region_add_subregion(sysmem, sysmem_info->gpas[region], + subregion); + + } +} diff --git a/hw/remote/meson.build b/hw/remote/meson.build new file mode 100644 index 00000000..ab25c049 --- /dev/null +++ b/hw/remote/meson.build @@ -0,0 +1,17 @@ +remote_ss = ss.source_set() + +remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('machine.c')) +remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('mpqemu-link.c')) +remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c')) +remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c')) +remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy.c')) +remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iohub.c')) +remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iommu.c')) +remote_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_true: files('vfio-user-obj.c')) + +remote_ss.add(when: 'CONFIG_VFIO_USER_SERVER', if_true: libvfio_user_dep) + +specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('memory.c')) +specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy-memory-listener.c')) + +softmmu_ss.add_all(when: 'CONFIG_MULTIPROCESS', if_true: remote_ss) diff --git a/hw/remote/message.c b/hw/remote/message.c new file mode 100644 index 00000000..50f6bf2d --- /dev/null +++ b/hw/remote/message.c @@ -0,0 +1,229 @@ +/* + * Copyright © 2020, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL-v2, version 2 or later. + * + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" + +#include "hw/remote/machine.h" +#include "io/channel.h" +#include "hw/remote/mpqemu-link.h" +#include "qapi/error.h" +#include "sysemu/runstate.h" +#include "hw/pci/pci.h" +#include "exec/memattrs.h" +#include "hw/remote/memory.h" +#include "hw/remote/iohub.h" +#include "sysemu/reset.h" + +static void process_config_write(QIOChannel *ioc, PCIDevice *dev, + MPQemuMsg *msg, Error **errp); +static void process_config_read(QIOChannel *ioc, PCIDevice *dev, + MPQemuMsg *msg, Error **errp); +static void process_bar_write(QIOChannel *ioc, MPQemuMsg *msg, Error **errp); +static void process_bar_read(QIOChannel *ioc, MPQemuMsg *msg, Error **errp); +static void process_device_reset_msg(QIOChannel *ioc, PCIDevice *dev, + Error **errp); + +void coroutine_fn mpqemu_remote_msg_loop_co(void *data) +{ + g_autofree RemoteCommDev *com = (RemoteCommDev *)data; + PCIDevice *pci_dev = NULL; + Error *local_err = NULL; + + assert(com->ioc); + + pci_dev = com->dev; + for (; !local_err;) { + MPQemuMsg msg = {0}; + + if (!mpqemu_msg_recv(&msg, com->ioc, &local_err)) { + break; + } + + if (!mpqemu_msg_valid(&msg)) { + error_setg(&local_err, "Received invalid message from proxy" + "in remote process pid="FMT_pid"", + getpid()); + break; + } + + switch (msg.cmd) { + case MPQEMU_CMD_PCI_CFGWRITE: + process_config_write(com->ioc, pci_dev, &msg, &local_err); + break; + case MPQEMU_CMD_PCI_CFGREAD: + process_config_read(com->ioc, pci_dev, &msg, &local_err); + break; + case MPQEMU_CMD_BAR_WRITE: + process_bar_write(com->ioc, &msg, &local_err); + break; + case MPQEMU_CMD_BAR_READ: + process_bar_read(com->ioc, &msg, &local_err); + break; + case MPQEMU_CMD_SYNC_SYSMEM: + remote_sysmem_reconfig(&msg, &local_err); + break; + case MPQEMU_CMD_SET_IRQFD: + process_set_irqfd_msg(pci_dev, &msg); + break; + case MPQEMU_CMD_DEVICE_RESET: + process_device_reset_msg(com->ioc, pci_dev, &local_err); + break; + default: + error_setg(&local_err, + "Unknown command (%d) received for device %s" + " (pid="FMT_pid")", + msg.cmd, DEVICE(pci_dev)->id, getpid()); + } + } + + if (local_err) { + error_report_err(local_err); + qemu_system_shutdown_request(SHUTDOWN_CAUSE_HOST_ERROR); + } else { + qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); + } +} + +static void process_config_write(QIOChannel *ioc, PCIDevice *dev, + MPQemuMsg *msg, Error **errp) +{ + ERRP_GUARD(); + PciConfDataMsg *conf = (PciConfDataMsg *)&msg->data.pci_conf_data; + MPQemuMsg ret = { 0 }; + + if ((conf->addr + sizeof(conf->val)) > pci_config_size(dev)) { + error_setg(errp, "Bad address for PCI config write, pid "FMT_pid".", + getpid()); + ret.data.u64 = UINT64_MAX; + } else { + pci_default_write_config(dev, conf->addr, conf->val, conf->len); + } + + ret.cmd = MPQEMU_CMD_RET; + ret.size = sizeof(ret.data.u64); + + if (!mpqemu_msg_send(&ret, ioc, NULL)) { + error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ", + getpid()); + } +} + +static void process_config_read(QIOChannel *ioc, PCIDevice *dev, + MPQemuMsg *msg, Error **errp) +{ + ERRP_GUARD(); + PciConfDataMsg *conf = (PciConfDataMsg *)&msg->data.pci_conf_data; + MPQemuMsg ret = { 0 }; + + if ((conf->addr + sizeof(conf->val)) > pci_config_size(dev)) { + error_setg(errp, "Bad address for PCI config read, pid "FMT_pid".", + getpid()); + ret.data.u64 = UINT64_MAX; + } else { + ret.data.u64 = pci_default_read_config(dev, conf->addr, conf->len); + } + + ret.cmd = MPQEMU_CMD_RET; + ret.size = sizeof(ret.data.u64); + + if (!mpqemu_msg_send(&ret, ioc, NULL)) { + error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ", + getpid()); + } +} + +static void process_bar_write(QIOChannel *ioc, MPQemuMsg *msg, Error **errp) +{ + ERRP_GUARD(); + BarAccessMsg *bar_access = &msg->data.bar_access; + AddressSpace *as = + bar_access->memory ? &address_space_memory : &address_space_io; + MPQemuMsg ret = { 0 }; + MemTxResult res; + uint64_t val; + + if (!is_power_of_2(bar_access->size) || + (bar_access->size > sizeof(uint64_t))) { + ret.data.u64 = UINT64_MAX; + goto fail; + } + + val = cpu_to_le64(bar_access->val); + + res = address_space_rw(as, bar_access->addr, MEMTXATTRS_UNSPECIFIED, + (void *)&val, bar_access->size, true); + + if (res != MEMTX_OK) { + error_setg(errp, "Bad address %"PRIx64" for mem write, pid "FMT_pid".", + bar_access->addr, getpid()); + ret.data.u64 = -1; + } + +fail: + ret.cmd = MPQEMU_CMD_RET; + ret.size = sizeof(ret.data.u64); + + if (!mpqemu_msg_send(&ret, ioc, NULL)) { + error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ", + getpid()); + } +} + +static void process_bar_read(QIOChannel *ioc, MPQemuMsg *msg, Error **errp) +{ + ERRP_GUARD(); + BarAccessMsg *bar_access = &msg->data.bar_access; + MPQemuMsg ret = { 0 }; + AddressSpace *as; + MemTxResult res; + uint64_t val = 0; + + as = bar_access->memory ? &address_space_memory : &address_space_io; + + if (!is_power_of_2(bar_access->size) || + (bar_access->size > sizeof(uint64_t))) { + val = UINT64_MAX; + goto fail; + } + + res = address_space_rw(as, bar_access->addr, MEMTXATTRS_UNSPECIFIED, + (void *)&val, bar_access->size, false); + + if (res != MEMTX_OK) { + error_setg(errp, "Bad address %"PRIx64" for mem read, pid "FMT_pid".", + bar_access->addr, getpid()); + val = UINT64_MAX; + } + +fail: + ret.cmd = MPQEMU_CMD_RET; + ret.data.u64 = le64_to_cpu(val); + ret.size = sizeof(ret.data.u64); + + if (!mpqemu_msg_send(&ret, ioc, NULL)) { + error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ", + getpid()); + } +} + +static void process_device_reset_msg(QIOChannel *ioc, PCIDevice *dev, + Error **errp) +{ + DeviceClass *dc = DEVICE_GET_CLASS(dev); + DeviceState *s = DEVICE(dev); + MPQemuMsg ret = { 0 }; + + if (dc->reset) { + dc->reset(s); + } + + ret.cmd = MPQEMU_CMD_RET; + + mpqemu_msg_send(&ret, ioc, errp); +} diff --git a/hw/remote/mpqemu-link.c b/hw/remote/mpqemu-link.c new file mode 100644 index 00000000..9bd98e82 --- /dev/null +++ b/hw/remote/mpqemu-link.c @@ -0,0 +1,263 @@ +/* + * Communication channel between QEMU and remote device process + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" + +#include "qemu/module.h" +#include "hw/remote/mpqemu-link.h" +#include "qapi/error.h" +#include "qemu/iov.h" +#include "qemu/error-report.h" +#include "qemu/main-loop.h" +#include "io/channel.h" +#include "sysemu/iothread.h" +#include "trace.h" + +/* + * Send message over the ioc QIOChannel. + * This function is safe to call from: + * - main loop in co-routine context. Will block the main loop if not in + * co-routine context; + * - vCPU thread with no co-routine context and if the channel is not part + * of the main loop handling; + * - IOThread within co-routine context, outside of co-routine context + * will block IOThread; + * Returns true if no errors were encountered, false otherwise. + */ +bool mpqemu_msg_send(MPQemuMsg *msg, QIOChannel *ioc, Error **errp) +{ + bool iolock = qemu_mutex_iothread_locked(); + bool iothread = qemu_in_iothread(); + struct iovec send[2] = {}; + int *fds = NULL; + size_t nfds = 0; + bool ret = false; + + send[0].iov_base = msg; + send[0].iov_len = MPQEMU_MSG_HDR_SIZE; + + send[1].iov_base = (void *)&msg->data; + send[1].iov_len = msg->size; + + if (msg->num_fds) { + nfds = msg->num_fds; + fds = msg->fds; + } + + /* + * Dont use in IOThread out of co-routine context as + * it will block IOThread. + */ + assert(qemu_in_coroutine() || !iothread); + + /* + * Skip unlocking/locking iothread lock when the IOThread is running + * in co-routine context. Co-routine context is asserted above + * for IOThread case. + * Also skip lock handling while in a co-routine in the main context. + */ + if (iolock && !iothread && !qemu_in_coroutine()) { + qemu_mutex_unlock_iothread(); + } + + if (!qio_channel_writev_full_all(ioc, send, G_N_ELEMENTS(send), + fds, nfds, 0, errp)) { + ret = true; + } else { + trace_mpqemu_send_io_error(msg->cmd, msg->size, nfds); + } + + if (iolock && !iothread && !qemu_in_coroutine()) { + /* See above comment why skip locking here. */ + qemu_mutex_lock_iothread(); + } + + return ret; +} + +/* + * Read message from the ioc QIOChannel. + * This function is safe to call from: + * - From main loop in co-routine context. Will block the main loop if not in + * co-routine context; + * - From vCPU thread with no co-routine context and if the channel is not part + * of the main loop handling; + * - From IOThread within co-routine context, outside of co-routine context + * will block IOThread; + */ +static ssize_t mpqemu_read(QIOChannel *ioc, void *buf, size_t len, int **fds, + size_t *nfds, Error **errp) +{ + struct iovec iov = { .iov_base = buf, .iov_len = len }; + bool iolock = qemu_mutex_iothread_locked(); + bool iothread = qemu_in_iothread(); + int ret = -1; + + /* + * Dont use in IOThread out of co-routine context as + * it will block IOThread. + */ + assert(qemu_in_coroutine() || !iothread); + + if (iolock && !iothread && !qemu_in_coroutine()) { + qemu_mutex_unlock_iothread(); + } + + ret = qio_channel_readv_full_all_eof(ioc, &iov, 1, fds, nfds, errp); + + if (iolock && !iothread && !qemu_in_coroutine()) { + qemu_mutex_lock_iothread(); + } + + return (ret <= 0) ? ret : iov.iov_len; +} + +bool mpqemu_msg_recv(MPQemuMsg *msg, QIOChannel *ioc, Error **errp) +{ + ERRP_GUARD(); + g_autofree int *fds = NULL; + size_t nfds = 0; + ssize_t len; + bool ret = false; + + len = mpqemu_read(ioc, msg, MPQEMU_MSG_HDR_SIZE, &fds, &nfds, errp); + if (len <= 0) { + goto fail; + } else if (len != MPQEMU_MSG_HDR_SIZE) { + error_setg(errp, "Message header corrupted"); + goto fail; + } + + if (msg->size > sizeof(msg->data)) { + error_setg(errp, "Invalid size for message"); + goto fail; + } + + if (!msg->size) { + goto copy_fds; + } + + len = mpqemu_read(ioc, &msg->data, msg->size, NULL, NULL, errp); + if (len <= 0) { + goto fail; + } + if (len != msg->size) { + error_setg(errp, "Unable to read full message"); + goto fail; + } + +copy_fds: + msg->num_fds = nfds; + if (nfds > G_N_ELEMENTS(msg->fds)) { + error_setg(errp, + "Overflow error: received %zu fds, more than max of %d fds", + nfds, REMOTE_MAX_FDS); + goto fail; + } + if (nfds) { + memcpy(msg->fds, fds, nfds * sizeof(int)); + } + + ret = true; + +fail: + if (*errp) { + trace_mpqemu_recv_io_error(msg->cmd, msg->size, nfds); + } + while (*errp && nfds) { + close(fds[nfds - 1]); + nfds--; + } + + return ret; +} + +/* + * Send msg and wait for a reply with command code RET_MSG. + * Returns the message received of size u64 or UINT64_MAX + * on error. + * Called from VCPU thread in non-coroutine context. + * Used by the Proxy object to communicate to remote processes. + */ +uint64_t mpqemu_msg_send_and_await_reply(MPQemuMsg *msg, PCIProxyDev *pdev, + Error **errp) +{ + MPQemuMsg msg_reply = {0}; + uint64_t ret = UINT64_MAX; + + assert(!qemu_in_coroutine()); + + QEMU_LOCK_GUARD(&pdev->io_mutex); + if (!mpqemu_msg_send(msg, pdev->ioc, errp)) { + return ret; + } + + if (!mpqemu_msg_recv(&msg_reply, pdev->ioc, errp)) { + return ret; + } + + if (!mpqemu_msg_valid(&msg_reply) || msg_reply.cmd != MPQEMU_CMD_RET) { + error_setg(errp, "ERROR: Invalid reply received for command %d", + msg->cmd); + return ret; + } + + return msg_reply.data.u64; +} + +bool mpqemu_msg_valid(MPQemuMsg *msg) +{ + if (msg->cmd >= MPQEMU_CMD_MAX || msg->cmd < 0) { + return false; + } + + /* Verify FDs. */ + if (msg->num_fds >= REMOTE_MAX_FDS) { + return false; + } + + if (msg->num_fds > 0) { + for (int i = 0; i < msg->num_fds; i++) { + if (fcntl(msg->fds[i], F_GETFL) == -1) { + return false; + } + } + } + + /* Verify message specific fields. */ + switch (msg->cmd) { + case MPQEMU_CMD_SYNC_SYSMEM: + if (msg->num_fds == 0 || msg->size != sizeof(SyncSysmemMsg)) { + return false; + } + break; + case MPQEMU_CMD_PCI_CFGWRITE: + case MPQEMU_CMD_PCI_CFGREAD: + if (msg->size != sizeof(PciConfDataMsg)) { + return false; + } + break; + case MPQEMU_CMD_BAR_WRITE: + case MPQEMU_CMD_BAR_READ: + if ((msg->size != sizeof(BarAccessMsg)) || (msg->num_fds != 0)) { + return false; + } + break; + case MPQEMU_CMD_SET_IRQFD: + if (msg->size || (msg->num_fds != 2)) { + return false; + } + break; + default: + break; + } + + return true; +} diff --git a/hw/remote/proxy-memory-listener.c b/hw/remote/proxy-memory-listener.c new file mode 100644 index 00000000..eb9918fe --- /dev/null +++ b/hw/remote/proxy-memory-listener.c @@ -0,0 +1,226 @@ +/* + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" + +#include "qemu/compiler.h" +#include "qemu/int128.h" +#include "qemu/range.h" +#include "exec/memory.h" +#include "exec/cpu-common.h" +#include "exec/ram_addr.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "hw/remote/mpqemu-link.h" +#include "hw/remote/proxy-memory-listener.h" + +/* + * TODO: get_fd_from_hostaddr(), proxy_mrs_can_merge() and + * proxy_memory_listener_commit() defined below perform tasks similar to the + * functions defined in vhost-user.c. These functions are good candidates + * for refactoring. + * + */ + +static void proxy_memory_listener_reset(MemoryListener *listener) +{ + ProxyMemoryListener *proxy_listener = container_of(listener, + ProxyMemoryListener, + listener); + int mrs; + + for (mrs = 0; mrs < proxy_listener->n_mr_sections; mrs++) { + memory_region_unref(proxy_listener->mr_sections[mrs].mr); + } + + g_free(proxy_listener->mr_sections); + proxy_listener->mr_sections = NULL; + proxy_listener->n_mr_sections = 0; +} + +static int get_fd_from_hostaddr(uint64_t host, ram_addr_t *offset) +{ + MemoryRegion *mr; + ram_addr_t off; + + /** + * Assumes that the host address is a valid address as it's + * coming from the MemoryListener system. In the case host + * address is not valid, the following call would return + * the default subregion of "system_memory" region, and + * not NULL. So it's not possible to check for NULL here. + */ + mr = memory_region_from_host((void *)(uintptr_t)host, &off); + + if (offset) { + *offset = off; + } + + return memory_region_get_fd(mr); +} + +static bool proxy_mrs_can_merge(uint64_t host, uint64_t prev_host, size_t size) +{ + if (((prev_host + size) != host)) { + return false; + } + + if (get_fd_from_hostaddr(host, NULL) != + get_fd_from_hostaddr(prev_host, NULL)) { + return false; + } + + return true; +} + +static bool try_merge(ProxyMemoryListener *proxy_listener, + MemoryRegionSection *section) +{ + uint64_t mrs_size, mrs_gpa, mrs_page; + MemoryRegionSection *prev_sec; + bool merged = false; + uintptr_t mrs_host; + RAMBlock *mrs_rb; + + if (!proxy_listener->n_mr_sections) { + return false; + } + + mrs_rb = section->mr->ram_block; + mrs_page = (uint64_t)qemu_ram_pagesize(mrs_rb); + mrs_size = int128_get64(section->size); + mrs_gpa = section->offset_within_address_space; + mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) + + section->offset_within_region; + + if (get_fd_from_hostaddr(mrs_host, NULL) < 0) { + return true; + } + + mrs_host = mrs_host & ~(mrs_page - 1); + mrs_gpa = mrs_gpa & ~(mrs_page - 1); + mrs_size = ROUND_UP(mrs_size, mrs_page); + + prev_sec = proxy_listener->mr_sections + + (proxy_listener->n_mr_sections - 1); + uint64_t prev_gpa_start = prev_sec->offset_within_address_space; + uint64_t prev_size = int128_get64(prev_sec->size); + uint64_t prev_gpa_end = range_get_last(prev_gpa_start, prev_size); + uint64_t prev_host_start = + (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) + + prev_sec->offset_within_region; + uint64_t prev_host_end = range_get_last(prev_host_start, prev_size); + + if (mrs_gpa <= (prev_gpa_end + 1)) { + g_assert(mrs_gpa > prev_gpa_start); + + if ((section->mr == prev_sec->mr) && + proxy_mrs_can_merge(mrs_host, prev_host_start, + (mrs_gpa - prev_gpa_start))) { + uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size); + merged = true; + prev_sec->offset_within_address_space = + MIN(prev_gpa_start, mrs_gpa); + prev_sec->offset_within_region = + MIN(prev_host_start, mrs_host) - + (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr); + prev_sec->size = int128_make64(max_end - MIN(prev_host_start, + mrs_host)); + } + } + + return merged; +} + +static void proxy_memory_listener_region_addnop(MemoryListener *listener, + MemoryRegionSection *section) +{ + ProxyMemoryListener *proxy_listener = container_of(listener, + ProxyMemoryListener, + listener); + + if (!memory_region_is_ram(section->mr) || + memory_region_is_rom(section->mr)) { + return; + } + + if (try_merge(proxy_listener, section)) { + return; + } + + ++proxy_listener->n_mr_sections; + proxy_listener->mr_sections = g_renew(MemoryRegionSection, + proxy_listener->mr_sections, + proxy_listener->n_mr_sections); + proxy_listener->mr_sections[proxy_listener->n_mr_sections - 1] = *section; + proxy_listener->mr_sections[proxy_listener->n_mr_sections - 1].fv = NULL; + memory_region_ref(section->mr); +} + +static void proxy_memory_listener_commit(MemoryListener *listener) +{ + ProxyMemoryListener *proxy_listener = container_of(listener, + ProxyMemoryListener, + listener); + MPQemuMsg msg; + MemoryRegionSection *section; + ram_addr_t offset; + uintptr_t host_addr; + int region; + Error *local_err = NULL; + + memset(&msg, 0, sizeof(MPQemuMsg)); + + msg.cmd = MPQEMU_CMD_SYNC_SYSMEM; + msg.num_fds = proxy_listener->n_mr_sections; + msg.size = sizeof(SyncSysmemMsg); + if (msg.num_fds > REMOTE_MAX_FDS) { + error_report("Number of fds is more than %d", REMOTE_MAX_FDS); + return; + } + + for (region = 0; region < proxy_listener->n_mr_sections; region++) { + section = &proxy_listener->mr_sections[region]; + msg.data.sync_sysmem.gpas[region] = + section->offset_within_address_space; + msg.data.sync_sysmem.sizes[region] = int128_get64(section->size); + host_addr = (uintptr_t)memory_region_get_ram_ptr(section->mr) + + section->offset_within_region; + msg.fds[region] = get_fd_from_hostaddr(host_addr, &offset); + msg.data.sync_sysmem.offsets[region] = offset; + } + if (!mpqemu_msg_send(&msg, proxy_listener->ioc, &local_err)) { + error_report_err(local_err); + } +} + +void proxy_memory_listener_deconfigure(ProxyMemoryListener *proxy_listener) +{ + memory_listener_unregister(&proxy_listener->listener); + + proxy_memory_listener_reset(&proxy_listener->listener); +} + +void proxy_memory_listener_configure(ProxyMemoryListener *proxy_listener, + QIOChannel *ioc) +{ + proxy_listener->n_mr_sections = 0; + proxy_listener->mr_sections = NULL; + + proxy_listener->ioc = ioc; + + proxy_listener->listener.begin = proxy_memory_listener_reset; + proxy_listener->listener.commit = proxy_memory_listener_commit; + proxy_listener->listener.region_add = proxy_memory_listener_region_addnop; + proxy_listener->listener.region_nop = proxy_memory_listener_region_addnop; + proxy_listener->listener.priority = 10; + proxy_listener->listener.name = "proxy"; + + memory_listener_register(&proxy_listener->listener, + &address_space_memory); +} diff --git a/hw/remote/proxy.c b/hw/remote/proxy.c new file mode 100644 index 00000000..1c7786b5 --- /dev/null +++ b/hw/remote/proxy.c @@ -0,0 +1,386 @@ +/* + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" + +#include "hw/remote/proxy.h" +#include "hw/pci/pci.h" +#include "qapi/error.h" +#include "io/channel-util.h" +#include "hw/qdev-properties.h" +#include "monitor/monitor.h" +#include "migration/blocker.h" +#include "qemu/sockets.h" +#include "hw/remote/mpqemu-link.h" +#include "qemu/error-report.h" +#include "hw/remote/proxy-memory-listener.h" +#include "qom/object.h" +#include "qemu/event_notifier.h" +#include "sysemu/kvm.h" +#include "util/event_notifier-posix.c" + +static void probe_pci_info(PCIDevice *dev, Error **errp); +static void proxy_device_reset(DeviceState *dev); + +static void proxy_intx_update(PCIDevice *pci_dev) +{ + PCIProxyDev *dev = PCI_PROXY_DEV(pci_dev); + PCIINTxRoute route; + int pin = pci_get_byte(pci_dev->config + PCI_INTERRUPT_PIN) - 1; + + if (dev->virq != -1) { + kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &dev->intr, dev->virq); + dev->virq = -1; + } + + route = pci_device_route_intx_to_irq(pci_dev, pin); + + dev->virq = route.irq; + + if (dev->virq != -1) { + kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &dev->intr, + &dev->resample, dev->virq); + } +} + +static void setup_irqfd(PCIProxyDev *dev) +{ + PCIDevice *pci_dev = PCI_DEVICE(dev); + MPQemuMsg msg; + Error *local_err = NULL; + + event_notifier_init(&dev->intr, 0); + event_notifier_init(&dev->resample, 0); + + memset(&msg, 0, sizeof(MPQemuMsg)); + msg.cmd = MPQEMU_CMD_SET_IRQFD; + msg.num_fds = 2; + msg.fds[0] = event_notifier_get_fd(&dev->intr); + msg.fds[1] = event_notifier_get_fd(&dev->resample); + msg.size = 0; + + if (!mpqemu_msg_send(&msg, dev->ioc, &local_err)) { + error_report_err(local_err); + } + + dev->virq = -1; + + proxy_intx_update(pci_dev); + + pci_device_set_intx_routing_notifier(pci_dev, proxy_intx_update); +} + +static void pci_proxy_dev_realize(PCIDevice *device, Error **errp) +{ + ERRP_GUARD(); + PCIProxyDev *dev = PCI_PROXY_DEV(device); + uint8_t *pci_conf = device->config; + int fd; + + if (!dev->fd) { + error_setg(errp, "fd parameter not specified for %s", + DEVICE(device)->id); + return; + } + + fd = monitor_fd_param(monitor_cur(), dev->fd, errp); + if (fd == -1) { + error_prepend(errp, "proxy: unable to parse fd %s: ", dev->fd); + return; + } + + if (!fd_is_socket(fd)) { + error_setg(errp, "proxy: fd %d is not a socket", fd); + close(fd); + return; + } + + dev->ioc = qio_channel_new_fd(fd, errp); + if (!dev->ioc) { + close(fd); + return; + } + + error_setg(&dev->migration_blocker, "%s does not support migration", + TYPE_PCI_PROXY_DEV); + if (migrate_add_blocker(dev->migration_blocker, errp) < 0) { + error_free(dev->migration_blocker); + object_unref(dev->ioc); + return; + } + + qemu_mutex_init(&dev->io_mutex); + qio_channel_set_blocking(dev->ioc, true, NULL); + + pci_conf[PCI_LATENCY_TIMER] = 0xff; + pci_conf[PCI_INTERRUPT_PIN] = 0x01; + + proxy_memory_listener_configure(&dev->proxy_listener, dev->ioc); + + setup_irqfd(dev); + + probe_pci_info(PCI_DEVICE(dev), errp); +} + +static void pci_proxy_dev_exit(PCIDevice *pdev) +{ + PCIProxyDev *dev = PCI_PROXY_DEV(pdev); + + if (dev->ioc) { + qio_channel_close(dev->ioc, NULL); + } + + migrate_del_blocker(dev->migration_blocker); + + error_free(dev->migration_blocker); + + proxy_memory_listener_deconfigure(&dev->proxy_listener); + + event_notifier_cleanup(&dev->intr); + event_notifier_cleanup(&dev->resample); +} + +static void config_op_send(PCIProxyDev *pdev, uint32_t addr, uint32_t *val, + int len, unsigned int op) +{ + MPQemuMsg msg = { 0 }; + uint64_t ret = -EINVAL; + Error *local_err = NULL; + + msg.cmd = op; + msg.data.pci_conf_data.addr = addr; + msg.data.pci_conf_data.val = (op == MPQEMU_CMD_PCI_CFGWRITE) ? *val : 0; + msg.data.pci_conf_data.len = len; + msg.size = sizeof(PciConfDataMsg); + + ret = mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err); + if (local_err) { + error_report_err(local_err); + } + + if (ret == UINT64_MAX) { + error_report("Failed to perform PCI config %s operation", + (op == MPQEMU_CMD_PCI_CFGREAD) ? "READ" : "WRITE"); + } + + if (op == MPQEMU_CMD_PCI_CFGREAD) { + *val = (uint32_t)ret; + } +} + +static uint32_t pci_proxy_read_config(PCIDevice *d, uint32_t addr, int len) +{ + uint32_t val; + + config_op_send(PCI_PROXY_DEV(d), addr, &val, len, MPQEMU_CMD_PCI_CFGREAD); + + return val; +} + +static void pci_proxy_write_config(PCIDevice *d, uint32_t addr, uint32_t val, + int len) +{ + /* + * Some of the functions access the copy of remote device's PCI config + * space which is cached in the proxy device. Therefore, maintain + * it updated. + */ + pci_default_write_config(d, addr, val, len); + + config_op_send(PCI_PROXY_DEV(d), addr, &val, len, MPQEMU_CMD_PCI_CFGWRITE); +} + +static Property proxy_properties[] = { + DEFINE_PROP_STRING("fd", PCIProxyDev, fd), + DEFINE_PROP_END_OF_LIST(), +}; + +static void pci_proxy_dev_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); + + k->realize = pci_proxy_dev_realize; + k->exit = pci_proxy_dev_exit; + k->config_read = pci_proxy_read_config; + k->config_write = pci_proxy_write_config; + + dc->reset = proxy_device_reset; + + device_class_set_props(dc, proxy_properties); +} + +static const TypeInfo pci_proxy_dev_type_info = { + .name = TYPE_PCI_PROXY_DEV, + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(PCIProxyDev), + .class_init = pci_proxy_dev_class_init, + .interfaces = (InterfaceInfo[]) { + { INTERFACE_CONVENTIONAL_PCI_DEVICE }, + { }, + }, +}; + +static void pci_proxy_dev_register_types(void) +{ + type_register_static(&pci_proxy_dev_type_info); +} + +type_init(pci_proxy_dev_register_types) + +static void send_bar_access_msg(PCIProxyDev *pdev, MemoryRegion *mr, + bool write, hwaddr addr, uint64_t *val, + unsigned size, bool memory) +{ + MPQemuMsg msg = { 0 }; + long ret = -EINVAL; + Error *local_err = NULL; + + msg.size = sizeof(BarAccessMsg); + msg.data.bar_access.addr = mr->addr + addr; + msg.data.bar_access.size = size; + msg.data.bar_access.memory = memory; + + if (write) { + msg.cmd = MPQEMU_CMD_BAR_WRITE; + msg.data.bar_access.val = *val; + } else { + msg.cmd = MPQEMU_CMD_BAR_READ; + } + + ret = mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err); + if (local_err) { + error_report_err(local_err); + } + + if (!write) { + *val = ret; + } +} + +static void proxy_bar_write(void *opaque, hwaddr addr, uint64_t val, + unsigned size) +{ + ProxyMemoryRegion *pmr = opaque; + + send_bar_access_msg(pmr->dev, &pmr->mr, true, addr, &val, size, + pmr->memory); +} + +static uint64_t proxy_bar_read(void *opaque, hwaddr addr, unsigned size) +{ + ProxyMemoryRegion *pmr = opaque; + uint64_t val; + + send_bar_access_msg(pmr->dev, &pmr->mr, false, addr, &val, size, + pmr->memory); + + return val; +} + +const MemoryRegionOps proxy_mr_ops = { + .read = proxy_bar_read, + .write = proxy_bar_write, + .endianness = DEVICE_NATIVE_ENDIAN, + .impl = { + .min_access_size = 1, + .max_access_size = 8, + }, +}; + +static void probe_pci_info(PCIDevice *dev, Error **errp) +{ + PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev); + uint32_t orig_val, new_val, base_class, val; + PCIProxyDev *pdev = PCI_PROXY_DEV(dev); + DeviceClass *dc = DEVICE_CLASS(pc); + uint8_t type; + int i, size; + + config_op_send(pdev, PCI_VENDOR_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD); + pc->vendor_id = (uint16_t)val; + + config_op_send(pdev, PCI_DEVICE_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD); + pc->device_id = (uint16_t)val; + + config_op_send(pdev, PCI_CLASS_DEVICE, &val, 2, MPQEMU_CMD_PCI_CFGREAD); + pc->class_id = (uint16_t)val; + + config_op_send(pdev, PCI_SUBSYSTEM_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD); + pc->subsystem_id = (uint16_t)val; + + base_class = pc->class_id >> 4; + switch (base_class) { + case PCI_BASE_CLASS_BRIDGE: + set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories); + break; + case PCI_BASE_CLASS_STORAGE: + set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); + break; + case PCI_BASE_CLASS_NETWORK: + case PCI_BASE_CLASS_WIRELESS: + set_bit(DEVICE_CATEGORY_NETWORK, dc->categories); + break; + case PCI_BASE_CLASS_INPUT: + set_bit(DEVICE_CATEGORY_INPUT, dc->categories); + break; + case PCI_BASE_CLASS_DISPLAY: + set_bit(DEVICE_CATEGORY_DISPLAY, dc->categories); + break; + case PCI_BASE_CLASS_PROCESSOR: + set_bit(DEVICE_CATEGORY_CPU, dc->categories); + break; + default: + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + break; + } + + for (i = 0; i < PCI_NUM_REGIONS; i++) { + config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &orig_val, 4, + MPQEMU_CMD_PCI_CFGREAD); + new_val = 0xffffffff; + config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &new_val, 4, + MPQEMU_CMD_PCI_CFGWRITE); + config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &new_val, 4, + MPQEMU_CMD_PCI_CFGREAD); + size = (~(new_val & 0xFFFFFFF0)) + 1; + config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &orig_val, 4, + MPQEMU_CMD_PCI_CFGWRITE); + type = (new_val & 0x1) ? + PCI_BASE_ADDRESS_SPACE_IO : PCI_BASE_ADDRESS_SPACE_MEMORY; + + if (size) { + g_autofree char *name = g_strdup_printf("bar-region-%d", i); + pdev->region[i].dev = pdev; + pdev->region[i].present = true; + if (type == PCI_BASE_ADDRESS_SPACE_MEMORY) { + pdev->region[i].memory = true; + } + memory_region_init_io(&pdev->region[i].mr, OBJECT(pdev), + &proxy_mr_ops, &pdev->region[i], + name, size); + pci_register_bar(dev, i, type, &pdev->region[i].mr); + } + } +} + +static void proxy_device_reset(DeviceState *dev) +{ + PCIProxyDev *pdev = PCI_PROXY_DEV(dev); + MPQemuMsg msg = { 0 }; + Error *local_err = NULL; + + msg.cmd = MPQEMU_CMD_DEVICE_RESET; + msg.size = 0; + + mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err); + if (local_err) { + error_report_err(local_err); + } + +} diff --git a/hw/remote/remote-obj.c b/hw/remote/remote-obj.c new file mode 100644 index 00000000..333e5ac4 --- /dev/null +++ b/hw/remote/remote-obj.c @@ -0,0 +1,202 @@ +/* + * Copyright © 2020, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL-v2, version 2 or later. + * + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" + +#include "qemu/error-report.h" +#include "qemu/notify.h" +#include "qom/object_interfaces.h" +#include "hw/qdev-core.h" +#include "io/channel.h" +#include "hw/qdev-core.h" +#include "hw/remote/machine.h" +#include "io/channel-util.h" +#include "qapi/error.h" +#include "sysemu/sysemu.h" +#include "hw/pci/pci.h" +#include "qemu/sockets.h" +#include "monitor/monitor.h" + +#define TYPE_REMOTE_OBJECT "x-remote-object" +OBJECT_DECLARE_TYPE(RemoteObject, RemoteObjectClass, REMOTE_OBJECT) + +struct RemoteObjectClass { + ObjectClass parent_class; + + unsigned int nr_devs; + unsigned int max_devs; +}; + +struct RemoteObject { + /* private */ + Object parent; + + Notifier machine_done; + + int32_t fd; + char *devid; + + QIOChannel *ioc; + + DeviceState *dev; + DeviceListener listener; +}; + +static void remote_object_set_fd(Object *obj, const char *str, Error **errp) +{ + RemoteObject *o = REMOTE_OBJECT(obj); + int fd = -1; + + fd = monitor_fd_param(monitor_cur(), str, errp); + if (fd == -1) { + error_prepend(errp, "Could not parse remote object fd %s:", str); + return; + } + + if (!fd_is_socket(fd)) { + error_setg(errp, "File descriptor '%s' is not a socket", str); + close(fd); + return; + } + + o->fd = fd; +} + +static void remote_object_set_devid(Object *obj, const char *str, Error **errp) +{ + RemoteObject *o = REMOTE_OBJECT(obj); + + g_free(o->devid); + + o->devid = g_strdup(str); +} + +static void remote_object_unrealize_listener(DeviceListener *listener, + DeviceState *dev) +{ + RemoteObject *o = container_of(listener, RemoteObject, listener); + + if (o->dev == dev) { + object_unref(OBJECT(o)); + } +} + +static void remote_object_machine_done(Notifier *notifier, void *data) +{ + RemoteObject *o = container_of(notifier, RemoteObject, machine_done); + DeviceState *dev = NULL; + QIOChannel *ioc = NULL; + Coroutine *co = NULL; + RemoteCommDev *comdev = NULL; + Error *err = NULL; + + dev = qdev_find_recursive(sysbus_get_default(), o->devid); + if (!dev || !object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { + error_report("%s is not a PCI device", o->devid); + return; + } + + ioc = qio_channel_new_fd(o->fd, &err); + if (!ioc) { + error_report_err(err); + return; + } + qio_channel_set_blocking(ioc, false, NULL); + + o->dev = dev; + + o->listener.unrealize = remote_object_unrealize_listener; + device_listener_register(&o->listener); + + /* co-routine should free this. */ + comdev = g_new0(RemoteCommDev, 1); + *comdev = (RemoteCommDev) { + .ioc = ioc, + .dev = PCI_DEVICE(dev), + }; + + co = qemu_coroutine_create(mpqemu_remote_msg_loop_co, comdev); + qemu_coroutine_enter(co); +} + +static void remote_object_init(Object *obj) +{ + RemoteObjectClass *k = REMOTE_OBJECT_GET_CLASS(obj); + RemoteObject *o = REMOTE_OBJECT(obj); + + if (k->nr_devs >= k->max_devs) { + error_report("Reached maximum number of devices: %u", k->max_devs); + return; + } + + o->ioc = NULL; + o->fd = -1; + o->devid = NULL; + + k->nr_devs++; + + o->machine_done.notify = remote_object_machine_done; + qemu_add_machine_init_done_notifier(&o->machine_done); +} + +static void remote_object_finalize(Object *obj) +{ + RemoteObjectClass *k = REMOTE_OBJECT_GET_CLASS(obj); + RemoteObject *o = REMOTE_OBJECT(obj); + + device_listener_unregister(&o->listener); + + if (o->ioc) { + qio_channel_shutdown(o->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); + qio_channel_close(o->ioc, NULL); + } + + object_unref(OBJECT(o->ioc)); + + k->nr_devs--; + g_free(o->devid); +} + +static void remote_object_class_init(ObjectClass *klass, void *data) +{ + RemoteObjectClass *k = REMOTE_OBJECT_CLASS(klass); + + /* + * Limit number of supported devices to 1. This is done to avoid devices + * from one VM accessing the RAM of another VM. This is done until we + * start using separate address spaces for individual devices. + */ + k->max_devs = 1; + k->nr_devs = 0; + + object_class_property_add_str(klass, "fd", NULL, remote_object_set_fd); + object_class_property_add_str(klass, "devid", NULL, + remote_object_set_devid); +} + +static const TypeInfo remote_object_info = { + .name = TYPE_REMOTE_OBJECT, + .parent = TYPE_OBJECT, + .instance_size = sizeof(RemoteObject), + .instance_init = remote_object_init, + .instance_finalize = remote_object_finalize, + .class_size = sizeof(RemoteObjectClass), + .class_init = remote_object_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } +}; + +static void register_types(void) +{ + type_register_static(&remote_object_info); +} + +type_init(register_types); diff --git a/hw/remote/trace-events b/hw/remote/trace-events new file mode 100644 index 00000000..c167b3c7 --- /dev/null +++ b/hw/remote/trace-events @@ -0,0 +1,15 @@ +# multi-process trace events + +mpqemu_send_io_error(int cmd, int size, int nfds) "send command %d size %d, %d file descriptors to remote process" +mpqemu_recv_io_error(int cmd, int size, int nfds) "failed to receive %d size %d, %d file descriptors to remote process" + +# vfio-user-obj.c +vfu_prop(const char *prop, const char *val) "vfu: setting %s as %s" +vfu_cfg_read(uint32_t offset, uint32_t val) "vfu: cfg: 0x%u -> 0x%x" +vfu_cfg_write(uint32_t offset, uint32_t val) "vfu: cfg: 0x%u <- 0x%x" +vfu_dma_register(uint64_t gpa, size_t len) "vfu: registering GPA 0x%"PRIx64", %zu bytes" +vfu_dma_unregister(uint64_t gpa) "vfu: unregistering GPA 0x%"PRIx64"" +vfu_bar_register(int i, uint64_t addr, uint64_t size) "vfu: BAR %d: addr 0x%"PRIx64" size 0x%"PRIx64"" +vfu_bar_rw_enter(const char *op, uint64_t addr) "vfu: %s request for BAR address 0x%"PRIx64"" +vfu_bar_rw_exit(const char *op, uint64_t addr) "vfu: Finished %s of BAR address 0x%"PRIx64"" +vfu_interrupt(int pirq) "vfu: sending interrupt to device - PIRQ %d" diff --git a/hw/remote/trace.h b/hw/remote/trace.h new file mode 100644 index 00000000..5d5e3ac7 --- /dev/null +++ b/hw/remote/trace.h @@ -0,0 +1 @@ +#include "trace/trace-hw_remote.h" diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c new file mode 100644 index 00000000..4e36bb8b --- /dev/null +++ b/hw/remote/vfio-user-obj.c @@ -0,0 +1,951 @@ +/** + * QEMU vfio-user-server server object + * + * Copyright © 2022 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL-v2, version 2 or later. + * + * See the COPYING file in the top-level directory. + * + */ + +/** + * Usage: add options: + * -machine x-remote,vfio-user=on,auto-shutdown=on + * -device <PCI-device>,id=<pci-dev-id> + * -object x-vfio-user-server,id=<id>,type=unix,path=<socket-path>, + * device=<pci-dev-id> + * + * Note that x-vfio-user-server object must be used with x-remote machine only. + * This server could only support PCI devices for now. + * + * type - SocketAddress type - presently "unix" alone is supported. Required + * option + * + * path - named unix socket, it will be created by the server. It is + * a required option + * + * device - id of a device on the server, a required option. PCI devices + * alone are supported presently. + * + * notes - x-vfio-user-server could block IO and monitor during the + * initialization phase. + */ + +#include "qemu/osdep.h" + +#include "qom/object.h" +#include "qom/object_interfaces.h" +#include "qemu/error-report.h" +#include "trace.h" +#include "sysemu/runstate.h" +#include "hw/boards.h" +#include "hw/remote/machine.h" +#include "qapi/error.h" +#include "qapi/qapi-visit-sockets.h" +#include "qapi/qapi-events-misc.h" +#include "qemu/notify.h" +#include "qemu/thread.h" +#include "qemu/main-loop.h" +#include "sysemu/sysemu.h" +#include "libvfio-user.h" +#include "hw/qdev-core.h" +#include "hw/pci/pci.h" +#include "qemu/timer.h" +#include "exec/memory.h" +#include "hw/pci/msi.h" +#include "hw/pci/msix.h" +#include "hw/remote/vfio-user-obj.h" + +#define TYPE_VFU_OBJECT "x-vfio-user-server" +OBJECT_DECLARE_TYPE(VfuObject, VfuObjectClass, VFU_OBJECT) + +/** + * VFU_OBJECT_ERROR - reports an error message. If auto_shutdown + * is set, it aborts the machine on error. Otherwise, it logs an + * error message without aborting. + */ +#define VFU_OBJECT_ERROR(o, fmt, ...) \ + { \ + if (vfu_object_auto_shutdown()) { \ + error_setg(&error_abort, (fmt), ## __VA_ARGS__); \ + } else { \ + error_report((fmt), ## __VA_ARGS__); \ + } \ + } \ + +struct VfuObjectClass { + ObjectClass parent_class; + + unsigned int nr_devs; +}; + +struct VfuObject { + /* private */ + Object parent; + + SocketAddress *socket; + + char *device; + + Error *err; + + Notifier machine_done; + + vfu_ctx_t *vfu_ctx; + + PCIDevice *pci_dev; + + Error *unplug_blocker; + + int vfu_poll_fd; + + MSITriggerFunc *default_msi_trigger; + MSIPrepareMessageFunc *default_msi_prepare_message; + MSIxPrepareMessageFunc *default_msix_prepare_message; +}; + +static void vfu_object_init_ctx(VfuObject *o, Error **errp); + +static bool vfu_object_auto_shutdown(void) +{ + bool auto_shutdown = true; + Error *local_err = NULL; + + if (!current_machine) { + return auto_shutdown; + } + + auto_shutdown = object_property_get_bool(OBJECT(current_machine), + "auto-shutdown", + &local_err); + + /* + * local_err would be set if no such property exists - safe to ignore. + * Unlikely scenario as auto-shutdown is always defined for + * TYPE_REMOTE_MACHINE, and TYPE_VFU_OBJECT only works with + * TYPE_REMOTE_MACHINE + */ + if (local_err) { + auto_shutdown = true; + error_free(local_err); + } + + return auto_shutdown; +} + +static void vfu_object_set_socket(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + VfuObject *o = VFU_OBJECT(obj); + + if (o->vfu_ctx) { + error_setg(errp, "vfu: Unable to set socket property - server busy"); + return; + } + + qapi_free_SocketAddress(o->socket); + + o->socket = NULL; + + visit_type_SocketAddress(v, name, &o->socket, errp); + + if (o->socket->type != SOCKET_ADDRESS_TYPE_UNIX) { + error_setg(errp, "vfu: Unsupported socket type - %s", + SocketAddressType_str(o->socket->type)); + qapi_free_SocketAddress(o->socket); + o->socket = NULL; + return; + } + + trace_vfu_prop("socket", o->socket->u.q_unix.path); + + vfu_object_init_ctx(o, errp); +} + +static void vfu_object_set_device(Object *obj, const char *str, Error **errp) +{ + VfuObject *o = VFU_OBJECT(obj); + + if (o->vfu_ctx) { + error_setg(errp, "vfu: Unable to set device property - server busy"); + return; + } + + g_free(o->device); + + o->device = g_strdup(str); + + trace_vfu_prop("device", str); + + vfu_object_init_ctx(o, errp); +} + +static void vfu_object_ctx_run(void *opaque) +{ + VfuObject *o = opaque; + const char *vfu_id; + char *vfu_path, *pci_dev_path; + int ret = -1; + + while (ret != 0) { + ret = vfu_run_ctx(o->vfu_ctx); + if (ret < 0) { + if (errno == EINTR) { + continue; + } else if (errno == ENOTCONN) { + vfu_id = object_get_canonical_path_component(OBJECT(o)); + vfu_path = object_get_canonical_path(OBJECT(o)); + g_assert(o->pci_dev); + pci_dev_path = object_get_canonical_path(OBJECT(o->pci_dev)); + /* o->device is a required property and is non-NULL here */ + g_assert(o->device); + qapi_event_send_vfu_client_hangup(vfu_id, vfu_path, + o->device, pci_dev_path); + qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL); + o->vfu_poll_fd = -1; + object_unparent(OBJECT(o)); + g_free(vfu_path); + g_free(pci_dev_path); + break; + } else { + VFU_OBJECT_ERROR(o, "vfu: Failed to run device %s - %s", + o->device, strerror(errno)); + break; + } + } + } +} + +static void vfu_object_attach_ctx(void *opaque) +{ + VfuObject *o = opaque; + GPollFD pfds[1]; + int ret; + + qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL); + + pfds[0].fd = o->vfu_poll_fd; + pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR; + +retry_attach: + ret = vfu_attach_ctx(o->vfu_ctx); + if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) { + /** + * vfu_object_attach_ctx can block QEMU's main loop + * during attach - the monitor and other IO + * could be unresponsive during this time. + */ + (void)qemu_poll_ns(pfds, 1, 500 * (int64_t)SCALE_MS); + goto retry_attach; + } else if (ret < 0) { + VFU_OBJECT_ERROR(o, "vfu: Failed to attach device %s to context - %s", + o->device, strerror(errno)); + return; + } + + o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx); + if (o->vfu_poll_fd < 0) { + VFU_OBJECT_ERROR(o, "vfu: Failed to get poll fd %s", o->device); + return; + } + + qemu_set_fd_handler(o->vfu_poll_fd, vfu_object_ctx_run, NULL, o); +} + +static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf, + size_t count, loff_t offset, + const bool is_write) +{ + VfuObject *o = vfu_get_private(vfu_ctx); + uint32_t pci_access_width = sizeof(uint32_t); + size_t bytes = count; + uint32_t val = 0; + char *ptr = buf; + int len; + + /* + * Writes to the BAR registers would trigger an update to the + * global Memory and IO AddressSpaces. But the remote device + * never uses the global AddressSpaces, therefore overlapping + * memory regions are not a problem + */ + while (bytes > 0) { + len = (bytes > pci_access_width) ? pci_access_width : bytes; + if (is_write) { + memcpy(&val, ptr, len); + pci_host_config_write_common(o->pci_dev, offset, + pci_config_size(o->pci_dev), + val, len); + trace_vfu_cfg_write(offset, val); + } else { + val = pci_host_config_read_common(o->pci_dev, offset, + pci_config_size(o->pci_dev), len); + memcpy(ptr, &val, len); + trace_vfu_cfg_read(offset, val); + } + offset += len; + ptr += len; + bytes -= len; + } + + return count; +} + +static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) +{ + VfuObject *o = vfu_get_private(vfu_ctx); + AddressSpace *dma_as = NULL; + MemoryRegion *subregion = NULL; + g_autofree char *name = NULL; + struct iovec *iov = &info->iova; + + if (!info->vaddr) { + return; + } + + name = g_strdup_printf("mem-%s-%"PRIx64"", o->device, + (uint64_t)info->vaddr); + + subregion = g_new0(MemoryRegion, 1); + + memory_region_init_ram_ptr(subregion, NULL, name, + iov->iov_len, info->vaddr); + + dma_as = pci_device_iommu_address_space(o->pci_dev); + + memory_region_add_subregion(dma_as->root, (hwaddr)iov->iov_base, subregion); + + trace_vfu_dma_register((uint64_t)iov->iov_base, iov->iov_len); +} + +static void dma_unregister(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info) +{ + VfuObject *o = vfu_get_private(vfu_ctx); + AddressSpace *dma_as = NULL; + MemoryRegion *mr = NULL; + ram_addr_t offset; + + mr = memory_region_from_host(info->vaddr, &offset); + if (!mr) { + return; + } + + dma_as = pci_device_iommu_address_space(o->pci_dev); + + memory_region_del_subregion(dma_as->root, mr); + + object_unparent((OBJECT(mr))); + + trace_vfu_dma_unregister((uint64_t)info->iova.iov_base); +} + +static int vfu_object_mr_rw(MemoryRegion *mr, uint8_t *buf, hwaddr offset, + hwaddr size, const bool is_write) +{ + uint8_t *ptr = buf; + bool release_lock = false; + uint8_t *ram_ptr = NULL; + MemTxResult result; + int access_size; + uint64_t val; + + if (memory_access_is_direct(mr, is_write)) { + /** + * Some devices expose a PCI expansion ROM, which could be buffer + * based as compared to other regions which are primarily based on + * MemoryRegionOps. memory_region_find() would already check + * for buffer overflow, we don't need to repeat it here. + */ + ram_ptr = memory_region_get_ram_ptr(mr); + + if (is_write) { + memcpy((ram_ptr + offset), buf, size); + } else { + memcpy(buf, (ram_ptr + offset), size); + } + + return 0; + } + + while (size) { + /** + * The read/write logic used below is similar to the ones in + * flatview_read/write_continue() + */ + release_lock = prepare_mmio_access(mr); + + access_size = memory_access_size(mr, size, offset); + + if (is_write) { + val = ldn_he_p(ptr, access_size); + + result = memory_region_dispatch_write(mr, offset, val, + size_memop(access_size), + MEMTXATTRS_UNSPECIFIED); + } else { + result = memory_region_dispatch_read(mr, offset, &val, + size_memop(access_size), + MEMTXATTRS_UNSPECIFIED); + + stn_he_p(ptr, access_size, val); + } + + if (release_lock) { + qemu_mutex_unlock_iothread(); + release_lock = false; + } + + if (result != MEMTX_OK) { + return -1; + } + + size -= access_size; + ptr += access_size; + offset += access_size; + } + + return 0; +} + +static size_t vfu_object_bar_rw(PCIDevice *pci_dev, int pci_bar, + hwaddr bar_offset, char * const buf, + hwaddr len, const bool is_write) +{ + MemoryRegionSection section = { 0 }; + uint8_t *ptr = (uint8_t *)buf; + MemoryRegion *section_mr = NULL; + uint64_t section_size; + hwaddr section_offset; + hwaddr size = 0; + + while (len) { + section = memory_region_find(pci_dev->io_regions[pci_bar].memory, + bar_offset, len); + + if (!section.mr) { + warn_report("vfu: invalid address 0x%"PRIx64"", bar_offset); + return size; + } + + section_mr = section.mr; + section_offset = section.offset_within_region; + section_size = int128_get64(section.size); + + if (is_write && section_mr->readonly) { + warn_report("vfu: attempting to write to readonly region in " + "bar %d - [0x%"PRIx64" - 0x%"PRIx64"]", + pci_bar, bar_offset, + (bar_offset + section_size)); + memory_region_unref(section_mr); + return size; + } + + if (vfu_object_mr_rw(section_mr, ptr, section_offset, + section_size, is_write)) { + warn_report("vfu: failed to %s " + "[0x%"PRIx64" - 0x%"PRIx64"] in bar %d", + is_write ? "write to" : "read from", bar_offset, + (bar_offset + section_size), pci_bar); + memory_region_unref(section_mr); + return size; + } + + size += section_size; + bar_offset += section_size; + ptr += section_size; + len -= section_size; + + memory_region_unref(section_mr); + } + + return size; +} + +/** + * VFU_OBJECT_BAR_HANDLER - macro for defining handlers for PCI BARs. + * + * To create handler for BAR number 2, VFU_OBJECT_BAR_HANDLER(2) would + * define vfu_object_bar2_handler + */ +#define VFU_OBJECT_BAR_HANDLER(BAR_NO) \ + static ssize_t vfu_object_bar##BAR_NO##_handler(vfu_ctx_t *vfu_ctx, \ + char * const buf, size_t count, \ + loff_t offset, const bool is_write) \ + { \ + VfuObject *o = vfu_get_private(vfu_ctx); \ + PCIDevice *pci_dev = o->pci_dev; \ + \ + return vfu_object_bar_rw(pci_dev, BAR_NO, offset, \ + buf, count, is_write); \ + } \ + +VFU_OBJECT_BAR_HANDLER(0) +VFU_OBJECT_BAR_HANDLER(1) +VFU_OBJECT_BAR_HANDLER(2) +VFU_OBJECT_BAR_HANDLER(3) +VFU_OBJECT_BAR_HANDLER(4) +VFU_OBJECT_BAR_HANDLER(5) +VFU_OBJECT_BAR_HANDLER(6) + +static vfu_region_access_cb_t *vfu_object_bar_handlers[PCI_NUM_REGIONS] = { + &vfu_object_bar0_handler, + &vfu_object_bar1_handler, + &vfu_object_bar2_handler, + &vfu_object_bar3_handler, + &vfu_object_bar4_handler, + &vfu_object_bar5_handler, + &vfu_object_bar6_handler, +}; + +/** + * vfu_object_register_bars - Identify active BAR regions of pdev and setup + * callbacks to handle read/write accesses + */ +static void vfu_object_register_bars(vfu_ctx_t *vfu_ctx, PCIDevice *pdev) +{ + int flags = VFU_REGION_FLAG_RW; + int i; + + for (i = 0; i < PCI_NUM_REGIONS; i++) { + if (!pdev->io_regions[i].size) { + continue; + } + + if ((i == VFU_PCI_DEV_ROM_REGION_IDX) || + pdev->io_regions[i].memory->readonly) { + flags &= ~VFU_REGION_FLAG_WRITE; + } + + vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX + i, + (size_t)pdev->io_regions[i].size, + vfu_object_bar_handlers[i], + flags, NULL, 0, -1, 0); + + trace_vfu_bar_register(i, pdev->io_regions[i].addr, + pdev->io_regions[i].size); + } +} + +static int vfu_object_map_irq(PCIDevice *pci_dev, int intx) +{ + int pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)), + pci_dev->devfn); + + return pci_bdf; +} + +static void vfu_object_set_irq(void *opaque, int pirq, int level) +{ + PCIBus *pci_bus = opaque; + PCIDevice *pci_dev = NULL; + vfu_ctx_t *vfu_ctx = NULL; + int pci_bus_num, devfn; + + if (level) { + pci_bus_num = PCI_BUS_NUM(pirq); + devfn = PCI_BDF_TO_DEVFN(pirq); + + /* + * pci_find_device() performs at O(1) if the device is attached + * to the root PCI bus. Whereas, if the device is attached to a + * secondary PCI bus (such as when a root port is involved), + * finding the parent PCI bus could take O(n) + */ + pci_dev = pci_find_device(pci_bus, pci_bus_num, devfn); + + vfu_ctx = pci_dev->irq_opaque; + + g_assert(vfu_ctx); + + vfu_irq_trigger(vfu_ctx, 0); + } +} + +static MSIMessage vfu_object_msi_prepare_msg(PCIDevice *pci_dev, + unsigned int vector) +{ + MSIMessage msg; + + msg.address = 0; + msg.data = vector; + + return msg; +} + +static void vfu_object_msi_trigger(PCIDevice *pci_dev, MSIMessage msg) +{ + vfu_ctx_t *vfu_ctx = pci_dev->irq_opaque; + + vfu_irq_trigger(vfu_ctx, msg.data); +} + +static void vfu_object_setup_msi_cbs(VfuObject *o) +{ + o->default_msi_trigger = o->pci_dev->msi_trigger; + o->default_msi_prepare_message = o->pci_dev->msi_prepare_message; + o->default_msix_prepare_message = o->pci_dev->msix_prepare_message; + + o->pci_dev->msi_trigger = vfu_object_msi_trigger; + o->pci_dev->msi_prepare_message = vfu_object_msi_prepare_msg; + o->pci_dev->msix_prepare_message = vfu_object_msi_prepare_msg; +} + +static void vfu_object_restore_msi_cbs(VfuObject *o) +{ + o->pci_dev->msi_trigger = o->default_msi_trigger; + o->pci_dev->msi_prepare_message = o->default_msi_prepare_message; + o->pci_dev->msix_prepare_message = o->default_msix_prepare_message; +} + +static void vfu_msix_irq_state(vfu_ctx_t *vfu_ctx, uint32_t start, + uint32_t count, bool mask) +{ + VfuObject *o = vfu_get_private(vfu_ctx); + uint32_t vector; + + for (vector = start; vector < count; vector++) { + msix_set_mask(o->pci_dev, vector, mask); + } +} + +static void vfu_msi_irq_state(vfu_ctx_t *vfu_ctx, uint32_t start, + uint32_t count, bool mask) +{ + VfuObject *o = vfu_get_private(vfu_ctx); + Error *err = NULL; + uint32_t vector; + + for (vector = start; vector < count; vector++) { + msi_set_mask(o->pci_dev, vector, mask, &err); + if (err) { + VFU_OBJECT_ERROR(o, "vfu: %s: %s", o->device, + error_get_pretty(err)); + error_free(err); + err = NULL; + } + } +} + +static int vfu_object_setup_irqs(VfuObject *o, PCIDevice *pci_dev) +{ + vfu_ctx_t *vfu_ctx = o->vfu_ctx; + int ret; + + ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1); + if (ret < 0) { + return ret; + } + + if (msix_nr_vectors_allocated(pci_dev)) { + ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ, + msix_nr_vectors_allocated(pci_dev)); + vfu_setup_irq_state_callback(vfu_ctx, VFU_DEV_MSIX_IRQ, + &vfu_msix_irq_state); + } else if (msi_nr_vectors_allocated(pci_dev)) { + ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSI_IRQ, + msi_nr_vectors_allocated(pci_dev)); + vfu_setup_irq_state_callback(vfu_ctx, VFU_DEV_MSI_IRQ, + &vfu_msi_irq_state); + } + + if (ret < 0) { + return ret; + } + + vfu_object_setup_msi_cbs(o); + + pci_dev->irq_opaque = vfu_ctx; + + return 0; +} + +void vfu_object_set_bus_irq(PCIBus *pci_bus) +{ + int bus_num = pci_bus_num(pci_bus); + int max_bdf = PCI_BUILD_BDF(bus_num, PCI_DEVFN_MAX - 1); + + pci_bus_irqs(pci_bus, vfu_object_set_irq, vfu_object_map_irq, pci_bus, + max_bdf); +} + +static int vfu_object_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type) +{ + VfuObject *o = vfu_get_private(vfu_ctx); + + /* vfu_object_ctx_run() handles lost connection */ + if (type == VFU_RESET_LOST_CONN) { + return 0; + } + + qdev_reset_all(DEVICE(o->pci_dev)); + + return 0; +} + +/* + * TYPE_VFU_OBJECT depends on the availability of the 'socket' and 'device' + * properties. It also depends on devices instantiated in QEMU. These + * dependencies are not available during the instance_init phase of this + * object's life-cycle. As such, the server is initialized after the + * machine is setup. machine_init_done_notifier notifies TYPE_VFU_OBJECT + * when the machine is setup, and the dependencies are available. + */ +static void vfu_object_machine_done(Notifier *notifier, void *data) +{ + VfuObject *o = container_of(notifier, VfuObject, machine_done); + Error *err = NULL; + + vfu_object_init_ctx(o, &err); + + if (err) { + error_propagate(&error_abort, err); + } +} + +/** + * vfu_object_init_ctx: Create and initialize libvfio-user context. Add + * an unplug blocker for the associated PCI device. Setup a FD handler + * to process incoming messages in the context's socket. + * + * The socket and device properties are mandatory, and this function + * will not create the context without them - the setters for these + * properties should call this function when the property is set. The + * machine should also be ready when this function is invoked - it is + * because QEMU objects are initialized before devices, and the + * associated PCI device wouldn't be available at the object + * initialization time. Until these conditions are satisfied, this + * function would return early without performing any task. + */ +static void vfu_object_init_ctx(VfuObject *o, Error **errp) +{ + ERRP_GUARD(); + DeviceState *dev = NULL; + vfu_pci_type_t pci_type = VFU_PCI_TYPE_CONVENTIONAL; + int ret; + + if (o->vfu_ctx || !o->socket || !o->device || + !phase_check(PHASE_MACHINE_READY)) { + return; + } + + if (o->err) { + error_propagate(errp, o->err); + o->err = NULL; + return; + } + + o->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, o->socket->u.q_unix.path, + LIBVFIO_USER_FLAG_ATTACH_NB, + o, VFU_DEV_TYPE_PCI); + if (o->vfu_ctx == NULL) { + error_setg(errp, "vfu: Failed to create context - %s", strerror(errno)); + return; + } + + dev = qdev_find_recursive(sysbus_get_default(), o->device); + if (dev == NULL) { + error_setg(errp, "vfu: Device %s not found", o->device); + goto fail; + } + + if (!object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { + error_setg(errp, "vfu: %s not a PCI device", o->device); + goto fail; + } + + o->pci_dev = PCI_DEVICE(dev); + + object_ref(OBJECT(o->pci_dev)); + + if (pci_is_express(o->pci_dev)) { + pci_type = VFU_PCI_TYPE_EXPRESS; + } + + ret = vfu_pci_init(o->vfu_ctx, pci_type, PCI_HEADER_TYPE_NORMAL, 0); + if (ret < 0) { + error_setg(errp, + "vfu: Failed to attach PCI device %s to context - %s", + o->device, strerror(errno)); + goto fail; + } + + error_setg(&o->unplug_blocker, + "vfu: %s for %s must be deleted before unplugging", + TYPE_VFU_OBJECT, o->device); + qdev_add_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker); + + ret = vfu_setup_region(o->vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX, + pci_config_size(o->pci_dev), &vfu_object_cfg_access, + VFU_REGION_FLAG_RW | VFU_REGION_FLAG_ALWAYS_CB, + NULL, 0, -1, 0); + if (ret < 0) { + error_setg(errp, + "vfu: Failed to setup config space handlers for %s- %s", + o->device, strerror(errno)); + goto fail; + } + + ret = vfu_setup_device_dma(o->vfu_ctx, &dma_register, &dma_unregister); + if (ret < 0) { + error_setg(errp, "vfu: Failed to setup DMA handlers for %s", + o->device); + goto fail; + } + + vfu_object_register_bars(o->vfu_ctx, o->pci_dev); + + ret = vfu_object_setup_irqs(o, o->pci_dev); + if (ret < 0) { + error_setg(errp, "vfu: Failed to setup interrupts for %s", + o->device); + goto fail; + } + + ret = vfu_setup_device_reset_cb(o->vfu_ctx, &vfu_object_device_reset); + if (ret < 0) { + error_setg(errp, "vfu: Failed to setup reset callback"); + goto fail; + } + + ret = vfu_realize_ctx(o->vfu_ctx); + if (ret < 0) { + error_setg(errp, "vfu: Failed to realize device %s- %s", + o->device, strerror(errno)); + goto fail; + } + + o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx); + if (o->vfu_poll_fd < 0) { + error_setg(errp, "vfu: Failed to get poll fd %s", o->device); + goto fail; + } + + qemu_set_fd_handler(o->vfu_poll_fd, vfu_object_attach_ctx, NULL, o); + + return; + +fail: + vfu_destroy_ctx(o->vfu_ctx); + if (o->unplug_blocker && o->pci_dev) { + qdev_del_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker); + error_free(o->unplug_blocker); + o->unplug_blocker = NULL; + } + if (o->pci_dev) { + vfu_object_restore_msi_cbs(o); + o->pci_dev->irq_opaque = NULL; + object_unref(OBJECT(o->pci_dev)); + o->pci_dev = NULL; + } + o->vfu_ctx = NULL; +} + +static void vfu_object_init(Object *obj) +{ + VfuObjectClass *k = VFU_OBJECT_GET_CLASS(obj); + VfuObject *o = VFU_OBJECT(obj); + + k->nr_devs++; + + if (!object_dynamic_cast(OBJECT(current_machine), TYPE_REMOTE_MACHINE)) { + error_setg(&o->err, "vfu: %s only compatible with %s machine", + TYPE_VFU_OBJECT, TYPE_REMOTE_MACHINE); + return; + } + + if (!phase_check(PHASE_MACHINE_READY)) { + o->machine_done.notify = vfu_object_machine_done; + qemu_add_machine_init_done_notifier(&o->machine_done); + } + + o->vfu_poll_fd = -1; +} + +static void vfu_object_finalize(Object *obj) +{ + VfuObjectClass *k = VFU_OBJECT_GET_CLASS(obj); + VfuObject *o = VFU_OBJECT(obj); + + k->nr_devs--; + + qapi_free_SocketAddress(o->socket); + + o->socket = NULL; + + if (o->vfu_poll_fd != -1) { + qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL); + o->vfu_poll_fd = -1; + } + + if (o->vfu_ctx) { + vfu_destroy_ctx(o->vfu_ctx); + o->vfu_ctx = NULL; + } + + g_free(o->device); + + o->device = NULL; + + if (o->unplug_blocker && o->pci_dev) { + qdev_del_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker); + error_free(o->unplug_blocker); + o->unplug_blocker = NULL; + } + + if (o->pci_dev) { + vfu_object_restore_msi_cbs(o); + o->pci_dev->irq_opaque = NULL; + object_unref(OBJECT(o->pci_dev)); + o->pci_dev = NULL; + } + + if (!k->nr_devs && vfu_object_auto_shutdown()) { + qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); + } + + if (o->machine_done.notify) { + qemu_remove_machine_init_done_notifier(&o->machine_done); + o->machine_done.notify = NULL; + } +} + +static void vfu_object_class_init(ObjectClass *klass, void *data) +{ + VfuObjectClass *k = VFU_OBJECT_CLASS(klass); + + k->nr_devs = 0; + + object_class_property_add(klass, "socket", "SocketAddress", NULL, + vfu_object_set_socket, NULL, NULL); + object_class_property_set_description(klass, "socket", + "SocketAddress " + "(ex: type=unix,path=/tmp/sock). " + "Only UNIX is presently supported"); + object_class_property_add_str(klass, "device", NULL, + vfu_object_set_device); + object_class_property_set_description(klass, "device", + "device ID - only PCI devices " + "are presently supported"); +} + +static const TypeInfo vfu_object_info = { + .name = TYPE_VFU_OBJECT, + .parent = TYPE_OBJECT, + .instance_size = sizeof(VfuObject), + .instance_init = vfu_object_init, + .instance_finalize = vfu_object_finalize, + .class_size = sizeof(VfuObjectClass), + .class_init = vfu_object_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } +}; + +static void vfu_register_types(void) +{ + type_register_static(&vfu_object_info); +} + +type_init(vfu_register_types); |