diff options
Diffstat (limited to 'hw')
-rw-r--r-- | hw/Kconfig | 1 | ||||
-rw-r--r-- | hw/meson.build | 1 | ||||
-rw-r--r-- | hw/misc/ivshmem.c | 3 | ||||
-rw-r--r-- | hw/pci-host/Kconfig | 3 | ||||
-rw-r--r-- | hw/pci-host/meson.build | 1 | ||||
-rw-r--r-- | hw/pci-host/remote.c | 75 | ||||
-rw-r--r-- | hw/remote/Kconfig | 4 | ||||
-rw-r--r-- | hw/remote/iohub.c | 119 | ||||
-rw-r--r-- | hw/remote/machine.c | 80 | ||||
-rw-r--r-- | hw/remote/memory.c | 65 | ||||
-rw-r--r-- | hw/remote/meson.build | 13 | ||||
-rw-r--r-- | hw/remote/message.c | 230 | ||||
-rw-r--r-- | hw/remote/mpqemu-link.c | 267 | ||||
-rw-r--r-- | hw/remote/proxy-memory-listener.c | 227 | ||||
-rw-r--r-- | hw/remote/proxy.c | 379 | ||||
-rw-r--r-- | hw/remote/remote-obj.c | 203 | ||||
-rw-r--r-- | hw/remote/trace-events | 4 | ||||
-rw-r--r-- | hw/remote/trace.h | 1 |
18 files changed, 1675 insertions, 1 deletions
diff --git a/hw/Kconfig b/hw/Kconfig index d4cec9e476..8ea26479c4 100644 --- a/hw/Kconfig +++ b/hw/Kconfig @@ -27,6 +27,7 @@ source pci-host/Kconfig source pcmcia/Kconfig source pci/Kconfig source rdma/Kconfig +source remote/Kconfig source rtc/Kconfig source scsi/Kconfig source sd/Kconfig diff --git a/hw/meson.build b/hw/meson.build index 010de7219c..e615d72d4d 100644 --- a/hw/meson.build +++ b/hw/meson.build @@ -56,6 +56,7 @@ subdir('moxie') subdir('nios2') subdir('openrisc') subdir('ppc') +subdir('remote') subdir('riscv') subdir('rx') subdir('s390x') diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c index 0505b52c98..603e992a7f 100644 --- a/hw/misc/ivshmem.c +++ b/hw/misc/ivshmem.c @@ -495,7 +495,8 @@ static void process_msg_shmem(IVShmemState *s, int fd, Error **errp) /* mmap the region and map into the BAR2 */ memory_region_init_ram_from_fd(&s->server_bar2, OBJECT(s), - "ivshmem.bar2", size, true, fd, &local_err); + "ivshmem.bar2", size, true, fd, 0, + &local_err); if (local_err) { error_propagate(errp, local_err); return; diff --git a/hw/pci-host/Kconfig b/hw/pci-host/Kconfig index eb03f0489d..8b8c763c28 100644 --- a/hw/pci-host/Kconfig +++ b/hw/pci-host/Kconfig @@ -65,3 +65,6 @@ config PCI_POWERNV select PCI_EXPRESS select MSI_NONBROKEN select PCIE_PORT + +config REMOTE_PCIHOST + bool diff --git a/hw/pci-host/meson.build b/hw/pci-host/meson.build index da9d1a9964..1847c69905 100644 --- a/hw/pci-host/meson.build +++ b/hw/pci-host/meson.build @@ -9,6 +9,7 @@ pci_ss.add(when: 'CONFIG_PCI_EXPRESS_XILINX', if_true: files('xilinx-pcie.c')) pci_ss.add(when: 'CONFIG_PCI_I440FX', if_true: files('i440fx.c')) pci_ss.add(when: 'CONFIG_PCI_SABRE', if_true: files('sabre.c')) pci_ss.add(when: 'CONFIG_XEN_IGD_PASSTHROUGH', if_true: files('xen_igd_pt.c')) +pci_ss.add(when: 'CONFIG_REMOTE_PCIHOST', if_true: files('remote.c')) # PPC devices pci_ss.add(when: 'CONFIG_PREP_PCI', if_true: files('prep.c')) diff --git a/hw/pci-host/remote.c b/hw/pci-host/remote.c new file mode 100644 index 0000000000..eee45444ef --- /dev/null +++ b/hw/pci-host/remote.c @@ -0,0 +1,75 @@ +/* + * Remote PCI host device + * + * Unlike PCI host devices that model physical hardware, the purpose + * of this PCI host is to host multi-process QEMU devices. + * + * Multi-process QEMU extends the PCI host of a QEMU machine into a + * remote process. Any PCI device attached to the remote process is + * visible in the QEMU guest. This allows existing QEMU device models + * to be reused in the remote process. + * + * This PCI host is purely a container for PCI devices. It's fake in the + * sense that the guest never sees this PCI host and has no way of + * accessing it. Its job is just to provide the environment that QEMU + * PCI device models need when running in a remote process. + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" + +#include "hw/pci/pci.h" +#include "hw/pci/pci_host.h" +#include "hw/pci/pcie_host.h" +#include "hw/qdev-properties.h" +#include "hw/pci-host/remote.h" +#include "exec/memory.h" + +static const char *remote_pcihost_root_bus_path(PCIHostState *host_bridge, + PCIBus *rootbus) +{ + return "0000:00"; +} + +static void remote_pcihost_realize(DeviceState *dev, Error **errp) +{ + PCIHostState *pci = PCI_HOST_BRIDGE(dev); + RemotePCIHost *s = REMOTE_PCIHOST(dev); + + pci->bus = pci_root_bus_new(DEVICE(s), "remote-pci", + s->mr_pci_mem, s->mr_sys_io, + 0, TYPE_PCIE_BUS); +} + +static void remote_pcihost_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIHostBridgeClass *hc = PCI_HOST_BRIDGE_CLASS(klass); + + hc->root_bus_path = remote_pcihost_root_bus_path; + dc->realize = remote_pcihost_realize; + + dc->user_creatable = false; + set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories); + dc->fw_name = "pci"; +} + +static const TypeInfo remote_pcihost_info = { + .name = TYPE_REMOTE_PCIHOST, + .parent = TYPE_PCIE_HOST_BRIDGE, + .instance_size = sizeof(RemotePCIHost), + .class_init = remote_pcihost_class_init, +}; + +static void remote_pcihost_register(void) +{ + type_register_static(&remote_pcihost_info); +} + +type_init(remote_pcihost_register) diff --git a/hw/remote/Kconfig b/hw/remote/Kconfig new file mode 100644 index 0000000000..08c16e235f --- /dev/null +++ b/hw/remote/Kconfig @@ -0,0 +1,4 @@ +config MULTIPROCESS + bool + depends on PCI && PCI_EXPRESS && KVM + select REMOTE_PCIHOST diff --git a/hw/remote/iohub.c b/hw/remote/iohub.c new file mode 100644 index 0000000000..e4ff131a6b --- /dev/null +++ b/hw/remote/iohub.c @@ -0,0 +1,119 @@ +/* + * Remote IO Hub + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" + +#include "hw/pci/pci.h" +#include "hw/pci/pci_ids.h" +#include "hw/pci/pci_bus.h" +#include "qemu/thread.h" +#include "hw/boards.h" +#include "hw/remote/machine.h" +#include "hw/remote/iohub.h" +#include "qemu/main-loop.h" + +void remote_iohub_init(RemoteIOHubState *iohub) +{ + int pirq; + + memset(&iohub->irqfds, 0, sizeof(iohub->irqfds)); + memset(&iohub->resamplefds, 0, sizeof(iohub->resamplefds)); + + for (pirq = 0; pirq < REMOTE_IOHUB_NB_PIRQS; pirq++) { + qemu_mutex_init(&iohub->irq_level_lock[pirq]); + iohub->irq_level[pirq] = 0; + event_notifier_init_fd(&iohub->irqfds[pirq], -1); + event_notifier_init_fd(&iohub->resamplefds[pirq], -1); + } +} + +void remote_iohub_finalize(RemoteIOHubState *iohub) +{ + int pirq; + + for (pirq = 0; pirq < REMOTE_IOHUB_NB_PIRQS; pirq++) { + qemu_set_fd_handler(event_notifier_get_fd(&iohub->resamplefds[pirq]), + NULL, NULL, NULL); + event_notifier_cleanup(&iohub->irqfds[pirq]); + event_notifier_cleanup(&iohub->resamplefds[pirq]); + qemu_mutex_destroy(&iohub->irq_level_lock[pirq]); + } +} + +int remote_iohub_map_irq(PCIDevice *pci_dev, int intx) +{ + return pci_dev->devfn; +} + +void remote_iohub_set_irq(void *opaque, int pirq, int level) +{ + RemoteIOHubState *iohub = opaque; + + assert(pirq >= 0); + assert(pirq < PCI_DEVFN_MAX); + + QEMU_LOCK_GUARD(&iohub->irq_level_lock[pirq]); + + if (level) { + if (++iohub->irq_level[pirq] == 1) { + event_notifier_set(&iohub->irqfds[pirq]); + } + } else if (iohub->irq_level[pirq] > 0) { + iohub->irq_level[pirq]--; + } +} + +static void intr_resample_handler(void *opaque) +{ + ResampleToken *token = opaque; + RemoteIOHubState *iohub = token->iohub; + int pirq, s; + + pirq = token->pirq; + + s = event_notifier_test_and_clear(&iohub->resamplefds[pirq]); + + assert(s >= 0); + + QEMU_LOCK_GUARD(&iohub->irq_level_lock[pirq]); + + if (iohub->irq_level[pirq]) { + event_notifier_set(&iohub->irqfds[pirq]); + } +} + +void process_set_irqfd_msg(PCIDevice *pci_dev, MPQemuMsg *msg) +{ + RemoteMachineState *machine = REMOTE_MACHINE(current_machine); + RemoteIOHubState *iohub = &machine->iohub; + int pirq, intx; + + intx = pci_get_byte(pci_dev->config + PCI_INTERRUPT_PIN) - 1; + + pirq = remote_iohub_map_irq(pci_dev, intx); + + if (event_notifier_get_fd(&iohub->irqfds[pirq]) != -1) { + qemu_set_fd_handler(event_notifier_get_fd(&iohub->resamplefds[pirq]), + NULL, NULL, NULL); + event_notifier_cleanup(&iohub->irqfds[pirq]); + event_notifier_cleanup(&iohub->resamplefds[pirq]); + memset(&iohub->token[pirq], 0, sizeof(ResampleToken)); + } + + event_notifier_init_fd(&iohub->irqfds[pirq], msg->fds[0]); + event_notifier_init_fd(&iohub->resamplefds[pirq], msg->fds[1]); + + iohub->token[pirq].iohub = iohub; + iohub->token[pirq].pirq = pirq; + + qemu_set_fd_handler(msg->fds[1], intr_resample_handler, NULL, + &iohub->token[pirq]); +} diff --git a/hw/remote/machine.c b/hw/remote/machine.c new file mode 100644 index 0000000000..c0ab4f528a --- /dev/null +++ b/hw/remote/machine.c @@ -0,0 +1,80 @@ +/* + * Machine for remote device + * + * This machine type is used by the remote device process in multi-process + * QEMU. QEMU device models depend on parent busses, interrupt controllers, + * memory regions, etc. The remote machine type offers this environment so + * that QEMU device models can be used as remote devices. + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" + +#include "hw/remote/machine.h" +#include "exec/address-spaces.h" +#include "exec/memory.h" +#include "qapi/error.h" +#include "hw/pci/pci_host.h" +#include "hw/remote/iohub.h" + +static void remote_machine_init(MachineState *machine) +{ + MemoryRegion *system_memory, *system_io, *pci_memory; + RemoteMachineState *s = REMOTE_MACHINE(machine); + RemotePCIHost *rem_host; + PCIHostState *pci_host; + + system_memory = get_system_memory(); + system_io = get_system_io(); + + pci_memory = g_new(MemoryRegion, 1); + memory_region_init(pci_memory, NULL, "pci", UINT64_MAX); + + rem_host = REMOTE_PCIHOST(qdev_new(TYPE_REMOTE_PCIHOST)); + + rem_host->mr_pci_mem = pci_memory; + rem_host->mr_sys_mem = system_memory; + rem_host->mr_sys_io = system_io; + + s->host = rem_host; + + object_property_add_child(OBJECT(s), "remote-pcihost", OBJECT(rem_host)); + memory_region_add_subregion_overlap(system_memory, 0x0, pci_memory, -1); + + qdev_realize(DEVICE(rem_host), sysbus_get_default(), &error_fatal); + + pci_host = PCI_HOST_BRIDGE(rem_host); + + remote_iohub_init(&s->iohub); + + pci_bus_irqs(pci_host->bus, remote_iohub_set_irq, remote_iohub_map_irq, + &s->iohub, REMOTE_IOHUB_NB_PIRQS); +} + +static void remote_machine_class_init(ObjectClass *oc, void *data) +{ + MachineClass *mc = MACHINE_CLASS(oc); + + mc->init = remote_machine_init; + mc->desc = "Experimental remote machine"; +} + +static const TypeInfo remote_machine = { + .name = TYPE_REMOTE_MACHINE, + .parent = TYPE_MACHINE, + .instance_size = sizeof(RemoteMachineState), + .class_init = remote_machine_class_init, +}; + +static void remote_machine_register_types(void) +{ + type_register_static(&remote_machine); +} + +type_init(remote_machine_register_types); diff --git a/hw/remote/memory.c b/hw/remote/memory.c new file mode 100644 index 0000000000..32085b1e05 --- /dev/null +++ b/hw/remote/memory.c @@ -0,0 +1,65 @@ +/* + * Memory manager for remote device + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" + +#include "hw/remote/memory.h" +#include "exec/address-spaces.h" +#include "exec/ram_addr.h" +#include "qapi/error.h" + +static void remote_sysmem_reset(void) +{ + MemoryRegion *sysmem, *subregion, *next; + + sysmem = get_system_memory(); + + QTAILQ_FOREACH_SAFE(subregion, &sysmem->subregions, subregions_link, next) { + if (subregion->ram) { + memory_region_del_subregion(sysmem, subregion); + object_unparent(OBJECT(subregion)); + } + } +} + +void remote_sysmem_reconfig(MPQemuMsg *msg, Error **errp) +{ + ERRP_GUARD(); + SyncSysmemMsg *sysmem_info = &msg->data.sync_sysmem; + MemoryRegion *sysmem, *subregion; + static unsigned int suffix; + int region; + + sysmem = get_system_memory(); + + remote_sysmem_reset(); + + for (region = 0; region < msg->num_fds; region++) { + g_autofree char *name; + subregion = g_new(MemoryRegion, 1); + name = g_strdup_printf("remote-mem-%u", suffix++); + memory_region_init_ram_from_fd(subregion, NULL, + name, sysmem_info->sizes[region], + true, msg->fds[region], + sysmem_info->offsets[region], + errp); + + if (*errp) { + g_free(subregion); + remote_sysmem_reset(); + return; + } + + memory_region_add_subregion(sysmem, sysmem_info->gpas[region], + subregion); + + } +} diff --git a/hw/remote/meson.build b/hw/remote/meson.build new file mode 100644 index 0000000000..e6a5574242 --- /dev/null +++ b/hw/remote/meson.build @@ -0,0 +1,13 @@ +remote_ss = ss.source_set() + +remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('machine.c')) +remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('mpqemu-link.c')) +remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c')) +remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c')) +remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy.c')) +remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iohub.c')) + +specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('memory.c')) +specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy-memory-listener.c')) + +softmmu_ss.add_all(when: 'CONFIG_MULTIPROCESS', if_true: remote_ss) diff --git a/hw/remote/message.c b/hw/remote/message.c new file mode 100644 index 0000000000..11d729845c --- /dev/null +++ b/hw/remote/message.c @@ -0,0 +1,230 @@ +/* + * Copyright © 2020, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL-v2, version 2 or later. + * + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" + +#include "hw/remote/machine.h" +#include "io/channel.h" +#include "hw/remote/mpqemu-link.h" +#include "qapi/error.h" +#include "sysemu/runstate.h" +#include "hw/pci/pci.h" +#include "exec/memattrs.h" +#include "hw/remote/memory.h" +#include "hw/remote/iohub.h" +#include "sysemu/reset.h" + +static void process_config_write(QIOChannel *ioc, PCIDevice *dev, + MPQemuMsg *msg, Error **errp); +static void process_config_read(QIOChannel *ioc, PCIDevice *dev, + MPQemuMsg *msg, Error **errp); +static void process_bar_write(QIOChannel *ioc, MPQemuMsg *msg, Error **errp); +static void process_bar_read(QIOChannel *ioc, MPQemuMsg *msg, Error **errp); +static void process_device_reset_msg(QIOChannel *ioc, PCIDevice *dev, + Error **errp); + +void coroutine_fn mpqemu_remote_msg_loop_co(void *data) +{ + g_autofree RemoteCommDev *com = (RemoteCommDev *)data; + PCIDevice *pci_dev = NULL; + Error *local_err = NULL; + + assert(com->ioc); + + pci_dev = com->dev; + for (; !local_err;) { + MPQemuMsg msg = {0}; + + if (!mpqemu_msg_recv(&msg, com->ioc, &local_err)) { + break; + } + + if (!mpqemu_msg_valid(&msg)) { + error_setg(&local_err, "Received invalid message from proxy" + "in remote process pid="FMT_pid"", + getpid()); + break; + } + + switch (msg.cmd) { + case MPQEMU_CMD_PCI_CFGWRITE: + process_config_write(com->ioc, pci_dev, &msg, &local_err); + break; + case MPQEMU_CMD_PCI_CFGREAD: + process_config_read(com->ioc, pci_dev, &msg, &local_err); + break; + case MPQEMU_CMD_BAR_WRITE: + process_bar_write(com->ioc, &msg, &local_err); + break; + case MPQEMU_CMD_BAR_READ: + process_bar_read(com->ioc, &msg, &local_err); + break; + case MPQEMU_CMD_SYNC_SYSMEM: + remote_sysmem_reconfig(&msg, &local_err); + break; + case MPQEMU_CMD_SET_IRQFD: + process_set_irqfd_msg(pci_dev, &msg); + break; + case MPQEMU_CMD_DEVICE_RESET: + process_device_reset_msg(com->ioc, pci_dev, &local_err); + break; + default: + error_setg(&local_err, + "Unknown command (%d) received for device %s" + " (pid="FMT_pid")", + msg.cmd, DEVICE(pci_dev)->id, getpid()); + } + } + + if (local_err) { + error_report_err(local_err); + qemu_system_shutdown_request(SHUTDOWN_CAUSE_HOST_ERROR); + } else { + qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); + } +} + +static void process_config_write(QIOChannel *ioc, PCIDevice *dev, + MPQemuMsg *msg, Error **errp) +{ + ERRP_GUARD(); + PciConfDataMsg *conf = (PciConfDataMsg *)&msg->data.pci_conf_data; + MPQemuMsg ret = { 0 }; + + if ((conf->addr + sizeof(conf->val)) > pci_config_size(dev)) { + error_setg(errp, "Bad address for PCI config write, pid "FMT_pid".", + getpid()); + ret.data.u64 = UINT64_MAX; + } else { + pci_default_write_config(dev, conf->addr, conf->val, conf->len); + } + + ret.cmd = MPQEMU_CMD_RET; + ret.size = sizeof(ret.data.u64); + + if (!mpqemu_msg_send(&ret, ioc, NULL)) { + error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ", + getpid()); + } +} + +static void process_config_read(QIOChannel *ioc, PCIDevice *dev, + MPQemuMsg *msg, Error **errp) +{ + ERRP_GUARD(); + PciConfDataMsg *conf = (PciConfDataMsg *)&msg->data.pci_conf_data; + MPQemuMsg ret = { 0 }; + + if ((conf->addr + sizeof(conf->val)) > pci_config_size(dev)) { + error_setg(errp, "Bad address for PCI config read, pid "FMT_pid".", + getpid()); + ret.data.u64 = UINT64_MAX; + } else { + ret.data.u64 = pci_default_read_config(dev, conf->addr, conf->len); + } + + ret.cmd = MPQEMU_CMD_RET; + ret.size = sizeof(ret.data.u64); + + if (!mpqemu_msg_send(&ret, ioc, NULL)) { + error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ", + getpid()); + } +} + +static void process_bar_write(QIOChannel *ioc, MPQemuMsg *msg, Error **errp) +{ + ERRP_GUARD(); + BarAccessMsg *bar_access = &msg->data.bar_access; + AddressSpace *as = + bar_access->memory ? &address_space_memory : &address_space_io; + MPQemuMsg ret = { 0 }; + MemTxResult res; + uint64_t val; + + if (!is_power_of_2(bar_access->size) || + (bar_access->size > sizeof(uint64_t))) { + ret.data.u64 = UINT64_MAX; + goto fail; + } + + val = cpu_to_le64(bar_access->val); + + res = address_space_rw(as, bar_access->addr, MEMTXATTRS_UNSPECIFIED, + (void *)&val, bar_access->size, true); + + if (res != MEMTX_OK) { + error_setg(errp, "Bad address %"PRIx64" for mem write, pid "FMT_pid".", + bar_access->addr, getpid()); + ret.data.u64 = -1; + } + +fail: + ret.cmd = MPQEMU_CMD_RET; + ret.size = sizeof(ret.data.u64); + + if (!mpqemu_msg_send(&ret, ioc, NULL)) { + error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ", + getpid()); + } +} + +static void process_bar_read(QIOChannel *ioc, MPQemuMsg *msg, Error **errp) +{ + ERRP_GUARD(); + BarAccessMsg *bar_access = &msg->data.bar_access; + MPQemuMsg ret = { 0 }; + AddressSpace *as; + MemTxResult res; + uint64_t val = 0; + + as = bar_access->memory ? &address_space_memory : &address_space_io; + + if (!is_power_of_2(bar_access->size) || + (bar_access->size > sizeof(uint64_t))) { + val = UINT64_MAX; + goto fail; + } + + res = address_space_rw(as, bar_access->addr, MEMTXATTRS_UNSPECIFIED, + (void *)&val, bar_access->size, false); + + if (res != MEMTX_OK) { + error_setg(errp, "Bad address %"PRIx64" for mem read, pid "FMT_pid".", + bar_access->addr, getpid()); + val = UINT64_MAX; + } + +fail: + ret.cmd = MPQEMU_CMD_RET; + ret.data.u64 = le64_to_cpu(val); + ret.size = sizeof(ret.data.u64); + + if (!mpqemu_msg_send(&ret, ioc, NULL)) { + error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ", + getpid()); + } +} + +static void process_device_reset_msg(QIOChannel *ioc, PCIDevice *dev, + Error **errp) +{ + DeviceClass *dc = DEVICE_GET_CLASS(dev); + DeviceState *s = DEVICE(dev); + MPQemuMsg ret = { 0 }; + + if (dc->reset) { + dc->reset(s); + } + + ret.cmd = MPQEMU_CMD_RET; + + mpqemu_msg_send(&ret, ioc, errp); +} diff --git a/hw/remote/mpqemu-link.c b/hw/remote/mpqemu-link.c new file mode 100644 index 0000000000..9ce31526e8 --- /dev/null +++ b/hw/remote/mpqemu-link.c @@ -0,0 +1,267 @@ +/* + * Communication channel between QEMU and remote device process + * + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" + +#include "qemu/module.h" +#include "hw/remote/mpqemu-link.h" +#include "qapi/error.h" +#include "qemu/iov.h" +#include "qemu/error-report.h" +#include "qemu/main-loop.h" +#include "io/channel.h" +#include "sysemu/iothread.h" +#include "trace.h" + +/* + * Send message over the ioc QIOChannel. + * This function is safe to call from: + * - main loop in co-routine context. Will block the main loop if not in + * co-routine context; + * - vCPU thread with no co-routine context and if the channel is not part + * of the main loop handling; + * - IOThread within co-routine context, outside of co-routine context + * will block IOThread; + * Returns true if no errors were encountered, false otherwise. + */ +bool mpqemu_msg_send(MPQemuMsg *msg, QIOChannel *ioc, Error **errp) +{ + ERRP_GUARD(); + bool iolock = qemu_mutex_iothread_locked(); + bool iothread = qemu_in_iothread(); + struct iovec send[2] = {}; + int *fds = NULL; + size_t nfds = 0; + bool ret = false; + + send[0].iov_base = msg; + send[0].iov_len = MPQEMU_MSG_HDR_SIZE; + + send[1].iov_base = (void *)&msg->data; + send[1].iov_len = msg->size; + + if (msg->num_fds) { + nfds = msg->num_fds; + fds = msg->fds; + } + + /* + * Dont use in IOThread out of co-routine context as + * it will block IOThread. + */ + assert(qemu_in_coroutine() || !iothread); + + /* + * Skip unlocking/locking iothread lock when the IOThread is running + * in co-routine context. Co-routine context is asserted above + * for IOThread case. + * Also skip lock handling while in a co-routine in the main context. + */ + if (iolock && !iothread && !qemu_in_coroutine()) { + qemu_mutex_unlock_iothread(); + } + + if (!qio_channel_writev_full_all(ioc, send, G_N_ELEMENTS(send), + fds, nfds, errp)) { + ret = true; + } else { + trace_mpqemu_send_io_error(msg->cmd, msg->size, nfds); + } + + if (iolock && !iothread && !qemu_in_coroutine()) { + /* See above comment why skip locking here. */ + qemu_mutex_lock_iothread(); + } + + return ret; +} + +/* + * Read message from the ioc QIOChannel. + * This function is safe to call from: + * - From main loop in co-routine context. Will block the main loop if not in + * co-routine context; + * - From vCPU thread with no co-routine context and if the channel is not part + * of the main loop handling; + * - From IOThread within co-routine context, outside of co-routine context + * will block IOThread; + */ +static ssize_t mpqemu_read(QIOChannel *ioc, void *buf, size_t len, int **fds, + size_t *nfds, Error **errp) +{ + ERRP_GUARD(); + struct iovec iov = { .iov_base = buf, .iov_len = len }; + bool iolock = qemu_mutex_iothread_locked(); + bool iothread = qemu_in_iothread(); + int ret = -1; + + /* + * Dont use in IOThread out of co-routine context as + * it will block IOThread. + */ + assert(qemu_in_coroutine() || !iothread); + + if (iolock && !iothread && !qemu_in_coroutine()) { + qemu_mutex_unlock_iothread(); + } + + ret = qio_channel_readv_full_all_eof(ioc, &iov, 1, fds, nfds, errp); + + if (iolock && !iothread && !qemu_in_coroutine()) { + qemu_mutex_lock_iothread(); + } + + return (ret <= 0) ? ret : iov.iov_len; +} + +bool mpqemu_msg_recv(MPQemuMsg *msg, QIOChannel *ioc, Error **errp) +{ + ERRP_GUARD(); + g_autofree int *fds = NULL; + size_t nfds = 0; + ssize_t len; + bool ret = false; + + len = mpqemu_read(ioc, msg, MPQEMU_MSG_HDR_SIZE, &fds, &nfds, errp); + if (len <= 0) { + goto fail; + } else if (len != MPQEMU_MSG_HDR_SIZE) { + error_setg(errp, "Message header corrupted"); + goto fail; + } + + if (msg->size > sizeof(msg->data)) { + error_setg(errp, "Invalid size for message"); + goto fail; + } + + if (!msg->size) { + goto copy_fds; + } + + len = mpqemu_read(ioc, &msg->data, msg->size, NULL, NULL, errp); + if (len <= 0) { + goto fail; + } + if (len != msg->size) { + error_setg(errp, "Unable to read full message"); + goto fail; + } + +copy_fds: + msg->num_fds = nfds; + if (nfds > G_N_ELEMENTS(msg->fds)) { + error_setg(errp, + "Overflow error: received %zu fds, more than max of %d fds", + nfds, REMOTE_MAX_FDS); + goto fail; + } + if (nfds) { + memcpy(msg->fds, fds, nfds * sizeof(int)); + } + + ret = true; + +fail: + if (*errp) { + trace_mpqemu_recv_io_error(msg->cmd, msg->size, nfds); + } + while (*errp && nfds) { + close(fds[nfds - 1]); + nfds--; + } + + return ret; +} + +/* + * Send msg and wait for a reply with command code RET_MSG. + * Returns the message received of size u64 or UINT64_MAX + * on error. + * Called from VCPU thread in non-coroutine context. + * Used by the Proxy object to communicate to remote processes. + */ +uint64_t mpqemu_msg_send_and_await_reply(MPQemuMsg *msg, PCIProxyDev *pdev, + Error **errp) +{ + ERRP_GUARD(); + MPQemuMsg msg_reply = {0}; + uint64_t ret = UINT64_MAX; + + assert(!qemu_in_coroutine()); + + QEMU_LOCK_GUARD(&pdev->io_mutex); + if (!mpqemu_msg_send(msg, pdev->ioc, errp)) { + return ret; + } + + if (!mpqemu_msg_recv(&msg_reply, pdev->ioc, errp)) { + return ret; + } + + if (!mpqemu_msg_valid(&msg_reply) || msg_reply.cmd != MPQEMU_CMD_RET) { + error_setg(errp, "ERROR: Invalid reply received for command %d", + msg->cmd); + return ret; + } + + return msg_reply.data.u64; +} + +bool mpqemu_msg_valid(MPQemuMsg *msg) +{ + if (msg->cmd >= MPQEMU_CMD_MAX && msg->cmd < 0) { + return false; + } + + /* Verify FDs. */ + if (msg->num_fds >= REMOTE_MAX_FDS) { + return false; + } + + if (msg->num_fds > 0) { + for (int i = 0; i < msg->num_fds; i++) { + if (fcntl(msg->fds[i], F_GETFL) == -1) { + return false; + } + } + } + + /* Verify message specific fields. */ + switch (msg->cmd) { + case MPQEMU_CMD_SYNC_SYSMEM: + if (msg->num_fds == 0 || msg->size != sizeof(SyncSysmemMsg)) { + return false; + } + break; + case MPQEMU_CMD_PCI_CFGWRITE: + case MPQEMU_CMD_PCI_CFGREAD: + if (msg->size != sizeof(PciConfDataMsg)) { + return false; + } + break; + case MPQEMU_CMD_BAR_WRITE: + case MPQEMU_CMD_BAR_READ: + if ((msg->size != sizeof(BarAccessMsg)) || (msg->num_fds != 0)) { + return false; + } + break; + case MPQEMU_CMD_SET_IRQFD: + if (msg->size || (msg->num_fds != 2)) { + return false; + } + break; + default: + break; + } + + return true; +} diff --git a/hw/remote/proxy-memory-listener.c b/hw/remote/proxy-memory-listener.c new file mode 100644 index 0000000000..af1fa6f5aa --- /dev/null +++ b/hw/remote/proxy-memory-listener.c @@ -0,0 +1,227 @@ +/* + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" + +#include "qemu/compiler.h" +#include "qemu/int128.h" +#include "qemu/range.h" +#include "exec/memory.h" +#include "exec/cpu-common.h" +#include "cpu.h" +#include "exec/ram_addr.h" +#include "exec/address-spaces.h" +#include "qapi/error.h" +#include "hw/remote/mpqemu-link.h" +#include "hw/remote/proxy-memory-listener.h" + +/* + * TODO: get_fd_from_hostaddr(), proxy_mrs_can_merge() and + * proxy_memory_listener_commit() defined below perform tasks similar to the + * functions defined in vhost-user.c. These functions are good candidates + * for refactoring. + * + */ + +static void proxy_memory_listener_reset(MemoryListener *listener) +{ + ProxyMemoryListener *proxy_listener = container_of(listener, + ProxyMemoryListener, + listener); + int mrs; + + for (mrs = 0; mrs < proxy_listener->n_mr_sections; mrs++) { + memory_region_unref(proxy_listener->mr_sections[mrs].mr); + } + + g_free(proxy_listener->mr_sections); + proxy_listener->mr_sections = NULL; + proxy_listener->n_mr_sections = 0; +} + +static int get_fd_from_hostaddr(uint64_t host, ram_addr_t *offset) +{ + MemoryRegion *mr; + ram_addr_t off; + + /** + * Assumes that the host address is a valid address as it's + * coming from the MemoryListener system. In the case host + * address is not valid, the following call would return + * the default subregion of "system_memory" region, and + * not NULL. So it's not possible to check for NULL here. + */ + mr = memory_region_from_host((void *)(uintptr_t)host, &off); + + if (offset) { + *offset = off; + } + + return memory_region_get_fd(mr); +} + +static bool proxy_mrs_can_merge(uint64_t host, uint64_t prev_host, size_t size) +{ + if (((prev_host + size) != host)) { + return false; + } + + if (get_fd_from_hostaddr(host, NULL) != + get_fd_from_hostaddr(prev_host, NULL)) { + return false; + } + + return true; +} + +static bool try_merge(ProxyMemoryListener *proxy_listener, + MemoryRegionSection *section) +{ + uint64_t mrs_size, mrs_gpa, mrs_page; + MemoryRegionSection *prev_sec; + bool merged = false; + uintptr_t mrs_host; + RAMBlock *mrs_rb; + + if (!proxy_listener->n_mr_sections) { + return false; + } + + mrs_rb = section->mr->ram_block; + mrs_page = (uint64_t)qemu_ram_pagesize(mrs_rb); + mrs_size = int128_get64(section->size); + mrs_gpa = section->offset_within_address_space; + mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) + + section->offset_within_region; + + if (get_fd_from_hostaddr(mrs_host, NULL) < 0) { + return true; + } + + mrs_host = mrs_host & ~(mrs_page - 1); + mrs_gpa = mrs_gpa & ~(mrs_page - 1); + mrs_size = ROUND_UP(mrs_size, mrs_page); + + prev_sec = proxy_listener->mr_sections + + (proxy_listener->n_mr_sections - 1); + uint64_t prev_gpa_start = prev_sec->offset_within_address_space; + uint64_t prev_size = int128_get64(prev_sec->size); + uint64_t prev_gpa_end = range_get_last(prev_gpa_start, prev_size); + uint64_t prev_host_start = + (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) + + prev_sec->offset_within_region; + uint64_t prev_host_end = range_get_last(prev_host_start, prev_size); + + if (mrs_gpa <= (prev_gpa_end + 1)) { + g_assert(mrs_gpa > prev_gpa_start); + + if ((section->mr == prev_sec->mr) && + proxy_mrs_can_merge(mrs_host, prev_host_start, + (mrs_gpa - prev_gpa_start))) { + uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size); + merged = true; + prev_sec->offset_within_address_space = + MIN(prev_gpa_start, mrs_gpa); + prev_sec->offset_within_region = + MIN(prev_host_start, mrs_host) - + (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr); + prev_sec->size = int128_make64(max_end - MIN(prev_host_start, + mrs_host)); + } + } + + return merged; +} + +static void proxy_memory_listener_region_addnop(MemoryListener *listener, + MemoryRegionSection *section) +{ + ProxyMemoryListener *proxy_listener = container_of(listener, + ProxyMemoryListener, + listener); + + if (!memory_region_is_ram(section->mr) || + memory_region_is_rom(section->mr)) { + return; + } + + if (try_merge(proxy_listener, section)) { + return; + } + + ++proxy_listener->n_mr_sections; + proxy_listener->mr_sections = g_renew(MemoryRegionSection, + proxy_listener->mr_sections, + proxy_listener->n_mr_sections); + proxy_listener->mr_sections[proxy_listener->n_mr_sections - 1] = *section; + proxy_listener->mr_sections[proxy_listener->n_mr_sections - 1].fv = NULL; + memory_region_ref(section->mr); +} + +static void proxy_memory_listener_commit(MemoryListener *listener) +{ + ProxyMemoryListener *proxy_listener = container_of(listener, + ProxyMemoryListener, + listener); + MPQemuMsg msg; + MemoryRegionSection *section; + ram_addr_t offset; + uintptr_t host_addr; + int region; + Error *local_err = NULL; + + memset(&msg, 0, sizeof(MPQemuMsg)); + + msg.cmd = MPQEMU_CMD_SYNC_SYSMEM; + msg.num_fds = proxy_listener->n_mr_sections; + msg.size = sizeof(SyncSysmemMsg); + if (msg.num_fds > REMOTE_MAX_FDS) { + error_report("Number of fds is more than %d", REMOTE_MAX_FDS); + return; + } + + for (region = 0; region < proxy_listener->n_mr_sections; region++) { + section = &proxy_listener->mr_sections[region]; + msg.data.sync_sysmem.gpas[region] = + section->offset_within_address_space; + msg.data.sync_sysmem.sizes[region] = int128_get64(section->size); + host_addr = (uintptr_t)memory_region_get_ram_ptr(section->mr) + + section->offset_within_region; + msg.fds[region] = get_fd_from_hostaddr(host_addr, &offset); + msg.data.sync_sysmem.offsets[region] = offset; + } + if (!mpqemu_msg_send(&msg, proxy_listener->ioc, &local_err)) { + error_report_err(local_err); + } +} + +void proxy_memory_listener_deconfigure(ProxyMemoryListener *proxy_listener) +{ + memory_listener_unregister(&proxy_listener->listener); + + proxy_memory_listener_reset(&proxy_listener->listener); +} + +void proxy_memory_listener_configure(ProxyMemoryListener *proxy_listener, + QIOChannel *ioc) +{ + proxy_listener->n_mr_sections = 0; + proxy_listener->mr_sections = NULL; + + proxy_listener->ioc = ioc; + + proxy_listener->listener.begin = proxy_memory_listener_reset; + proxy_listener->listener.commit = proxy_memory_listener_commit; + proxy_listener->listener.region_add = proxy_memory_listener_region_addnop; + proxy_listener->listener.region_nop = proxy_memory_listener_region_addnop; + proxy_listener->listener.priority = 10; + + memory_listener_register(&proxy_listener->listener, + &address_space_memory); +} diff --git a/hw/remote/proxy.c b/hw/remote/proxy.c new file mode 100644 index 0000000000..4fa4be079d --- /dev/null +++ b/hw/remote/proxy.c @@ -0,0 +1,379 @@ +/* + * Copyright © 2018, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" + +#include "hw/remote/proxy.h" +#include "hw/pci/pci.h" +#include "qapi/error.h" +#include "io/channel-util.h" +#include "hw/qdev-properties.h" +#include "monitor/monitor.h" +#include "migration/blocker.h" +#include "qemu/sockets.h" +#include "hw/remote/mpqemu-link.h" +#include "qemu/error-report.h" +#include "hw/remote/proxy-memory-listener.h" +#include "qom/object.h" +#include "qemu/event_notifier.h" +#include "sysemu/kvm.h" +#include "util/event_notifier-posix.c" + +static void probe_pci_info(PCIDevice *dev, Error **errp); +static void proxy_device_reset(DeviceState *dev); + +static void proxy_intx_update(PCIDevice *pci_dev) +{ + PCIProxyDev *dev = PCI_PROXY_DEV(pci_dev); + PCIINTxRoute route; + int pin = pci_get_byte(pci_dev->config + PCI_INTERRUPT_PIN) - 1; + + if (dev->virq != -1) { + kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &dev->intr, dev->virq); + dev->virq = -1; + } + + route = pci_device_route_intx_to_irq(pci_dev, pin); + + dev->virq = route.irq; + + if (dev->virq != -1) { + kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &dev->intr, + &dev->resample, dev->virq); + } +} + +static void setup_irqfd(PCIProxyDev *dev) +{ + PCIDevice *pci_dev = PCI_DEVICE(dev); + MPQemuMsg msg; + Error *local_err = NULL; + + event_notifier_init(&dev->intr, 0); + event_notifier_init(&dev->resample, 0); + + memset(&msg, 0, sizeof(MPQemuMsg)); + msg.cmd = MPQEMU_CMD_SET_IRQFD; + msg.num_fds = 2; + msg.fds[0] = event_notifier_get_fd(&dev->intr); + msg.fds[1] = event_notifier_get_fd(&dev->resample); + msg.size = 0; + + if (!mpqemu_msg_send(&msg, dev->ioc, &local_err)) { + error_report_err(local_err); + } + + dev->virq = -1; + + proxy_intx_update(pci_dev); + + pci_device_set_intx_routing_notifier(pci_dev, proxy_intx_update); +} + +static void pci_proxy_dev_realize(PCIDevice *device, Error **errp) +{ + ERRP_GUARD(); + PCIProxyDev *dev = PCI_PROXY_DEV(device); + uint8_t *pci_conf = device->config; + int fd; + + if (!dev->fd) { + error_setg(errp, "fd parameter not specified for %s", + DEVICE(device)->id); + return; + } + + fd = monitor_fd_param(monitor_cur(), dev->fd, errp); + if (fd == -1) { + error_prepend(errp, "proxy: unable to parse fd %s: ", dev->fd); + return; + } + + if (!fd_is_socket(fd)) { + error_setg(errp, "proxy: fd %d is not a socket", fd); + close(fd); + return; + } + + dev->ioc = qio_channel_new_fd(fd, errp); + + error_setg(&dev->migration_blocker, "%s does not support migration", + TYPE_PCI_PROXY_DEV); + migrate_add_blocker(dev->migration_blocker, errp); + + qemu_mutex_init(&dev->io_mutex); + qio_channel_set_blocking(dev->ioc, true, NULL); + + pci_conf[PCI_LATENCY_TIMER] = 0xff; + pci_conf[PCI_INTERRUPT_PIN] = 0x01; + + proxy_memory_listener_configure(&dev->proxy_listener, dev->ioc); + + setup_irqfd(dev); + + probe_pci_info(PCI_DEVICE(dev), errp); +} + +static void pci_proxy_dev_exit(PCIDevice *pdev) +{ + PCIProxyDev *dev = PCI_PROXY_DEV(pdev); + + if (dev->ioc) { + qio_channel_close(dev->ioc, NULL); + } + + migrate_del_blocker(dev->migration_blocker); + + error_free(dev->migration_blocker); + + proxy_memory_listener_deconfigure(&dev->proxy_listener); + + event_notifier_cleanup(&dev->intr); + event_notifier_cleanup(&dev->resample); +} + +static void config_op_send(PCIProxyDev *pdev, uint32_t addr, uint32_t *val, + int len, unsigned int op) +{ + MPQemuMsg msg = { 0 }; + uint64_t ret = -EINVAL; + Error *local_err = NULL; + + msg.cmd = op; + msg.data.pci_conf_data.addr = addr; + msg.data.pci_conf_data.val = (op == MPQEMU_CMD_PCI_CFGWRITE) ? *val : 0; + msg.data.pci_conf_data.len = len; + msg.size = sizeof(PciConfDataMsg); + + ret = mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err); + if (local_err) { + error_report_err(local_err); + } + + if (ret == UINT64_MAX) { + error_report("Failed to perform PCI config %s operation", + (op == MPQEMU_CMD_PCI_CFGREAD) ? "READ" : "WRITE"); + } + + if (op == MPQEMU_CMD_PCI_CFGREAD) { + *val = (uint32_t)ret; + } +} + +static uint32_t pci_proxy_read_config(PCIDevice *d, uint32_t addr, int len) +{ + uint32_t val; + + config_op_send(PCI_PROXY_DEV(d), addr, &val, len, MPQEMU_CMD_PCI_CFGREAD); + + return val; +} + +static void pci_proxy_write_config(PCIDevice *d, uint32_t addr, uint32_t val, + int len) +{ + /* + * Some of the functions access the copy of remote device's PCI config + * space which is cached in the proxy device. Therefore, maintain + * it updated. + */ + pci_default_write_config(d, addr, val, len); + + config_op_send(PCI_PROXY_DEV(d), addr, &val, len, MPQEMU_CMD_PCI_CFGWRITE); +} + +static Property proxy_properties[] = { + DEFINE_PROP_STRING("fd", PCIProxyDev, fd), + DEFINE_PROP_END_OF_LIST(), +}; + +static void pci_proxy_dev_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); + + k->realize = pci_proxy_dev_realize; + k->exit = pci_proxy_dev_exit; + k->config_read = pci_proxy_read_config; + k->config_write = pci_proxy_write_config; + + dc->reset = proxy_device_reset; + + device_class_set_props(dc, proxy_properties); +} + +static const TypeInfo pci_proxy_dev_type_info = { + .name = TYPE_PCI_PROXY_DEV, + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(PCIProxyDev), + .class_init = pci_proxy_dev_class_init, + .interfaces = (InterfaceInfo[]) { + { INTERFACE_CONVENTIONAL_PCI_DEVICE }, + { }, + }, +}; + +static void pci_proxy_dev_register_types(void) +{ + type_register_static(&pci_proxy_dev_type_info); +} + +type_init(pci_proxy_dev_register_types) + +static void send_bar_access_msg(PCIProxyDev *pdev, MemoryRegion *mr, + bool write, hwaddr addr, uint64_t *val, + unsigned size, bool memory) +{ + MPQemuMsg msg = { 0 }; + long ret = -EINVAL; + Error *local_err = NULL; + + msg.size = sizeof(BarAccessMsg); + msg.data.bar_access.addr = mr->addr + addr; + msg.data.bar_access.size = size; + msg.data.bar_access.memory = memory; + + if (write) { + msg.cmd = MPQEMU_CMD_BAR_WRITE; + msg.data.bar_access.val = *val; + } else { + msg.cmd = MPQEMU_CMD_BAR_READ; + } + + ret = mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err); + if (local_err) { + error_report_err(local_err); + } + + if (!write) { + *val = ret; + } +} + +static void proxy_bar_write(void *opaque, hwaddr addr, uint64_t val, + unsigned size) +{ + ProxyMemoryRegion *pmr = opaque; + + send_bar_access_msg(pmr->dev, &pmr->mr, true, addr, &val, size, + pmr->memory); +} + +static uint64_t proxy_bar_read(void *opaque, hwaddr addr, unsigned size) +{ + ProxyMemoryRegion *pmr = opaque; + uint64_t val; + + send_bar_access_msg(pmr->dev, &pmr->mr, false, addr, &val, size, + pmr->memory); + + return val; +} + +const MemoryRegionOps proxy_mr_ops = { + .read = proxy_bar_read, + .write = proxy_bar_write, + .endianness = DEVICE_NATIVE_ENDIAN, + .impl = { + .min_access_size = 1, + .max_access_size = 8, + }, +}; + +static void probe_pci_info(PCIDevice *dev, Error **errp) +{ + PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev); + uint32_t orig_val, new_val, base_class, val; + PCIProxyDev *pdev = PCI_PROXY_DEV(dev); + DeviceClass *dc = DEVICE_CLASS(pc); + uint8_t type; + int i, size; + + config_op_send(pdev, PCI_VENDOR_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD); + pc->vendor_id = (uint16_t)val; + + config_op_send(pdev, PCI_DEVICE_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD); + pc->device_id = (uint16_t)val; + + config_op_send(pdev, PCI_CLASS_DEVICE, &val, 2, MPQEMU_CMD_PCI_CFGREAD); + pc->class_id = (uint16_t)val; + + config_op_send(pdev, PCI_SUBSYSTEM_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD); + pc->subsystem_id = (uint16_t)val; + + base_class = pc->class_id >> 4; + switch (base_class) { + case PCI_BASE_CLASS_BRIDGE: + set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories); + break; + case PCI_BASE_CLASS_STORAGE: + set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); + break; + case PCI_BASE_CLASS_NETWORK: + set_bit(DEVICE_CATEGORY_NETWORK, dc->categories); + break; + case PCI_BASE_CLASS_INPUT: + set_bit(DEVICE_CATEGORY_INPUT, dc->categories); + break; + case PCI_BASE_CLASS_DISPLAY: + set_bit(DEVICE_CATEGORY_DISPLAY, dc->categories); + break; + case PCI_BASE_CLASS_PROCESSOR: + set_bit(DEVICE_CATEGORY_CPU, dc->categories); + break; + default: + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + break; + } + + for (i = 0; i < PCI_NUM_REGIONS; i++) { + config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &orig_val, 4, + MPQEMU_CMD_PCI_CFGREAD); + new_val = 0xffffffff; + config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &new_val, 4, + MPQEMU_CMD_PCI_CFGWRITE); + config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &new_val, 4, + MPQEMU_CMD_PCI_CFGREAD); + size = (~(new_val & 0xFFFFFFF0)) + 1; + config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &orig_val, 4, + MPQEMU_CMD_PCI_CFGWRITE); + type = (new_val & 0x1) ? + PCI_BASE_ADDRESS_SPACE_IO : PCI_BASE_ADDRESS_SPACE_MEMORY; + + if (size) { + g_autofree char *name; + pdev->region[i].dev = pdev; + pdev->region[i].present = true; + if (type == PCI_BASE_ADDRESS_SPACE_MEMORY) { + pdev->region[i].memory = true; + } + name = g_strdup_printf("bar-region-%d", i); + memory_region_init_io(&pdev->region[i].mr, OBJECT(pdev), + &proxy_mr_ops, &pdev->region[i], + name, size); + pci_register_bar(dev, i, type, &pdev->region[i].mr); + } + } +} + +static void proxy_device_reset(DeviceState *dev) +{ + PCIProxyDev *pdev = PCI_PROXY_DEV(dev); + MPQemuMsg msg = { 0 }; + Error *local_err = NULL; + + msg.cmd = MPQEMU_CMD_DEVICE_RESET; + msg.size = 0; + + mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err); + if (local_err) { + error_report_err(local_err); + } + +} diff --git a/hw/remote/remote-obj.c b/hw/remote/remote-obj.c new file mode 100644 index 0000000000..4f21254219 --- /dev/null +++ b/hw/remote/remote-obj.c @@ -0,0 +1,203 @@ +/* + * Copyright © 2020, 2021 Oracle and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL-v2, version 2 or later. + * + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" + +#include "qemu/error-report.h" +#include "qemu/notify.h" +#include "qom/object_interfaces.h" +#include "hw/qdev-core.h" +#include "io/channel.h" +#include "hw/qdev-core.h" +#include "hw/remote/machine.h" +#include "io/channel-util.h" +#include "qapi/error.h" +#include "sysemu/sysemu.h" +#include "hw/pci/pci.h" +#include "qemu/sockets.h" +#include "monitor/monitor.h" + +#define TYPE_REMOTE_OBJECT "x-remote-object" +OBJECT_DECLARE_TYPE(RemoteObject, RemoteObjectClass, REMOTE_OBJECT) + +struct RemoteObjectClass { + ObjectClass parent_class; + + unsigned int nr_devs; + unsigned int max_devs; +}; + +struct RemoteObject { + /* private */ + Object parent; + + Notifier machine_done; + + int32_t fd; + char *devid; + + QIOChannel *ioc; + + DeviceState *dev; + DeviceListener listener; +}; + +static void remote_object_set_fd(Object *obj, const char *str, Error **errp) +{ + RemoteObject *o = REMOTE_OBJECT(obj); + int fd = -1; + + fd = monitor_fd_param(monitor_cur(), str, errp); + if (fd == -1) { + error_prepend(errp, "Could not parse remote object fd %s:", str); + return; + } + + if (!fd_is_socket(fd)) { + error_setg(errp, "File descriptor '%s' is not a socket", str); + close(fd); + return; + } + + o->fd = fd; +} + +static void remote_object_set_devid(Object *obj, const char *str, Error **errp) +{ + RemoteObject *o = REMOTE_OBJECT(obj); + + g_free(o->devid); + + o->devid = g_strdup(str); +} + +static void remote_object_unrealize_listener(DeviceListener *listener, + DeviceState *dev) +{ + RemoteObject *o = container_of(listener, RemoteObject, listener); + + if (o->dev == dev) { + object_unref(OBJECT(o)); + } +} + +static void remote_object_machine_done(Notifier *notifier, void *data) +{ + RemoteObject *o = container_of(notifier, RemoteObject, machine_done); + DeviceState *dev = NULL; + QIOChannel *ioc = NULL; + Coroutine *co = NULL; + RemoteCommDev *comdev = NULL; + Error *err = NULL; + + dev = qdev_find_recursive(sysbus_get_default(), o->devid); + if (!dev || !object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { + error_report("%s is not a PCI device", o->devid); + return; + } + + ioc = qio_channel_new_fd(o->fd, &err); + if (!ioc) { + error_report_err(err); + return; + } + qio_channel_set_blocking(ioc, false, NULL); + + o->dev = dev; + + o->listener.unrealize = remote_object_unrealize_listener; + device_listener_register(&o->listener); + + /* co-routine should free this. */ + comdev = g_new0(RemoteCommDev, 1); + *comdev = (RemoteCommDev) { + .ioc = ioc, + .dev = PCI_DEVICE(dev), + }; + + co = qemu_coroutine_create(mpqemu_remote_msg_loop_co, comdev); + qemu_coroutine_enter(co); +} + +static void remote_object_init(Object *obj) +{ + RemoteObjectClass *k = REMOTE_OBJECT_GET_CLASS(obj); + RemoteObject *o = REMOTE_OBJECT(obj); + + if (k->nr_devs >= k->max_devs) { + error_report("Reached maximum number of devices: %u", k->max_devs); + return; + } + + o->ioc = NULL; + o->fd = -1; + o->devid = NULL; + + k->nr_devs++; + + o->machine_done.notify = remote_object_machine_done; + qemu_add_machine_init_done_notifier(&o->machine_done); +} + +static void remote_object_finalize(Object *obj) +{ + RemoteObjectClass *k = REMOTE_OBJECT_GET_CLASS(obj); + RemoteObject *o = REMOTE_OBJECT(obj); + + device_listener_unregister(&o->listener); + + if (o->ioc) { + qio_channel_shutdown(o->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); + qio_channel_close(o->ioc, NULL); + } + + object_unref(OBJECT(o->ioc)); + + k->nr_devs--; + g_free(o->devid); +} + +static void remote_object_class_init(ObjectClass *klass, void *data) +{ + RemoteObjectClass *k = REMOTE_OBJECT_CLASS(klass); + + /* + * Limit number of supported devices to 1. This is done to avoid devices + * from one VM accessing the RAM of another VM. This is done until we + * start using separate address spaces for individual devices. + */ + k->max_devs = 1; + k->nr_devs = 0; + + object_class_property_add_str(klass, "fd", NULL, remote_object_set_fd); + object_class_property_add_str(klass, "devid", NULL, + remote_object_set_devid); +} + +static const TypeInfo remote_object_info = { + .name = TYPE_REMOTE_OBJECT, + .parent = TYPE_OBJECT, + .instance_size = sizeof(RemoteObject), + .instance_init = remote_object_init, + .instance_finalize = remote_object_finalize, + .class_size = sizeof(RemoteObjectClass), + .class_init = remote_object_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } +}; + +static void register_types(void) +{ + type_register_static(&remote_object_info); +} + +type_init(register_types); diff --git a/hw/remote/trace-events b/hw/remote/trace-events new file mode 100644 index 0000000000..0b23974f90 --- /dev/null +++ b/hw/remote/trace-events @@ -0,0 +1,4 @@ +# multi-process trace events + +mpqemu_send_io_error(int cmd, int size, int nfds) "send command %d size %d, %d file descriptors to remote process" +mpqemu_recv_io_error(int cmd, int size, int nfds) "failed to receive %d size %d, %d file descriptors to remote process" diff --git a/hw/remote/trace.h b/hw/remote/trace.h new file mode 100644 index 0000000000..5d5e3ac720 --- /dev/null +++ b/hw/remote/trace.h @@ -0,0 +1 @@ +#include "trace/trace-hw_remote.h" |