author    | Peter Maydell <peter.maydell@linaro.org> | 2018-06-30 11:55:32 +0100
committer | Peter Maydell <peter.maydell@linaro.org> | 2018-06-30 11:55:32 +0100
commit    | 275845ae65fdfe1f84484fd1d2ca274ce80d7aaf (patch)
tree      | c8fe8c6c211008a9335b656235ab9187ffa524b1
parent    | ce59ecc411fcde327e35364411f9e9ebbf96cab1 (diff)
parent    | 802abf4024d23e48d45373ac3f2b580124b54b47 (diff)
download  | qemu-275845ae65fdfe1f84484fd1d2ca274ce80d7aaf.zip
Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20180629' into staging
target-arm queue:
* last of the SVE patches; SVE is now enabled for aarch64 linux-user
* sd: Don't trace SDRequest crc field (coverity bugfix)
* target/arm: Mark PMINTENSET accesses as possibly doing IO
* clean up v7VE feature bit handling
* i.mx7d: minor cleanups
* target/arm: support reading of CNT[VCT|FRQ]_EL0 from user-space
* target/arm: Implement ARMv8.2-DotProd
* virt: add addresses to dt node names (which stops dtc from
complaining that they're not correctly named)
* cleanups: replace error_setg(&error_fatal) by error_report() + exit()
# gpg: Signature made Fri 29 Jun 2018 15:52:21 BST
# gpg: using RSA key 3C2525ED14360CDE
# gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>"
# gpg: aka "Peter Maydell <pmaydell@gmail.com>"
# gpg: aka "Peter Maydell <pmaydell@chiark.greenend.org.uk>"
# Primary key fingerprint: E1A5 C593 CD41 9DE2 8E83 15CF 3C25 25ED 1436 0CDE
* remotes/pmaydell/tags/pull-target-arm-20180629: (55 commits)
target/arm: Add ID_ISAR6
target/arm: Prune a15 features from max
target/arm: Prune a57 features from max
target/arm: Fix SVE system register access checks
target/arm: Fix SVE signed division vs x86 overflow exception
sdcard: Use the ldst API
sd: Don't trace SDRequest crc field
target/arm: Mark PMINTENSET accesses as possibly doing IO
target/arm: Remove redundant DIV detection for KVM
target/arm: Add ARM_FEATURE_V7VE for v7 Virtualization Extensions
i.mx7d: Change IRQ number type from hwaddr to int
i.mx7d: Change SRC unimplemented device name from sdma to src
i.mx7d: Remove unused header files
target/arm: support reading of CNT[VCT|FRQ]_EL0 from user-space
target/arm: Implement ARMv8.2-DotProd
target/arm: Enable SVE for aarch64-linux-user
target/arm: Implement SVE dot product (indexed)
target/arm: Implement SVE dot product (vectors)
target/arm: Implement SVE fp complex multiply add (indexed)
target/arm: Pass index to AdvSIMD FCMLA (indexed)
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
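
The error-handling cleanup named in the last bullet of the queue summary is purely mechanical: call sites that relied on error_setg(&error_fatal, ...) to terminate QEMU now print the message with error_report() and call exit(1) themselves. A minimal before/after sketch of the pattern, lifted from the read_fstree() opendir check in the device_tree.c hunk below:

    /* Before: passing &error_fatal makes the Error API terminate the process. */
    if (!d) {
        error_setg(&error_fatal, "%s cannot open %s", __func__, dirname);
        return;
    }

    /* After: report the message and exit explicitly. */
    if (!d) {
        error_report("%s cannot open %s", __func__, dirname);
        exit(1);
    }

The hw/arm/sysbus-fdt.c hunks apply the same substitution; hw/block/fdc.c instead turns its unreachable error path into an assert().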
-rw-r--r-- | device_tree.c | 78
-rw-r--r-- | hw/arm/boot.c | 41
-rw-r--r-- | hw/arm/fsl-imx7.c | 8
-rw-r--r-- | hw/arm/mcimx7d-sabre.c | 2
-rw-r--r-- | hw/arm/sysbus-fdt.c | 53
-rw-r--r-- | hw/arm/virt.c | 70
-rw-r--r-- | hw/block/fdc.c | 9
-rw-r--r-- | hw/sd/bcm2835_sdhost.c | 13
-rw-r--r-- | hw/sd/core.c | 2
-rw-r--r-- | hw/sd/milkymist-memcard.c | 3
-rw-r--r-- | hw/sd/omap_mmc.c | 6
-rw-r--r-- | hw/sd/pl181.c | 11
-rw-r--r-- | hw/sd/sdhci.c | 15
-rw-r--r-- | hw/sd/ssi-sd.c | 6
-rw-r--r-- | hw/sd/trace-events | 2
-rw-r--r-- | include/sysemu/device_tree.h | 16
-rw-r--r-- | linux-user/elfload.c | 2
-rw-r--r-- | target/arm/cpu.c | 36
-rw-r--r-- | target/arm/cpu.h | 3
-rw-r--r-- | target/arm/cpu64.c | 13
-rw-r--r-- | target/arm/helper-sve.h | 682
-rw-r--r-- | target/arm/helper.c | 44
-rw-r--r-- | target/arm/helper.h | 44
-rw-r--r-- | target/arm/kvm32.c | 27
-rw-r--r-- | target/arm/sve.decode | 427
-rw-r--r-- | target/arm/sve_helper.c | 1875
-rw-r--r-- | target/arm/translate-a64.c | 62
-rw-r--r-- | target/arm/translate-sve.c | 1688
-rw-r--r-- | target/arm/translate.c | 102
-rw-r--r-- | target/arm/vec_helper.c | 311
30 files changed, 5394 insertions, 257 deletions
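
Two of the hunks below introduce and then use a new helper: device_tree.c and include/sysemu/device_tree.h add qemu_fdt_node_unit_path(), which returns a NULL-terminated array of paths for every node named either "name" or "name@unit-address", and hw/arm/boot.c uses it to NOP all root /memory nodes before regenerating them. A condensed caller sketch following the boot.c hunk; the wrapper function name is illustrative only, and the body assumes the usual sysemu/device_tree.h and qemu/error-report.h includes:

    /* Hypothetical wrapper; the loop body mirrors arm_load_dtb() in hw/arm/boot.c. */
    static int nop_memory_nodes(void *fdt)
    {
        Error *err = NULL;
        char **node_path = qemu_fdt_node_unit_path(fdt, "memory", &err);
        int n = 0;

        if (err) {                      /* walking the blob failed */
            error_report_err(err);
            return -1;
        }
        while (node_path[n]) {          /* the returned array is NULL-terminated */
            if (g_str_has_prefix(node_path[n], "/memory")) {
                qemu_fdt_nop_node(fdt, node_path[n]);
            }
            n++;
        }
        g_strfreev(node_path);          /* frees the strings and the array */
        return 0;
    }

g_strfreev() releases both the array and the strings it owns, which is why the header comment in device_tree.h asks callers to free the result that way.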
diff --git a/device_tree.c b/device_tree.c index 52c3358a55..6d9c9726f6 100644 --- a/device_tree.c +++ b/device_tree.c @@ -140,15 +140,16 @@ static void read_fstree(void *fdt, const char *dirname) const char *parent_node; if (strstr(dirname, root_dir) != dirname) { - error_setg(&error_fatal, "%s: %s must be searched within %s", - __func__, dirname, root_dir); + error_report("%s: %s must be searched within %s", + __func__, dirname, root_dir); + exit(1); } parent_node = &dirname[strlen(SYSFS_DT_BASEDIR)]; d = opendir(dirname); if (!d) { - error_setg(&error_fatal, "%s cannot open %s", __func__, dirname); - return; + error_report("%s cannot open %s", __func__, dirname); + exit(1); } while ((de = readdir(d)) != NULL) { @@ -162,7 +163,8 @@ static void read_fstree(void *fdt, const char *dirname) tmpnam = g_strdup_printf("%s/%s", dirname, de->d_name); if (lstat(tmpnam, &st) < 0) { - error_setg(&error_fatal, "%s cannot lstat %s", __func__, tmpnam); + error_report("%s cannot lstat %s", __func__, tmpnam); + exit(1); } if (S_ISREG(st.st_mode)) { @@ -170,8 +172,9 @@ static void read_fstree(void *fdt, const char *dirname) gsize len; if (!g_file_get_contents(tmpnam, &val, &len, NULL)) { - error_setg(&error_fatal, "%s not able to extract info from %s", - __func__, tmpnam); + error_report("%s not able to extract info from %s", + __func__, tmpnam); + exit(1); } if (strlen(parent_node) > 0) { @@ -206,9 +209,9 @@ void *load_device_tree_from_sysfs(void) host_fdt = create_device_tree(&host_fdt_size); read_fstree(host_fdt, SYSFS_DT_BASEDIR); if (fdt_check_header(host_fdt)) { - error_setg(&error_fatal, - "%s host device tree extracted into memory is invalid", - __func__); + error_report("%s host device tree extracted into memory is invalid", + __func__); + exit(1); } return host_fdt; } @@ -229,6 +232,61 @@ static int findnode_nofail(void *fdt, const char *node_path) return offset; } +char **qemu_fdt_node_unit_path(void *fdt, const char *name, Error **errp) +{ + char *prefix = g_strdup_printf("%s@", name); + unsigned int path_len = 16, n = 0; + GSList *path_list = NULL, *iter; + const char *iter_name; + int offset, len, ret; + char **path_array; + + offset = fdt_next_node(fdt, -1, NULL); + + while (offset >= 0) { + iter_name = fdt_get_name(fdt, offset, &len); + if (!iter_name) { + offset = len; + break; + } + if (!strcmp(iter_name, name) || g_str_has_prefix(iter_name, prefix)) { + char *path; + + path = g_malloc(path_len); + while ((ret = fdt_get_path(fdt, offset, path, path_len)) + == -FDT_ERR_NOSPACE) { + path_len += 16; + path = g_realloc(path, path_len); + } + path_list = g_slist_prepend(path_list, path); + n++; + } + offset = fdt_next_node(fdt, offset, NULL); + } + g_free(prefix); + + if (offset < 0 && offset != -FDT_ERR_NOTFOUND) { + error_setg(errp, "%s: abort parsing dt for %s node units: %s", + __func__, name, fdt_strerror(offset)); + for (iter = path_list; iter; iter = iter->next) { + g_free(iter->data); + } + g_slist_free(path_list); + return NULL; + } + + path_array = g_new(char *, n + 1); + path_array[n--] = NULL; + + for (iter = path_list; iter; iter = iter->next) { + path_array[n--] = iter->data; + } + + g_slist_free(path_list); + + return path_array; +} + char **qemu_fdt_node_path(void *fdt, const char *name, char *compat, Error **errp) { diff --git a/hw/arm/boot.c b/hw/arm/boot.c index 1e481662ad..e09201cc97 100644 --- a/hw/arm/boot.c +++ b/hw/arm/boot.c @@ -490,11 +490,13 @@ int arm_load_dtb(hwaddr addr, const struct arm_boot_info *binfo, hwaddr addr_limit, AddressSpace *as) { void *fdt = NULL; - 
int size, rc; + int size, rc, n = 0; uint32_t acells, scells; char *nodename; unsigned int i; hwaddr mem_base, mem_len; + char **node_path; + Error *err = NULL; if (binfo->dtb_filename) { char *filename; @@ -546,12 +548,21 @@ int arm_load_dtb(hwaddr addr, const struct arm_boot_info *binfo, goto fail; } + /* nop all root nodes matching /memory or /memory@unit-address */ + node_path = qemu_fdt_node_unit_path(fdt, "memory", &err); + if (err) { + error_report_err(err); + goto fail; + } + while (node_path[n]) { + if (g_str_has_prefix(node_path[n], "/memory")) { + qemu_fdt_nop_node(fdt, node_path[n]); + } + n++; + } + g_strfreev(node_path); + if (nb_numa_nodes > 0) { - /* - * Turn the /memory node created before into a NOP node, then create - * /memory@addr nodes for all numa nodes respectively. - */ - qemu_fdt_nop_node(fdt, "/memory"); mem_base = binfo->loader_start; for (i = 0; i < nb_numa_nodes; i++) { mem_len = numa_info[i].node_mem; @@ -572,24 +583,18 @@ int arm_load_dtb(hwaddr addr, const struct arm_boot_info *binfo, g_free(nodename); } } else { - Error *err = NULL; - - rc = fdt_path_offset(fdt, "/memory"); - if (rc < 0) { - qemu_fdt_add_subnode(fdt, "/memory"); - } - - if (!qemu_fdt_getprop(fdt, "/memory", "device_type", NULL, &err)) { - qemu_fdt_setprop_string(fdt, "/memory", "device_type", "memory"); - } + nodename = g_strdup_printf("/memory@%" PRIx64, binfo->loader_start); + qemu_fdt_add_subnode(fdt, nodename); + qemu_fdt_setprop_string(fdt, nodename, "device_type", "memory"); - rc = qemu_fdt_setprop_sized_cells(fdt, "/memory", "reg", + rc = qemu_fdt_setprop_sized_cells(fdt, nodename, "reg", acells, binfo->loader_start, scells, binfo->ram_size); if (rc < 0) { - fprintf(stderr, "couldn't set /memory/reg\n"); + fprintf(stderr, "couldn't set %s reg\n", nodename); goto fail; } + g_free(nodename); } rc = fdt_path_offset(fdt, "/chosen"); diff --git a/hw/arm/fsl-imx7.c b/hw/arm/fsl-imx7.c index 26c1d27f7c..44fde03cbe 100644 --- a/hw/arm/fsl-imx7.c +++ b/hw/arm/fsl-imx7.c @@ -324,7 +324,7 @@ static void fsl_imx7_realize(DeviceState *dev, Error **errp) FSL_IMX7_ECSPI4_ADDR, }; - static const hwaddr FSL_IMX7_SPIn_IRQ[FSL_IMX7_NUM_ECSPIS] = { + static const int FSL_IMX7_SPIn_IRQ[FSL_IMX7_NUM_ECSPIS] = { FSL_IMX7_ECSPI1_IRQ, FSL_IMX7_ECSPI2_IRQ, FSL_IMX7_ECSPI3_IRQ, @@ -349,7 +349,7 @@ static void fsl_imx7_realize(DeviceState *dev, Error **errp) FSL_IMX7_I2C4_ADDR, }; - static const hwaddr FSL_IMX7_I2Cn_IRQ[FSL_IMX7_NUM_I2CS] = { + static const int FSL_IMX7_I2Cn_IRQ[FSL_IMX7_NUM_I2CS] = { FSL_IMX7_I2C1_IRQ, FSL_IMX7_I2C2_IRQ, FSL_IMX7_I2C3_IRQ, @@ -459,7 +459,7 @@ static void fsl_imx7_realize(DeviceState *dev, Error **errp) /* * SRC */ - create_unimplemented_device("sdma", FSL_IMX7_SRC_ADDR, FSL_IMX7_SRC_SIZE); + create_unimplemented_device("src", FSL_IMX7_SRC_ADDR, FSL_IMX7_SRC_SIZE); /* * Watchdog @@ -515,7 +515,7 @@ static void fsl_imx7_realize(DeviceState *dev, Error **errp) FSL_IMX7_USB3_ADDR, }; - static const hwaddr FSL_IMX7_USBn_IRQ[FSL_IMX7_NUM_USBS] = { + static const int FSL_IMX7_USBn_IRQ[FSL_IMX7_NUM_USBS] = { FSL_IMX7_USB1_IRQ, FSL_IMX7_USB2_IRQ, FSL_IMX7_USB3_IRQ, diff --git a/hw/arm/mcimx7d-sabre.c b/hw/arm/mcimx7d-sabre.c index 95fb409d9c..9c5f0e70c3 100644 --- a/hw/arm/mcimx7d-sabre.c +++ b/hw/arm/mcimx7d-sabre.c @@ -18,10 +18,8 @@ #include "hw/arm/fsl-imx7.h" #include "hw/boards.h" #include "sysemu/sysemu.h" -#include "sysemu/device_tree.h" #include "qemu/error-report.h" #include "sysemu/qtest.h" -#include "net/net.h" typedef struct { FslIMX7State soc; diff --git 
a/hw/arm/sysbus-fdt.c b/hw/arm/sysbus-fdt.c index 277ed872e7..0d4c75702c 100644 --- a/hw/arm/sysbus-fdt.c +++ b/hw/arm/sysbus-fdt.c @@ -92,16 +92,20 @@ static void copy_properties_from_host(HostProperty *props, int nb_props, r = qemu_fdt_getprop(host_fdt, node_path, props[i].name, &prop_len, - props[i].optional ? &err : &error_fatal); + &err); if (r) { qemu_fdt_setprop(guest_fdt, nodename, props[i].name, r, prop_len); } else { - if (prop_len != -FDT_ERR_NOTFOUND) { - /* optional property not returned although property exists */ - error_report_err(err); - } else { + if (props[i].optional && prop_len == -FDT_ERR_NOTFOUND) { + /* optional property does not exist */ error_free(err); + } else { + error_report_err(err); + } + if (!props[i].optional) { + /* mandatory property not found: bail out */ + exit(1); } } } @@ -138,9 +142,9 @@ static void fdt_build_clock_node(void *host_fdt, void *guest_fdt, node_offset = fdt_node_offset_by_phandle(host_fdt, host_phandle); if (node_offset <= 0) { - error_setg(&error_fatal, - "not able to locate clock handle %d in host device tree", - host_phandle); + error_report("not able to locate clock handle %d in host device tree", + host_phandle); + exit(1); } node_path = g_malloc(path_len); while ((ret = fdt_get_path(host_fdt, node_offset, node_path, path_len)) @@ -149,16 +153,16 @@ static void fdt_build_clock_node(void *host_fdt, void *guest_fdt, node_path = g_realloc(node_path, path_len); } if (ret < 0) { - error_setg(&error_fatal, - "not able to retrieve node path for clock handle %d", - host_phandle); + error_report("not able to retrieve node path for clock handle %d", + host_phandle); + exit(1); } r = qemu_fdt_getprop(host_fdt, node_path, "compatible", &prop_len, &error_fatal); if (strcmp(r, "fixed-clock")) { - error_setg(&error_fatal, - "clock handle %d is not a fixed clock", host_phandle); + error_report("clock handle %d is not a fixed clock", host_phandle); + exit(1); } nodename = strrchr(node_path, '/'); @@ -301,34 +305,37 @@ static int add_amd_xgbe_fdt_node(SysBusDevice *sbdev, void *opaque) dt_name = sysfs_to_dt_name(vbasedev->name); if (!dt_name) { - error_setg(&error_fatal, "%s incorrect sysfs device name %s", - __func__, vbasedev->name); + error_report("%s incorrect sysfs device name %s", + __func__, vbasedev->name); + exit(1); } node_path = qemu_fdt_node_path(host_fdt, dt_name, vdev->compat, &error_fatal); if (!node_path || !node_path[0]) { - error_setg(&error_fatal, "%s unable to retrieve node path for %s/%s", - __func__, dt_name, vdev->compat); + error_report("%s unable to retrieve node path for %s/%s", + __func__, dt_name, vdev->compat); + exit(1); } if (node_path[1]) { - error_setg(&error_fatal, "%s more than one node matching %s/%s!", - __func__, dt_name, vdev->compat); + error_report("%s more than one node matching %s/%s!", + __func__, dt_name, vdev->compat); + exit(1); } g_free(dt_name); if (vbasedev->num_regions != 5) { - error_setg(&error_fatal, "%s Does the host dt node combine XGBE/PHY?", - __func__); + error_report("%s Does the host dt node combine XGBE/PHY?", __func__); + exit(1); } /* generate nodes for DMA_CLK and PTP_CLK */ r = qemu_fdt_getprop(host_fdt, node_path[0], "clocks", &prop_len, &error_fatal); if (prop_len != 8) { - error_setg(&error_fatal, "%s clocks property should contain 2 handles", - __func__); + error_report("%s clocks property should contain 2 handles", __func__); + exit(1); } host_clock_phandles = (uint32_t *)r; guest_clock_phandles[0] = qemu_fdt_alloc_phandle(guest_fdt); diff --git a/hw/arm/virt.c b/hw/arm/virt.c 
index 742f68afca..281ddcdf6e 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -204,13 +204,8 @@ static void create_fdt(VirtMachineState *vms) qemu_fdt_setprop_cell(fdt, "/", "#address-cells", 0x2); qemu_fdt_setprop_cell(fdt, "/", "#size-cells", 0x2); - /* - * /chosen and /memory nodes must exist for load_dtb - * to fill in necessary properties later - */ + /* /chosen must exist for load_dtb to fill in necessary properties later */ qemu_fdt_add_subnode(fdt, "/chosen"); - qemu_fdt_add_subnode(fdt, "/memory"); - qemu_fdt_setprop_string(fdt, "/memory", "device_type", "memory"); /* Clock node, for the benefit of the UART. The kernel device tree * binding documentation claims the PL011 node clock properties are @@ -369,58 +364,72 @@ static void fdt_add_cpu_nodes(const VirtMachineState *vms) static void fdt_add_its_gic_node(VirtMachineState *vms) { + char *nodename; + vms->msi_phandle = qemu_fdt_alloc_phandle(vms->fdt); - qemu_fdt_add_subnode(vms->fdt, "/intc/its"); - qemu_fdt_setprop_string(vms->fdt, "/intc/its", "compatible", + nodename = g_strdup_printf("/intc/its@%" PRIx64, + vms->memmap[VIRT_GIC_ITS].base); + qemu_fdt_add_subnode(vms->fdt, nodename); + qemu_fdt_setprop_string(vms->fdt, nodename, "compatible", "arm,gic-v3-its"); - qemu_fdt_setprop(vms->fdt, "/intc/its", "msi-controller", NULL, 0); - qemu_fdt_setprop_sized_cells(vms->fdt, "/intc/its", "reg", + qemu_fdt_setprop(vms->fdt, nodename, "msi-controller", NULL, 0); + qemu_fdt_setprop_sized_cells(vms->fdt, nodename, "reg", 2, vms->memmap[VIRT_GIC_ITS].base, 2, vms->memmap[VIRT_GIC_ITS].size); - qemu_fdt_setprop_cell(vms->fdt, "/intc/its", "phandle", vms->msi_phandle); + qemu_fdt_setprop_cell(vms->fdt, nodename, "phandle", vms->msi_phandle); + g_free(nodename); } static void fdt_add_v2m_gic_node(VirtMachineState *vms) { + char *nodename; + + nodename = g_strdup_printf("/intc/v2m@%" PRIx64, + vms->memmap[VIRT_GIC_V2M].base); vms->msi_phandle = qemu_fdt_alloc_phandle(vms->fdt); - qemu_fdt_add_subnode(vms->fdt, "/intc/v2m"); - qemu_fdt_setprop_string(vms->fdt, "/intc/v2m", "compatible", + qemu_fdt_add_subnode(vms->fdt, nodename); + qemu_fdt_setprop_string(vms->fdt, nodename, "compatible", "arm,gic-v2m-frame"); - qemu_fdt_setprop(vms->fdt, "/intc/v2m", "msi-controller", NULL, 0); - qemu_fdt_setprop_sized_cells(vms->fdt, "/intc/v2m", "reg", + qemu_fdt_setprop(vms->fdt, nodename, "msi-controller", NULL, 0); + qemu_fdt_setprop_sized_cells(vms->fdt, nodename, "reg", 2, vms->memmap[VIRT_GIC_V2M].base, 2, vms->memmap[VIRT_GIC_V2M].size); - qemu_fdt_setprop_cell(vms->fdt, "/intc/v2m", "phandle", vms->msi_phandle); + qemu_fdt_setprop_cell(vms->fdt, nodename, "phandle", vms->msi_phandle); + g_free(nodename); } static void fdt_add_gic_node(VirtMachineState *vms) { + char *nodename; + vms->gic_phandle = qemu_fdt_alloc_phandle(vms->fdt); qemu_fdt_setprop_cell(vms->fdt, "/", "interrupt-parent", vms->gic_phandle); - qemu_fdt_add_subnode(vms->fdt, "/intc"); - qemu_fdt_setprop_cell(vms->fdt, "/intc", "#interrupt-cells", 3); - qemu_fdt_setprop(vms->fdt, "/intc", "interrupt-controller", NULL, 0); - qemu_fdt_setprop_cell(vms->fdt, "/intc", "#address-cells", 0x2); - qemu_fdt_setprop_cell(vms->fdt, "/intc", "#size-cells", 0x2); - qemu_fdt_setprop(vms->fdt, "/intc", "ranges", NULL, 0); + nodename = g_strdup_printf("/intc@%" PRIx64, + vms->memmap[VIRT_GIC_DIST].base); + qemu_fdt_add_subnode(vms->fdt, nodename); + qemu_fdt_setprop_cell(vms->fdt, nodename, "#interrupt-cells", 3); + qemu_fdt_setprop(vms->fdt, nodename, "interrupt-controller", NULL, 0); + 
qemu_fdt_setprop_cell(vms->fdt, nodename, "#address-cells", 0x2); + qemu_fdt_setprop_cell(vms->fdt, nodename, "#size-cells", 0x2); + qemu_fdt_setprop(vms->fdt, nodename, "ranges", NULL, 0); if (vms->gic_version == 3) { int nb_redist_regions = virt_gicv3_redist_region_count(vms); - qemu_fdt_setprop_string(vms->fdt, "/intc", "compatible", + qemu_fdt_setprop_string(vms->fdt, nodename, "compatible", "arm,gic-v3"); - qemu_fdt_setprop_cell(vms->fdt, "/intc", + qemu_fdt_setprop_cell(vms->fdt, nodename, "#redistributor-regions", nb_redist_regions); if (nb_redist_regions == 1) { - qemu_fdt_setprop_sized_cells(vms->fdt, "/intc", "reg", + qemu_fdt_setprop_sized_cells(vms->fdt, nodename, "reg", 2, vms->memmap[VIRT_GIC_DIST].base, 2, vms->memmap[VIRT_GIC_DIST].size, 2, vms->memmap[VIRT_GIC_REDIST].base, 2, vms->memmap[VIRT_GIC_REDIST].size); } else { - qemu_fdt_setprop_sized_cells(vms->fdt, "/intc", "reg", + qemu_fdt_setprop_sized_cells(vms->fdt, nodename, "reg", 2, vms->memmap[VIRT_GIC_DIST].base, 2, vms->memmap[VIRT_GIC_DIST].size, 2, vms->memmap[VIRT_GIC_REDIST].base, @@ -430,22 +439,23 @@ static void fdt_add_gic_node(VirtMachineState *vms) } if (vms->virt) { - qemu_fdt_setprop_cells(vms->fdt, "/intc", "interrupts", + qemu_fdt_setprop_cells(vms->fdt, nodename, "interrupts", GIC_FDT_IRQ_TYPE_PPI, ARCH_GICV3_MAINT_IRQ, GIC_FDT_IRQ_FLAGS_LEVEL_HI); } } else { /* 'cortex-a15-gic' means 'GIC v2' */ - qemu_fdt_setprop_string(vms->fdt, "/intc", "compatible", + qemu_fdt_setprop_string(vms->fdt, nodename, "compatible", "arm,cortex-a15-gic"); - qemu_fdt_setprop_sized_cells(vms->fdt, "/intc", "reg", + qemu_fdt_setprop_sized_cells(vms->fdt, nodename, "reg", 2, vms->memmap[VIRT_GIC_DIST].base, 2, vms->memmap[VIRT_GIC_DIST].size, 2, vms->memmap[VIRT_GIC_CPU].base, 2, vms->memmap[VIRT_GIC_CPU].size); } - qemu_fdt_setprop_cell(vms->fdt, "/intc", "phandle", vms->gic_phandle); + qemu_fdt_setprop_cell(vms->fdt, nodename, "phandle", vms->gic_phandle); + g_free(nodename); } static void fdt_add_pmu_nodes(const VirtMachineState *vms) diff --git a/hw/block/fdc.c b/hw/block/fdc.c index cd29e27d8f..7c1c57f57f 100644 --- a/hw/block/fdc.c +++ b/hw/block/fdc.c @@ -396,16 +396,9 @@ static int pick_geometry(FDrive *drv) nb_sectors, FloppyDriveType_str(parse->drive)); } + assert(type_match != -1 && "misconfigured fd_format"); match = type_match; } - - /* No match of any kind found -- fd_format is misconfigured, abort. 
*/ - if (match == -1) { - error_setg(&error_abort, "No candidate geometries present in table " - " for floppy drive type '%s'", - FloppyDriveType_str(drv->drive)); - } - parse = &(fd_formats[match]); out: diff --git a/hw/sd/bcm2835_sdhost.c b/hw/sd/bcm2835_sdhost.c index ebf3b926c2..4df4de7d67 100644 --- a/hw/sd/bcm2835_sdhost.c +++ b/hw/sd/bcm2835_sdhost.c @@ -118,8 +118,6 @@ static void bcm2835_sdhost_send_command(BCM2835SDHostState *s) goto error; } if (!(s->cmd & SDCMD_NO_RESPONSE)) { -#define RWORD(n) (((uint32_t)rsp[n] << 24) | (rsp[n + 1] << 16) \ - | (rsp[n + 2] << 8) | rsp[n + 3]) if (rlen == 0 || (rlen == 4 && (s->cmd & SDCMD_LONG_RESPONSE))) { goto error; } @@ -127,15 +125,14 @@ static void bcm2835_sdhost_send_command(BCM2835SDHostState *s) goto error; } if (rlen == 4) { - s->rsp[0] = RWORD(0); + s->rsp[0] = ldl_be_p(&rsp[0]); s->rsp[1] = s->rsp[2] = s->rsp[3] = 0; } else { - s->rsp[0] = RWORD(12); - s->rsp[1] = RWORD(8); - s->rsp[2] = RWORD(4); - s->rsp[3] = RWORD(0); + s->rsp[0] = ldl_be_p(&rsp[12]); + s->rsp[1] = ldl_be_p(&rsp[8]); + s->rsp[2] = ldl_be_p(&rsp[4]); + s->rsp[3] = ldl_be_p(&rsp[0]); } -#undef RWORD } /* We never really delay commands, so if this was a 'busywait' command * then we've completed it now and can raise the interrupt. diff --git a/hw/sd/core.c b/hw/sd/core.c index 820345f704..107e6d71dd 100644 --- a/hw/sd/core.c +++ b/hw/sd/core.c @@ -91,7 +91,7 @@ int sdbus_do_command(SDBus *sdbus, SDRequest *req, uint8_t *response) { SDState *card = get_card(sdbus); - trace_sdbus_command(sdbus_name(sdbus), req->cmd, req->arg, req->crc); + trace_sdbus_command(sdbus_name(sdbus), req->cmd, req->arg); if (card) { SDCardClass *sc = SD_CARD_GET_CLASS(card); diff --git a/hw/sd/milkymist-memcard.c b/hw/sd/milkymist-memcard.c index fcbccf54ea..df42aa1c54 100644 --- a/hw/sd/milkymist-memcard.c +++ b/hw/sd/milkymist-memcard.c @@ -100,8 +100,7 @@ static void memcard_sd_command(MilkymistMemcardState *s) SDRequest req; req.cmd = s->command[0] & 0x3f; - req.arg = (s->command[1] << 24) | (s->command[2] << 16) - | (s->command[3] << 8) | s->command[4]; + req.arg = ldl_be_p(s->command + 1); req.crc = s->command[5]; s->response[0] = req.cmd; diff --git a/hw/sd/omap_mmc.c b/hw/sd/omap_mmc.c index aa2a816f76..671264b650 100644 --- a/hw/sd/omap_mmc.c +++ b/hw/sd/omap_mmc.c @@ -163,8 +163,7 @@ static void omap_mmc_command(struct omap_mmc_s *host, int cmd, int dir, CID_CSD_OVERWRITE; if (host->sdio & (1 << 13)) mask |= AKE_SEQ_ERROR; - rspstatus = (response[0] << 24) | (response[1] << 16) | - (response[2] << 8) | (response[3] << 0); + rspstatus = ldl_be_p(response); break; case sd_r2: @@ -182,8 +181,7 @@ static void omap_mmc_command(struct omap_mmc_s *host, int cmd, int dir, } rsplen = 4; - rspstatus = (response[0] << 24) | (response[1] << 16) | - (response[2] << 8) | (response[3] << 0); + rspstatus = ldl_be_p(response); if (rspstatus & 0x80000000) host->status &= 0xe000; else diff --git a/hw/sd/pl181.c b/hw/sd/pl181.c index 1cc94dbfdf..3ad7e925c5 100644 --- a/hw/sd/pl181.c +++ b/hw/sd/pl181.c @@ -182,23 +182,20 @@ static void pl181_send_command(PL181State *s) if (rlen < 0) goto error; if (s->cmd & PL181_CMD_RESPONSE) { -#define RWORD(n) (((uint32_t)response[n] << 24) | (response[n + 1] << 16) \ - | (response[n + 2] << 8) | response[n + 3]) if (rlen == 0 || (rlen == 4 && (s->cmd & PL181_CMD_LONGRESP))) goto error; if (rlen != 4 && rlen != 16) goto error; - s->response[0] = RWORD(0); + s->response[0] = ldl_be_p(&response[0]); if (rlen == 4) { s->response[1] = s->response[2] = s->response[3] = 
0; } else { - s->response[1] = RWORD(4); - s->response[2] = RWORD(8); - s->response[3] = RWORD(12) & ~1; + s->response[1] = ldl_be_p(&response[4]); + s->response[2] = ldl_be_p(&response[8]); + s->response[3] = ldl_be_p(&response[12]) & ~1; } DPRINTF("Response received\n"); s->status |= PL181_STATUS_CMDRESPEND; -#undef RWORD } else { DPRINTF("Command sent\n"); s->status |= PL181_STATUS_CMDSENT; diff --git a/hw/sd/sdhci.c b/hw/sd/sdhci.c index 3017e5a95a..321d02d75a 100644 --- a/hw/sd/sdhci.c +++ b/hw/sd/sdhci.c @@ -342,17 +342,13 @@ static void sdhci_send_command(SDHCIState *s) if (s->cmdreg & SDHC_CMD_RESPONSE) { if (rlen == 4) { - s->rspreg[0] = (response[0] << 24) | (response[1] << 16) | - (response[2] << 8) | response[3]; + s->rspreg[0] = ldl_be_p(response); s->rspreg[1] = s->rspreg[2] = s->rspreg[3] = 0; trace_sdhci_response4(s->rspreg[0]); } else if (rlen == 16) { - s->rspreg[0] = (response[11] << 24) | (response[12] << 16) | - (response[13] << 8) | response[14]; - s->rspreg[1] = (response[7] << 24) | (response[8] << 16) | - (response[9] << 8) | response[10]; - s->rspreg[2] = (response[3] << 24) | (response[4] << 16) | - (response[5] << 8) | response[6]; + s->rspreg[0] = ldl_be_p(&response[11]); + s->rspreg[1] = ldl_be_p(&response[7]); + s->rspreg[2] = ldl_be_p(&response[3]); s->rspreg[3] = (response[0] << 16) | (response[1] << 8) | response[2]; trace_sdhci_response16(s->rspreg[3], s->rspreg[2], @@ -396,8 +392,7 @@ static void sdhci_end_transfer(SDHCIState *s) trace_sdhci_end_transfer(request.cmd, request.arg); sdbus_do_command(&s->sdbus, &request, response); /* Auto CMD12 response goes to the upper Response register */ - s->rspreg[3] = (response[0] << 24) | (response[1] << 16) | - (response[2] << 8) | response[3]; + s->rspreg[3] = ldl_be_p(response); } s->prnsts &= ~(SDHC_DOING_READ | SDHC_DOING_WRITE | diff --git a/hw/sd/ssi-sd.c b/hw/sd/ssi-sd.c index 96542ecd62..95a143bfba 100644 --- a/hw/sd/ssi-sd.c +++ b/hw/sd/ssi-sd.c @@ -96,8 +96,7 @@ static uint32_t ssi_sd_transfer(SSISlave *dev, uint32_t val) uint8_t longresp[16]; /* FIXME: Check CRC. */ request.cmd = s->cmd; - request.arg = (s->cmdarg[0] << 24) | (s->cmdarg[1] << 16) - | (s->cmdarg[2] << 8) | s->cmdarg[3]; + request.arg = ldl_be_p(s->cmdarg); DPRINTF("CMD%d arg 0x%08x\n", s->cmd, request.arg); s->arglen = sdbus_do_command(&s->sdbus, &request, longresp); if (s->arglen <= 0) { @@ -122,8 +121,7 @@ static uint32_t ssi_sd_transfer(SSISlave *dev, uint32_t val) /* CMD13 returns a 2-byte statuse work. Other commands only return the first byte. */ s->arglen = (s->cmd == 13) ? 
2 : 1; - cardstatus = (longresp[0] << 24) | (longresp[1] << 16) - | (longresp[2] << 8) | longresp[3]; + cardstatus = ldl_be_p(longresp); status = 0; if (((cardstatus >> 9) & 0xf) < 4) status |= SSI_SDR_IDLE; diff --git a/hw/sd/trace-events b/hw/sd/trace-events index bfd1d62efc..43cffab8b1 100644 --- a/hw/sd/trace-events +++ b/hw/sd/trace-events @@ -7,7 +7,7 @@ bcm2835_sdhost_edm_change(const char *why, uint32_t edm) "(%s) EDM now 0x%x" bcm2835_sdhost_update_irq(uint32_t irq) "IRQ bits 0x%x\n" # hw/sd/core.c -sdbus_command(const char *bus_name, uint8_t cmd, uint32_t arg, uint8_t crc) "@%s CMD%02d arg 0x%08x crc 0x%02x" +sdbus_command(const char *bus_name, uint8_t cmd, uint32_t arg) "@%s CMD%02d arg 0x%08x" sdbus_read(const char *bus_name, uint8_t value) "@%s value 0x%02x" sdbus_write(const char *bus_name, uint8_t value) "@%s value 0x%02x" sdbus_set_voltage(const char *bus_name, uint16_t millivolts) "@%s %u (mV)" diff --git a/include/sysemu/device_tree.h b/include/sysemu/device_tree.h index e22e5bec9c..c16fd69bc0 100644 --- a/include/sysemu/device_tree.h +++ b/include/sysemu/device_tree.h @@ -43,6 +43,22 @@ void *load_device_tree_from_sysfs(void); char **qemu_fdt_node_path(void *fdt, const char *name, char *compat, Error **errp); +/** + * qemu_fdt_node_unit_path: return the paths of nodes matching a given + * node-name, ie. node-name and node-name@unit-address + * @fdt: pointer to the dt blob + * @name: node name + * @errp: handle to an error object + * + * returns a newly allocated NULL-terminated array of node paths. + * Use g_strfreev() to free it. If one or more nodes were found, the + * array contains the path of each node and the last element equals to + * NULL. If there is no error but no matching node was found, the + * returned array contains a single element equal to NULL. 
If an error + * was encountered when parsing the blob, the function returns NULL + */ +char **qemu_fdt_node_unit_path(void *fdt, const char *name, Error **errp); + int qemu_fdt_setprop(void *fdt, const char *node_path, const char *property, const void *val, int size); int qemu_fdt_setprop_cell(void *fdt, const char *node_path, diff --git a/linux-user/elfload.c b/linux-user/elfload.c index 13bc78d0c8..942a1b661f 100644 --- a/linux-user/elfload.c +++ b/linux-user/elfload.c @@ -583,7 +583,9 @@ static uint32_t get_elf_hwcap(void) ARM_HWCAP_A64_FPHP | ARM_HWCAP_A64_ASIMDHP); GET_FEATURE(ARM_FEATURE_V8_ATOMICS, ARM_HWCAP_A64_ATOMICS); GET_FEATURE(ARM_FEATURE_V8_RDM, ARM_HWCAP_A64_ASIMDRDM); + GET_FEATURE(ARM_FEATURE_V8_DOTPROD, ARM_HWCAP_A64_ASIMDDP); GET_FEATURE(ARM_FEATURE_V8_FCMA, ARM_HWCAP_A64_FCMA); + GET_FEATURE(ARM_FEATURE_SVE, ARM_HWCAP_A64_SVE); #undef GET_FEATURE return hwcaps; diff --git a/target/arm/cpu.c b/target/arm/cpu.c index 2ae4fffafb..82ff450f9a 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -164,6 +164,13 @@ static void arm_cpu_reset(CPUState *s) env->cp15.sctlr_el[1] |= SCTLR_UCT | SCTLR_UCI | SCTLR_DZE; /* and to the FP/Neon instructions */ env->cp15.cpacr_el1 = deposit64(env->cp15.cpacr_el1, 20, 2, 3); + /* and to the SVE instructions */ + env->cp15.cpacr_el1 = deposit64(env->cp15.cpacr_el1, 16, 2, 3); + env->cp15.cptr_el[3] |= CPTR_EZ; + /* with maximum vector length */ + env->vfp.zcr_el[1] = ARM_MAX_VQ - 1; + env->vfp.zcr_el[2] = ARM_MAX_VQ - 1; + env->vfp.zcr_el[3] = ARM_MAX_VQ - 1; #else /* Reset into the highest available EL */ if (arm_feature(env, ARM_FEATURE_EL3)) { @@ -793,9 +800,20 @@ static void arm_cpu_realizefn(DeviceState *dev, Error **errp) /* Some features automatically imply others: */ if (arm_feature(env, ARM_FEATURE_V8)) { - set_feature(env, ARM_FEATURE_V7); + set_feature(env, ARM_FEATURE_V7VE); + } + if (arm_feature(env, ARM_FEATURE_V7VE)) { + /* v7 Virtualization Extensions. In real hardware this implies + * EL2 and also the presence of the Security Extensions. + * For QEMU, for backwards-compatibility we implement some + * CPUs or CPU configs which have no actual EL2 or EL3 but do + * include the various other features that V7VE implies. + * Presence of EL2 itself is ARM_FEATURE_EL2, and of the + * Security Extensions is ARM_FEATURE_EL3. 
+ */ set_feature(env, ARM_FEATURE_ARM_DIV); set_feature(env, ARM_FEATURE_LPAE); + set_feature(env, ARM_FEATURE_V7); } if (arm_feature(env, ARM_FEATURE_V7)) { set_feature(env, ARM_FEATURE_VAPA); @@ -1255,6 +1273,7 @@ static void cortex_m3_initfn(Object *obj) cpu->id_isar3 = 0x01111110; cpu->id_isar4 = 0x01310102; cpu->id_isar5 = 0x00000000; + cpu->id_isar6 = 0x00000000; } static void cortex_m4_initfn(Object *obj) @@ -1281,6 +1300,7 @@ static void cortex_m4_initfn(Object *obj) cpu->id_isar3 = 0x01111110; cpu->id_isar4 = 0x01310102; cpu->id_isar5 = 0x00000000; + cpu->id_isar6 = 0x00000000; } static void cortex_m33_initfn(Object *obj) @@ -1309,6 +1329,7 @@ static void cortex_m33_initfn(Object *obj) cpu->id_isar3 = 0x01111131; cpu->id_isar4 = 0x01310132; cpu->id_isar5 = 0x00000000; + cpu->id_isar6 = 0x00000000; cpu->clidr = 0x00000000; cpu->ctr = 0x8000c000; } @@ -1359,6 +1380,7 @@ static void cortex_r5_initfn(Object *obj) cpu->id_isar3 = 0x01112131; cpu->id_isar4 = 0x0010142; cpu->id_isar5 = 0x0; + cpu->id_isar6 = 0x0; cpu->mp_is_up = true; cpu->pmsav7_dregion = 16; define_arm_cp_regs(cpu, cortexr5_cp_reginfo); @@ -1517,15 +1539,13 @@ static void cortex_a7_initfn(Object *obj) ARMCPU *cpu = ARM_CPU(obj); cpu->dtb_compatible = "arm,cortex-a7"; - set_feature(&cpu->env, ARM_FEATURE_V7); + set_feature(&cpu->env, ARM_FEATURE_V7VE); set_feature(&cpu->env, ARM_FEATURE_VFP4); set_feature(&cpu->env, ARM_FEATURE_NEON); set_feature(&cpu->env, ARM_FEATURE_THUMB2EE); - set_feature(&cpu->env, ARM_FEATURE_ARM_DIV); set_feature(&cpu->env, ARM_FEATURE_GENERIC_TIMER); set_feature(&cpu->env, ARM_FEATURE_DUMMY_C15_REGS); set_feature(&cpu->env, ARM_FEATURE_CBAR_RO); - set_feature(&cpu->env, ARM_FEATURE_LPAE); set_feature(&cpu->env, ARM_FEATURE_EL3); cpu->kvm_target = QEMU_KVM_ARM_TARGET_CORTEX_A7; cpu->midr = 0x410fc075; @@ -1562,15 +1582,13 @@ static void cortex_a15_initfn(Object *obj) ARMCPU *cpu = ARM_CPU(obj); cpu->dtb_compatible = "arm,cortex-a15"; - set_feature(&cpu->env, ARM_FEATURE_V7); + set_feature(&cpu->env, ARM_FEATURE_V7VE); set_feature(&cpu->env, ARM_FEATURE_VFP4); set_feature(&cpu->env, ARM_FEATURE_NEON); set_feature(&cpu->env, ARM_FEATURE_THUMB2EE); - set_feature(&cpu->env, ARM_FEATURE_ARM_DIV); set_feature(&cpu->env, ARM_FEATURE_GENERIC_TIMER); set_feature(&cpu->env, ARM_FEATURE_DUMMY_C15_REGS); set_feature(&cpu->env, ARM_FEATURE_CBAR_RO); - set_feature(&cpu->env, ARM_FEATURE_LPAE); set_feature(&cpu->env, ARM_FEATURE_EL3); cpu->kvm_target = QEMU_KVM_ARM_TARGET_CORTEX_A15; cpu->midr = 0x412fc0f1; @@ -1789,15 +1807,13 @@ static void arm_max_initfn(Object *obj) * since we don't correctly set the ID registers to advertise them, */ set_feature(&cpu->env, ARM_FEATURE_V8); - set_feature(&cpu->env, ARM_FEATURE_VFP4); - set_feature(&cpu->env, ARM_FEATURE_NEON); - set_feature(&cpu->env, ARM_FEATURE_THUMB2EE); set_feature(&cpu->env, ARM_FEATURE_V8_AES); set_feature(&cpu->env, ARM_FEATURE_V8_SHA1); set_feature(&cpu->env, ARM_FEATURE_V8_SHA256); set_feature(&cpu->env, ARM_FEATURE_V8_PMULL); set_feature(&cpu->env, ARM_FEATURE_CRC); set_feature(&cpu->env, ARM_FEATURE_V8_RDM); + set_feature(&cpu->env, ARM_FEATURE_V8_DOTPROD); set_feature(&cpu->env, ARM_FEATURE_V8_FCMA); #endif } diff --git a/target/arm/cpu.h b/target/arm/cpu.h index a4507a2d6f..e310ffc29d 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -813,6 +813,7 @@ struct ARMCPU { uint32_t id_isar3; uint32_t id_isar4; uint32_t id_isar5; + uint32_t id_isar6; uint64_t id_aa64pfr0; uint64_t id_aa64pfr1; uint64_t id_aa64dfr0; @@ -1442,6 +1443,7 @@ enum 
arm_features { ARM_FEATURE_OMAPCP, /* OMAP specific CP15 ops handling. */ ARM_FEATURE_THUMB2EE, ARM_FEATURE_V7MP, /* v7 Multiprocessing Extensions */ + ARM_FEATURE_V7VE, /* v7 Virtualization Extensions (non-EL2 parts) */ ARM_FEATURE_V4T, ARM_FEATURE_V5, ARM_FEATURE_STRONGARM, @@ -1480,6 +1482,7 @@ enum arm_features { ARM_FEATURE_V8_SM4, /* implements SM4 part of v8 Crypto Extensions */ ARM_FEATURE_V8_ATOMICS, /* ARMv8.1-Atomics feature */ ARM_FEATURE_V8_RDM, /* implements v8.1 simd round multiply */ + ARM_FEATURE_V8_DOTPROD, /* implements v8.2 simd dot product */ ARM_FEATURE_V8_FP16, /* implements v8.2 half-precision float */ ARM_FEATURE_V8_FCMA, /* has complex number part of v8.3 extensions. */ ARM_FEATURE_M_MAIN, /* M profile Main Extension */ diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c index c50dcd4077..d0581d59d8 100644 --- a/target/arm/cpu64.c +++ b/target/arm/cpu64.c @@ -139,6 +139,7 @@ static void aarch64_a57_initfn(Object *obj) cpu->id_isar3 = 0x01112131; cpu->id_isar4 = 0x00011142; cpu->id_isar5 = 0x00011121; + cpu->id_isar6 = 0; cpu->id_aa64pfr0 = 0x00002222; cpu->id_aa64dfr0 = 0x10305106; cpu->pmceid0 = 0x00000000; @@ -199,6 +200,7 @@ static void aarch64_a53_initfn(Object *obj) cpu->id_isar3 = 0x01112131; cpu->id_isar4 = 0x00011142; cpu->id_isar5 = 0x00011121; + cpu->id_isar6 = 0; cpu->id_aa64pfr0 = 0x00002222; cpu->id_aa64dfr0 = 0x10305106; cpu->id_aa64isar0 = 0x00011120; @@ -235,23 +237,16 @@ static void aarch64_max_initfn(Object *obj) * whereas the architecture requires them to be present in both if * present in either. */ - set_feature(&cpu->env, ARM_FEATURE_V8); - set_feature(&cpu->env, ARM_FEATURE_VFP4); - set_feature(&cpu->env, ARM_FEATURE_NEON); - set_feature(&cpu->env, ARM_FEATURE_AARCH64); - set_feature(&cpu->env, ARM_FEATURE_V8_AES); - set_feature(&cpu->env, ARM_FEATURE_V8_SHA1); - set_feature(&cpu->env, ARM_FEATURE_V8_SHA256); set_feature(&cpu->env, ARM_FEATURE_V8_SHA512); set_feature(&cpu->env, ARM_FEATURE_V8_SHA3); set_feature(&cpu->env, ARM_FEATURE_V8_SM3); set_feature(&cpu->env, ARM_FEATURE_V8_SM4); - set_feature(&cpu->env, ARM_FEATURE_V8_PMULL); - set_feature(&cpu->env, ARM_FEATURE_CRC); set_feature(&cpu->env, ARM_FEATURE_V8_ATOMICS); set_feature(&cpu->env, ARM_FEATURE_V8_RDM); + set_feature(&cpu->env, ARM_FEATURE_V8_DOTPROD); set_feature(&cpu->env, ARM_FEATURE_V8_FP16); set_feature(&cpu->env, ARM_FEATURE_V8_FCMA); + set_feature(&cpu->env, ARM_FEATURE_SVE); /* For usermode -cpu max we can use a larger and more efficient DCZ * blocksize since we don't have to follow what the hardware does. 
*/ diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h index 2e76084992..023952a9a4 100644 --- a/target/arm/helper-sve.h +++ b/target/arm/helper-sve.h @@ -274,6 +274,11 @@ DEF_HELPER_FLAGS_3(sve_clr_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32) DEF_HELPER_FLAGS_3(sve_clr_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32) DEF_HELPER_FLAGS_3(sve_clr_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_movz_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_movz_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_movz_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_movz_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + DEF_HELPER_FLAGS_4(sve_asr_zpzi_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(sve_asr_zpzi_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(sve_asr_zpzi_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) @@ -719,3 +724,680 @@ DEF_HELPER_FLAGS_5(gvec_rsqrts_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_5(gvec_rsqrts_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_faddv_h, TCG_CALL_NO_RWG, + i64, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_faddv_s, TCG_CALL_NO_RWG, + i64, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_faddv_d, TCG_CALL_NO_RWG, + i64, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_fmaxnmv_h, TCG_CALL_NO_RWG, + i64, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_fmaxnmv_s, TCG_CALL_NO_RWG, + i64, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_fmaxnmv_d, TCG_CALL_NO_RWG, + i64, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_fminnmv_h, TCG_CALL_NO_RWG, + i64, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_fminnmv_s, TCG_CALL_NO_RWG, + i64, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_fminnmv_d, TCG_CALL_NO_RWG, + i64, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_fmaxv_h, TCG_CALL_NO_RWG, + i64, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_fmaxv_s, TCG_CALL_NO_RWG, + i64, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_fmaxv_d, TCG_CALL_NO_RWG, + i64, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_fminv_h, TCG_CALL_NO_RWG, + i64, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_fminv_s, TCG_CALL_NO_RWG, + i64, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(sve_fminv_d, TCG_CALL_NO_RWG, + i64, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_fadda_h, TCG_CALL_NO_RWG, + i64, i64, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fadda_s, TCG_CALL_NO_RWG, + i64, i64, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fadda_d, TCG_CALL_NO_RWG, + i64, i64, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_fcmge0_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcmge0_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcmge0_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_fcmgt0_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcmgt0_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcmgt0_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_fcmlt0_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcmlt0_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcmlt0_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_fcmle0_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcmle0_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcmle0_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + 
+DEF_HELPER_FLAGS_5(sve_fcmeq0_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcmeq0_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcmeq0_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_fcmne0_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcmne0_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcmne0_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fadd_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fadd_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fadd_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fsub_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fsub_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fsub_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fmul_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fmul_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fmul_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fdiv_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fdiv_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fdiv_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fmin_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fmin_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fmin_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fmax_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fmax_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fmax_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fminnum_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fminnum_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fminnum_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fmaxnum_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fmaxnum_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fmaxnum_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fabd_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fabd_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fabd_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fscalbn_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fscalbn_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fscalbn_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fmulx_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fmulx_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fmulx_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fadds_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i64, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fadds_s, 
TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i64, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fadds_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i64, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fsubs_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i64, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fsubs_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i64, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fsubs_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i64, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fmuls_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i64, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fmuls_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i64, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fmuls_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i64, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fsubrs_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i64, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fsubrs_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i64, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fsubrs_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i64, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fmaxnms_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i64, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fmaxnms_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i64, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fmaxnms_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i64, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fminnms_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i64, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fminnms_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i64, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fminnms_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i64, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fmaxs_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i64, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fmaxs_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i64, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fmaxs_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i64, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fmins_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i64, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fmins_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i64, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fmins_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, i64, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_fcvt_sh, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcvt_dh, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcvt_hs, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcvt_ds, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcvt_hd, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcvt_sd, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_fcvtzs_hh, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcvtzs_hs, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcvtzs_ss, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcvtzs_ds, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcvtzs_hd, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcvtzs_sd, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcvtzs_dd, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_fcvtzu_hh, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcvtzu_hs, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcvtzu_ss, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcvtzu_ds, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcvtzu_hd, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) 
+DEF_HELPER_FLAGS_5(sve_fcvtzu_sd, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fcvtzu_dd, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_frint_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_frint_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_frint_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_frintx_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_frintx_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_frintx_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_frecpx_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_frecpx_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_frecpx_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_fsqrt_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fsqrt_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_fsqrt_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_scvt_hh, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_scvt_sh, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_scvt_dh, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_scvt_ss, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_scvt_sd, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_scvt_ds, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_scvt_dd, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_ucvt_hh, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_ucvt_sh, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_ucvt_dh, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_ucvt_ss, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_ucvt_sd, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_ucvt_ds, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_ucvt_dd, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fcmge_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fcmge_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fcmge_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fcmgt_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fcmgt_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fcmgt_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fcmeq_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fcmeq_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fcmeq_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fcmne_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fcmne_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fcmne_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fcmuo_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fcmuo_s, TCG_CALL_NO_RWG, + 
void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fcmuo_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_facge_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_facge_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_facge_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_facgt_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_facgt_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_facgt_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_6(sve_fcadd_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fcadd_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(sve_fcadd_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(sve_fmla_zpzzz_h, TCG_CALL_NO_RWG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(sve_fmla_zpzzz_s, TCG_CALL_NO_RWG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(sve_fmla_zpzzz_d, TCG_CALL_NO_RWG, void, env, ptr, i32) + +DEF_HELPER_FLAGS_3(sve_fmls_zpzzz_h, TCG_CALL_NO_RWG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(sve_fmls_zpzzz_s, TCG_CALL_NO_RWG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(sve_fmls_zpzzz_d, TCG_CALL_NO_RWG, void, env, ptr, i32) + +DEF_HELPER_FLAGS_3(sve_fnmla_zpzzz_h, TCG_CALL_NO_RWG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(sve_fnmla_zpzzz_s, TCG_CALL_NO_RWG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(sve_fnmla_zpzzz_d, TCG_CALL_NO_RWG, void, env, ptr, i32) + +DEF_HELPER_FLAGS_3(sve_fnmls_zpzzz_h, TCG_CALL_NO_RWG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(sve_fnmls_zpzzz_s, TCG_CALL_NO_RWG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(sve_fnmls_zpzzz_d, TCG_CALL_NO_RWG, void, env, ptr, i32) + +DEF_HELPER_FLAGS_3(sve_fcmla_zpzzz_h, TCG_CALL_NO_RWG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(sve_fcmla_zpzzz_s, TCG_CALL_NO_RWG, void, env, ptr, i32) +DEF_HELPER_FLAGS_3(sve_fcmla_zpzzz_d, TCG_CALL_NO_RWG, void, env, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_ftmad_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_ftmad_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_ftmad_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(sve_ld1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld2bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld3bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld4bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ld1hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld2hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld3hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld4hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ld1ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld2ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld3ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld4ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ld1dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld2dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld3dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld4dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ld1bhu_r, 
TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld1bdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld1bhs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld1bss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld1bds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ld1hsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld1hdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld1hss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld1hds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ld1sdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld1sds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldff1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1bdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1bhs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1bss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1bds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldff1hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1hsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1hdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1hss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1hds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldff1ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1sdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1sds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldff1dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldnf1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1bdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1bhs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1bss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1bds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldnf1hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1hsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1hdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1hss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1hds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldnf1ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1sdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1sds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldnf1dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_st1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st2bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st3bb_r, TCG_CALL_NO_WG, void, env, 
ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st4bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_st1hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st2hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st3hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st4hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_st1ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st2ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st3ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st4ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_st1dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st2dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st3dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st4dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_st1bh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st1bs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st1bd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_st1hs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st1hd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_st1sd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_ldbsu_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhsu_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldssu_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldbss_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhss_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_ldbsu_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhsu_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldssu_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldbss_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhss_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_ldbdu_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhdu_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldsdu_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldddu_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldbds_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhds_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldsds_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_ldbdu_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhdu_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldsdu_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldddu_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldbds_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhds_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldsds_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + 
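Editorial gloss, not part of the patch: the load-helper names in this header appear to follow a fixed scheme, inferred here from the decode patterns later in the series; the comment below is an orientation aid only and its decomposition is an assumption, not upstream documentation.

/* Assumed naming scheme for the SVE load helpers declared here:
 *   sve_ld<m><e><x>_<off>
 *     <m>   = memory element size loaded (b/h/s/d)
 *     <e>   = vector element size it is widened to
 *     <x>   = u/s for zero-/sign-extension into the element
 *             (same-size forms such as ld1bb/ld1hh carry no extension letter)
 *     <off> = _r            scalar-register offset (contiguous forms)
 *             _zsu / _zss   32-bit vector offsets, zero-/sign-extended
 *             _zd           64-bit vector offsets
 * ldff*/ldnf* are the first-fault and non-fault variants of the same forms.
 */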
+DEF_HELPER_FLAGS_6(sve_ldbdu_zd, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhdu_zd, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldsdu_zd, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldddu_zd, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldbds_zd, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhds_zd, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldsds_zd, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_ldffbsu_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhsu_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffssu_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffbss_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhss_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_ldffbsu_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhsu_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffssu_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffbss_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhss_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_ldffbdu_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhdu_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffsdu_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffddu_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffbds_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhds_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffsds_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_ldffbdu_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhdu_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffsdu_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffddu_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffbds_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhds_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffsds_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_ldffbdu_zd, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhdu_zd, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffsdu_zd, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffddu_zd, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffbds_zd, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhds_zd, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffsds_zd, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_stbs_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_sths_zsu, TCG_CALL_NO_WG, + void, env, 
ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stss_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_stbs_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_sths_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stss_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_stbd_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_sthd_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stsd_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stdd_zsu, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_stbd_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_sthd_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stsd_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stdd_zss, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_stbd_zd, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_sthd_zd, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stsd_zd, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stdd_zd, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) diff --git a/target/arm/helper.c b/target/arm/helper.c index 3c6a4c565b..a2ac96084e 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -1404,7 +1404,7 @@ static const ARMCPRegInfo v7_cp_reginfo[] = { .writefn = pmuserenr_write, .raw_writefn = raw_write }, { .name = "PMINTENSET", .cp = 15, .crn = 9, .crm = 14, .opc1 = 0, .opc2 = 1, .access = PL1_RW, .accessfn = access_tpm, - .type = ARM_CP_ALIAS, + .type = ARM_CP_ALIAS | ARM_CP_IO, .fieldoffset = offsetoflow32(CPUARMState, cp15.c9_pminten), .resetvalue = 0, .writefn = pmintenset_write, .raw_writefn = raw_write }, @@ -2167,11 +2167,32 @@ static const ARMCPRegInfo generic_timer_cp_reginfo[] = { }; #else -/* In user-mode none of the generic timer registers are accessible, - * and their implementation depends on QEMU_CLOCK_VIRTUAL and qdev gpio outputs, - * so instead just don't register any of them. + +/* In user-mode most of the generic timer registers are inaccessible + * however modern kernels (4.12+) allow access to cntvct_el0 */ + +static uint64_t gt_virt_cnt_read(CPUARMState *env, const ARMCPRegInfo *ri) +{ + /* Currently we have no support for QEMUTimer in linux-user so we + * can't call gt_get_countervalue(env), instead we directly + * call the lower level functions. 
+ */ + return cpu_get_clock() / GTIMER_SCALE; +} + static const ARMCPRegInfo generic_timer_cp_reginfo[] = { + { .name = "CNTFRQ_EL0", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 3, .crn = 14, .crm = 0, .opc2 = 0, + .type = ARM_CP_CONST, .access = PL0_R /* no PL1_RW in linux-user */, + .fieldoffset = offsetof(CPUARMState, cp15.c14_cntfrq), + .resetvalue = NANOSECONDS_PER_SECOND / GTIMER_SCALE, + }, + { .name = "CNTVCT_EL0", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 3, .crn = 14, .crm = 0, .opc2 = 2, + .access = PL0_R, .type = ARM_CP_NO_RAW | ARM_CP_IO, + .readfn = gt_virt_cnt_read, + }, REGINFO_SENTINEL }; @@ -4393,7 +4414,7 @@ static void zcr_write(CPUARMState *env, const ARMCPRegInfo *ri, static const ARMCPRegInfo zcr_el1_reginfo = { .name = "ZCR_EL1", .state = ARM_CP_STATE_AA64, .opc0 = 3, .opc1 = 0, .crn = 1, .crm = 2, .opc2 = 0, - .access = PL1_RW, .type = ARM_CP_SVE | ARM_CP_FPU, + .access = PL1_RW, .type = ARM_CP_SVE, .fieldoffset = offsetof(CPUARMState, vfp.zcr_el[1]), .writefn = zcr_write, .raw_writefn = raw_write }; @@ -4401,7 +4422,7 @@ static const ARMCPRegInfo zcr_el1_reginfo = { static const ARMCPRegInfo zcr_el2_reginfo = { .name = "ZCR_EL2", .state = ARM_CP_STATE_AA64, .opc0 = 3, .opc1 = 4, .crn = 1, .crm = 2, .opc2 = 0, - .access = PL2_RW, .type = ARM_CP_SVE | ARM_CP_FPU, + .access = PL2_RW, .type = ARM_CP_SVE, .fieldoffset = offsetof(CPUARMState, vfp.zcr_el[2]), .writefn = zcr_write, .raw_writefn = raw_write }; @@ -4409,14 +4430,14 @@ static const ARMCPRegInfo zcr_el2_reginfo = { static const ARMCPRegInfo zcr_no_el2_reginfo = { .name = "ZCR_EL2", .state = ARM_CP_STATE_AA64, .opc0 = 3, .opc1 = 4, .crn = 1, .crm = 2, .opc2 = 0, - .access = PL2_RW, .type = ARM_CP_SVE | ARM_CP_FPU, + .access = PL2_RW, .type = ARM_CP_SVE, .readfn = arm_cp_read_zero, .writefn = arm_cp_write_ignore }; static const ARMCPRegInfo zcr_el3_reginfo = { .name = "ZCR_EL3", .state = ARM_CP_STATE_AA64, .opc0 = 3, .opc1 = 6, .crn = 1, .crm = 2, .opc2 = 0, - .access = PL3_RW, .type = ARM_CP_SVE | ARM_CP_FPU, + .access = PL3_RW, .type = ARM_CP_SVE, .fieldoffset = offsetof(CPUARMState, vfp.zcr_el[3]), .writefn = zcr_write, .raw_writefn = raw_write }; @@ -4851,11 +4872,10 @@ void register_cp_regs_for_features(ARMCPU *cpu) .opc0 = 3, .opc1 = 0, .crn = 0, .crm = 2, .opc2 = 6, .access = PL1_R, .type = ARM_CP_CONST, .resetvalue = cpu->id_mmfr4 }, - /* 7 is as yet unallocated and must RAZ */ - { .name = "ID_ISAR7_RESERVED", .state = ARM_CP_STATE_BOTH, + { .name = "ID_ISAR6", .state = ARM_CP_STATE_BOTH, .opc0 = 3, .opc1 = 0, .crn = 0, .crm = 2, .opc2 = 7, .access = PL1_R, .type = ARM_CP_CONST, - .resetvalue = 0 }, + .resetvalue = cpu->id_isar6 }, REGINFO_SENTINEL }; define_arm_cp_regs(cpu, v6_idregs); @@ -11407,7 +11427,7 @@ ftype HELPER(name)(uint32_t x, void *fpstp) \ } #define CONV_FTOI(name, ftype, fsz, sign, round) \ -uint32_t HELPER(name)(ftype x, void *fpstp) \ +sign##int32_t HELPER(name)(ftype x, void *fpstp) \ { \ float_status *fpst = fpstp; \ if (float##fsz##_is_any_nan(x)) { \ diff --git a/target/arm/helper.h b/target/arm/helper.h index 879a7229e9..59e8c3bd1b 100644 --- a/target/arm/helper.h +++ b/target/arm/helper.h @@ -134,12 +134,12 @@ DEF_HELPER_2(vfp_touid, i32, f64, ptr) DEF_HELPER_2(vfp_touizh, i32, f16, ptr) DEF_HELPER_2(vfp_touizs, i32, f32, ptr) DEF_HELPER_2(vfp_touizd, i32, f64, ptr) -DEF_HELPER_2(vfp_tosih, i32, f16, ptr) -DEF_HELPER_2(vfp_tosis, i32, f32, ptr) -DEF_HELPER_2(vfp_tosid, i32, f64, ptr) -DEF_HELPER_2(vfp_tosizh, i32, f16, ptr) -DEF_HELPER_2(vfp_tosizs, i32, f32, ptr) 
-DEF_HELPER_2(vfp_tosizd, i32, f64, ptr) +DEF_HELPER_2(vfp_tosih, s32, f16, ptr) +DEF_HELPER_2(vfp_tosis, s32, f32, ptr) +DEF_HELPER_2(vfp_tosid, s32, f64, ptr) +DEF_HELPER_2(vfp_tosizh, s32, f16, ptr) +DEF_HELPER_2(vfp_tosizs, s32, f32, ptr) +DEF_HELPER_2(vfp_tosizd, s32, f64, ptr) DEF_HELPER_3(vfp_toshs_round_to_zero, i32, f32, i32, ptr) DEF_HELPER_3(vfp_tosls_round_to_zero, i32, f32, i32, ptr) @@ -583,6 +583,16 @@ DEF_HELPER_FLAGS_5(gvec_qrdmlah_s32, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_5(gvec_qrdmlsh_s32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_sdot_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_udot_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_sdot_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_udot_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_sdot_idx_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_udot_idx_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_sdot_idx_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_udot_idx_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + DEF_HELPER_FLAGS_5(gvec_fcaddh, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_5(gvec_fcadds, TCG_CALL_NO_RWG, @@ -601,6 +611,14 @@ DEF_HELPER_FLAGS_5(gvec_fcmlas_idx, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_5(gvec_fcmlad, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_frecpe_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_frecpe_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_frecpe_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_frsqrte_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_frsqrte_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_frsqrte_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + DEF_HELPER_FLAGS_5(gvec_fadd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_5(gvec_fadd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_5(gvec_fadd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) @@ -620,6 +638,20 @@ DEF_HELPER_FLAGS_5(gvec_ftsmul_s, TCG_CALL_NO_RWG, DEF_HELPER_FLAGS_5(gvec_ftsmul_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fmul_idx_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fmul_idx_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fmul_idx_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_6(gvec_fmla_idx_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(gvec_fmla_idx_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_6(gvec_fmla_idx_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, ptr, i32) + #ifdef TARGET_AARCH64 #include "helper-a64.h" #include "helper-sve.h" diff --git a/target/arm/kvm32.c b/target/arm/kvm32.c index 1740cda47d..4e91c11796 100644 --- a/target/arm/kvm32.c +++ b/target/arm/kvm32.c @@ -36,7 +36,7 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf) * and then query that CPU for the relevant ID registers. 
*/ int i, ret, fdarray[3]; - uint32_t midr, id_pfr0, id_isar0, mvfr1; + uint32_t midr, id_pfr0, mvfr1; uint64_t features = 0; /* Old kernels may not know about the PREFERRED_TARGET ioctl: however * we know these will only support creating one kind of guest CPU, @@ -60,11 +60,6 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf) }, { .id = KVM_REG_ARM | KVM_REG_SIZE_U32 - | ENCODE_CP_REG(15, 0, 0, 0, 2, 0, 0), - .addr = (uintptr_t)&id_isar0, - }, - { - .id = KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_VFP | KVM_REG_ARM_VFP_MVFR1, .addr = (uintptr_t)&mvfr1, }, @@ -98,26 +93,14 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf) /* Now we've retrieved all the register information we can * set the feature bits based on the ID register fields. * We can assume any KVM supporting CPU is at least a v7 - * with VFPv3, LPAE and the generic timers; this in turn implies - * most of the other feature bits, but a few must be tested. + * with VFPv3, virtualization extensions, and the generic + * timers; this in turn implies most of the other feature + * bits, but a few must be tested. */ - set_feature(&features, ARM_FEATURE_V7); + set_feature(&features, ARM_FEATURE_V7VE); set_feature(&features, ARM_FEATURE_VFP3); - set_feature(&features, ARM_FEATURE_LPAE); set_feature(&features, ARM_FEATURE_GENERIC_TIMER); - switch (extract32(id_isar0, 24, 4)) { - case 1: - set_feature(&features, ARM_FEATURE_THUMB_DIV); - break; - case 2: - set_feature(&features, ARM_FEATURE_ARM_DIV); - set_feature(&features, ARM_FEATURE_THUMB_DIV); - break; - default: - break; - } - if (extract32(id_pfr0, 12, 4) == 1) { set_feature(&features, ARM_FEATURE_THUMB2EE); } diff --git a/target/arm/sve.decode b/target/arm/sve.decode index 6f436f9096..e10b689454 100644 --- a/target/arm/sve.decode +++ b/target/arm/sve.decode @@ -27,6 +27,9 @@ %imm7_22_16 22:2 16:5 %imm8_16_10 16:5 10:3 %imm9_16_10 16:s6 10:3 +%size_23 23:2 +%dtype_23_13 23:2 13:2 +%index3_22_19 22:1 19:2 # A combination of tsz:imm3 -- extract esize. %tszimm_esz 22:2 5:5 !function=tszimm_esz @@ -45,6 +48,9 @@ # Unsigned 8-bit immediate, optionally shifted left by 8. %sh8_i8u 5:9 !function=expand_imm_sh8u +# Unsigned load of msz into esz=2, represented as a dtype. +%msz_dtype 23:2 !function=msz_dtype + # Either a copy of rd (at bit 0), or a different source # as propagated via the MOVPRFX instruction. %reg_movprfx 0:5 @@ -71,6 +77,14 @@ &incdec2_cnt rd rn pat esz imm d u &incdec_pred rd pg esz d u &incdec2_pred rd rn pg esz d u +&rprr_load rd pg rn rm dtype nreg +&rpri_load rd pg rn imm dtype nreg +&rprr_store rd pg rn rm msz esz nreg +&rpri_store rd pg rn imm msz esz nreg +&rprr_gather_load rd pg rn rm esz msz u ff xs scale +&rpri_gather_load rd pg rn imm esz msz u ff +&rprr_scatter_store rd pg rn rm esz msz xs scale +&rpri_scatter_store rd pg rn imm esz msz ########################################################################### # Named instruction formats. These are generally used to @@ -120,10 +134,16 @@ &rprrr_esz ra=%reg_movprfx @rdn_pg_ra_rm ........ esz:2 . rm:5 ... pg:3 ra:5 rd:5 \ &rprrr_esz rn=%reg_movprfx +@rdn_pg_rm_ra ........ esz:2 . ra:5 ... pg:3 rm:5 rd:5 \ + &rprrr_esz rn=%reg_movprfx # One register operand, with governing predicate, vector element size @rd_pg_rn ........ esz:2 ... ... ... pg:3 rn:5 rd:5 &rpr_esz @rd_pg4_pn ........ esz:2 ... ... .. pg:4 . rn:4 rd:5 &rpr_esz +@pd_pg_rn ........ esz:2 ... ... ... pg:3 rn:5 . rd:4 &rpr_esz + +# One register operand, with governing predicate, no vector element size +@rd_pg_rn_e0 ........ 
.. ... ... ... pg:3 rn:5 rd:5 &rpr_esz esz=0 # Two register operands with a 6-bit signed immediate. @rd_rn_i6 ........ ... rn:5 ..... imm:s6 rd:5 &rri @@ -142,6 +162,10 @@ @rdn_pg4 ........ esz:2 .. pg:4 ... ........ rd:5 \ &rpri_esz rn=%reg_movprfx +# Two register operand, one one-bit floating-point operand. +@rdn_i1 ........ esz:2 ......... pg:3 .... imm:1 rd:5 \ + &rpri_esz rn=%reg_movprfx + # Two register operand, one encoded bitmask. @rdn_dbm ........ .. .... dbm:13 rd:5 \ &rr_dbm rn=%reg_movprfx @@ -170,6 +194,41 @@ @incdec2_pred ........ esz:2 .... .. ..... .. pg:4 rd:5 \ &incdec2_pred rn=%reg_movprfx +# Loads; user must fill in NREG. +@rprr_load_dt ....... dtype:4 rm:5 ... pg:3 rn:5 rd:5 &rprr_load +@rpri_load_dt ....... dtype:4 . imm:s4 ... pg:3 rn:5 rd:5 &rpri_load + +@rprr_load_msz ....... .... rm:5 ... pg:3 rn:5 rd:5 \ + &rprr_load dtype=%msz_dtype +@rpri_load_msz ....... .... . imm:s4 ... pg:3 rn:5 rd:5 \ + &rpri_load dtype=%msz_dtype + +# Gather Loads. +@rprr_g_load_u ....... .. . . rm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \ + &rprr_gather_load xs=2 +@rprr_g_load_xs_u ....... .. xs:1 . rm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \ + &rprr_gather_load +@rprr_g_load_xs_u_sc ....... .. xs:1 scale:1 rm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \ + &rprr_gather_load +@rprr_g_load_xs_sc ....... .. xs:1 scale:1 rm:5 . . ff:1 pg:3 rn:5 rd:5 \ + &rprr_gather_load +@rprr_g_load_u_sc ....... .. . scale:1 rm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \ + &rprr_gather_load xs=2 +@rprr_g_load_sc ....... .. . scale:1 rm:5 . . ff:1 pg:3 rn:5 rd:5 \ + &rprr_gather_load xs=2 +@rpri_g_load ....... msz:2 .. imm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \ + &rpri_gather_load + +# Stores; user must fill in ESZ, MSZ, NREG as needed. +@rprr_store ....... .. .. rm:5 ... pg:3 rn:5 rd:5 &rprr_store +@rpri_store_msz ....... msz:2 .. . imm:s4 ... pg:3 rn:5 rd:5 &rpri_store +@rprr_store_esz_n0 ....... .. esz:2 rm:5 ... pg:3 rn:5 rd:5 \ + &rprr_store nreg=0 +@rprr_scatter_store ....... msz:2 .. rm:5 ... pg:3 rn:5 rd:5 \ + &rprr_scatter_store +@rpri_scatter_store ....... msz:2 .. imm:5 ... pg:3 rn:5 rd:5 \ + &rpri_scatter_store + ########################################################################### # Instruction patterns. Grouped according to the SVE encodingindex.xhtml. @@ -211,6 +270,10 @@ ORV 00000100 .. 011 000 001 ... ..... ..... @rd_pg_rn EORV 00000100 .. 011 001 001 ... ..... ..... @rd_pg_rn ANDV 00000100 .. 011 010 001 ... ..... ..... @rd_pg_rn +# SVE constructive prefix (predicated) +MOVPRFX_z 00000100 .. 010 000 001 ... ..... ..... @rd_pg_rn +MOVPRFX_m 00000100 .. 010 001 001 ... ..... ..... @rd_pg_rn + # SVE integer add reduction (predicated) # Note that saddv requires size != 3. UADDV 00000100 .. 000 001 001 ... ..... ..... @rd_pg_rn @@ -271,6 +334,17 @@ UXTH 00000100 .. 010 011 101 ... ..... ..... @rd_pg_rn SXTW 00000100 .. 010 100 101 ... ..... ..... @rd_pg_rn UXTW 00000100 .. 010 101 101 ... ..... ..... @rd_pg_rn +### SVE Floating Point Compare - Vectors Group + +# SVE floating-point compare vectors +FCMGE_ppzz 01100101 .. 0 ..... 010 ... ..... 0 .... @pd_pg_rn_rm +FCMGT_ppzz 01100101 .. 0 ..... 010 ... ..... 1 .... @pd_pg_rn_rm +FCMEQ_ppzz 01100101 .. 0 ..... 011 ... ..... 0 .... @pd_pg_rn_rm +FCMNE_ppzz 01100101 .. 0 ..... 011 ... ..... 1 .... @pd_pg_rn_rm +FCMUO_ppzz 01100101 .. 0 ..... 110 ... ..... 0 .... @pd_pg_rn_rm +FACGE_ppzz 01100101 .. 0 ..... 110 ... ..... 1 .... @pd_pg_rn_rm +FACGT_ppzz 01100101 .. 0 ..... 111 ... ..... 1 .... 
@pd_pg_rn_rm + ### SVE Integer Multiply-Add Group # SVE integer multiply-add writing addend (predicated) @@ -348,6 +422,9 @@ ADR_p64 00000100 11 1 ..... 1010 .. ..... ..... @rd_rn_msz_rm ### SVE Integer Misc - Unpredicated Group +# SVE constructive prefix (unpredicated) +MOVPRFX 00000100 00 1 00000 101111 rn:5 rd:5 + # SVE floating-point exponential accelerator # Note esz != 0 FEXPA 00000100 .. 1 00000 101110 ..... ..... @rd_rn @@ -648,6 +725,74 @@ UMIN_zzi 00100101 .. 101 011 110 ........ ..... @rdn_i8u # SVE integer multiply immediate (unpredicated) MUL_zzi 00100101 .. 110 000 110 ........ ..... @rdn_i8s +# SVE integer dot product (unpredicated) +DOT_zzz 01000100 1 sz:1 0 rm:5 00000 u:1 rn:5 rd:5 ra=%reg_movprfx + +# SVE integer dot product (indexed) +DOT_zzx 01000100 101 index:2 rm:3 00000 u:1 rn:5 rd:5 \ + sz=0 ra=%reg_movprfx +DOT_zzx 01000100 111 index:1 rm:4 00000 u:1 rn:5 rd:5 \ + sz=1 ra=%reg_movprfx + +# SVE floating-point complex add (predicated) +FCADD 01100100 esz:2 00000 rot:1 100 pg:3 rm:5 rd:5 \ + rn=%reg_movprfx + +# SVE floating-point complex multiply-add (predicated) +FCMLA_zpzzz 01100100 esz:2 0 rm:5 0 rot:2 pg:3 rn:5 rd:5 \ + ra=%reg_movprfx + +# SVE floating-point complex multiply-add (indexed) +FCMLA_zzxz 01100100 10 1 index:2 rm:3 0001 rot:2 rn:5 rd:5 \ + ra=%reg_movprfx esz=1 +FCMLA_zzxz 01100100 11 1 index:1 rm:4 0001 rot:2 rn:5 rd:5 \ + ra=%reg_movprfx esz=2 + +### SVE FP Multiply-Add Indexed Group + +# SVE floating-point multiply-add (indexed) +FMLA_zzxz 01100100 0.1 .. rm:3 00000 sub:1 rn:5 rd:5 \ + ra=%reg_movprfx index=%index3_22_19 esz=1 +FMLA_zzxz 01100100 101 index:2 rm:3 00000 sub:1 rn:5 rd:5 \ + ra=%reg_movprfx esz=2 +FMLA_zzxz 01100100 111 index:1 rm:4 00000 sub:1 rn:5 rd:5 \ + ra=%reg_movprfx esz=3 + +### SVE FP Multiply Indexed Group + +# SVE floating-point multiply (indexed) +FMUL_zzx 01100100 0.1 .. rm:3 001000 rn:5 rd:5 \ + index=%index3_22_19 esz=1 +FMUL_zzx 01100100 101 index:2 rm:3 001000 rn:5 rd:5 esz=2 +FMUL_zzx 01100100 111 index:1 rm:4 001000 rn:5 rd:5 esz=3 + +### SVE FP Fast Reduction Group + +FADDV 01100101 .. 000 000 001 ... ..... ..... @rd_pg_rn +FMAXNMV 01100101 .. 000 100 001 ... ..... ..... @rd_pg_rn +FMINNMV 01100101 .. 000 101 001 ... ..... ..... @rd_pg_rn +FMAXV 01100101 .. 000 110 001 ... ..... ..... @rd_pg_rn +FMINV 01100101 .. 000 111 001 ... ..... ..... @rd_pg_rn + +## SVE Floating Point Unary Operations - Unpredicated Group + +FRECPE 01100101 .. 001 110 001100 ..... ..... @rd_rn +FRSQRTE 01100101 .. 001 111 001100 ..... ..... @rd_rn + +### SVE FP Compare with Zero Group + +FCMGE_ppz0 01100101 .. 0100 00 001 ... ..... 0 .... @pd_pg_rn +FCMGT_ppz0 01100101 .. 0100 00 001 ... ..... 1 .... @pd_pg_rn +FCMLT_ppz0 01100101 .. 0100 01 001 ... ..... 0 .... @pd_pg_rn +FCMLE_ppz0 01100101 .. 0100 01 001 ... ..... 1 .... @pd_pg_rn +FCMEQ_ppz0 01100101 .. 0100 10 001 ... ..... 0 .... @pd_pg_rn +FCMNE_ppz0 01100101 .. 0100 11 001 ... ..... 0 .... @pd_pg_rn + +### SVE FP Accumulating Reduction Group + +# SVE floating-point serial reduction (predicated) +FADDA 01100101 .. 011 000 001 ... ..... ..... @rdn_pg_rm + ### SVE Floating Point Arithmetic - Unpredicated Group # SVE floating-point arithmetic (unpredicated) @@ -658,6 +803,108 @@ FTSMUL 01100101 .. 0 ..... 000 011 ..... ..... @rd_rn_rm FRECPS 01100101 .. 0 ..... 000 110 ..... ..... @rd_rn_rm FRSQRTS 01100101 .. 0 ..... 000 111 ..... ..... @rd_rn_rm +### SVE FP Arithmetic Predicated Group + +# SVE floating-point arithmetic (predicated) +FADD_zpzz 01100101 .. 00 0000 100 ... ..... ..... 
@rdn_pg_rm +FSUB_zpzz 01100101 .. 00 0001 100 ... ..... ..... @rdn_pg_rm +FMUL_zpzz 01100101 .. 00 0010 100 ... ..... ..... @rdn_pg_rm +FSUB_zpzz 01100101 .. 00 0011 100 ... ..... ..... @rdm_pg_rn # FSUBR +FMAXNM_zpzz 01100101 .. 00 0100 100 ... ..... ..... @rdn_pg_rm +FMINNM_zpzz 01100101 .. 00 0101 100 ... ..... ..... @rdn_pg_rm +FMAX_zpzz 01100101 .. 00 0110 100 ... ..... ..... @rdn_pg_rm +FMIN_zpzz 01100101 .. 00 0111 100 ... ..... ..... @rdn_pg_rm +FABD 01100101 .. 00 1000 100 ... ..... ..... @rdn_pg_rm +FSCALE 01100101 .. 00 1001 100 ... ..... ..... @rdn_pg_rm +FMULX 01100101 .. 00 1010 100 ... ..... ..... @rdn_pg_rm +FDIV 01100101 .. 00 1100 100 ... ..... ..... @rdm_pg_rn # FDIVR +FDIV 01100101 .. 00 1101 100 ... ..... ..... @rdn_pg_rm + +# SVE floating-point arithmetic with immediate (predicated) +FADD_zpzi 01100101 .. 011 000 100 ... 0000 . ..... @rdn_i1 +FSUB_zpzi 01100101 .. 011 001 100 ... 0000 . ..... @rdn_i1 +FMUL_zpzi 01100101 .. 011 010 100 ... 0000 . ..... @rdn_i1 +FSUBR_zpzi 01100101 .. 011 011 100 ... 0000 . ..... @rdn_i1 +FMAXNM_zpzi 01100101 .. 011 100 100 ... 0000 . ..... @rdn_i1 +FMINNM_zpzi 01100101 .. 011 101 100 ... 0000 . ..... @rdn_i1 +FMAX_zpzi 01100101 .. 011 110 100 ... 0000 . ..... @rdn_i1 +FMIN_zpzi 01100101 .. 011 111 100 ... 0000 . ..... @rdn_i1 + +# SVE floating-point trig multiply-add coefficient +FTMAD 01100101 esz:2 010 imm:3 100000 rm:5 rd:5 rn=%reg_movprfx + +### SVE FP Multiply-Add Group + +# SVE floating-point multiply-accumulate writing addend +FMLA_zpzzz 01100101 .. 1 ..... 000 ... ..... ..... @rda_pg_rn_rm +FMLS_zpzzz 01100101 .. 1 ..... 001 ... ..... ..... @rda_pg_rn_rm +FNMLA_zpzzz 01100101 .. 1 ..... 010 ... ..... ..... @rda_pg_rn_rm +FNMLS_zpzzz 01100101 .. 1 ..... 011 ... ..... ..... @rda_pg_rn_rm + +# SVE floating-point multiply-accumulate writing multiplicand +# Alter the operand extraction order and reuse the helpers from above. +# FMAD, FMSB, FNMAD, FNMS +FMLA_zpzzz 01100101 .. 1 ..... 100 ... ..... ..... @rdn_pg_rm_ra +FMLS_zpzzz 01100101 .. 1 ..... 101 ... ..... ..... @rdn_pg_rm_ra +FNMLA_zpzzz 01100101 .. 1 ..... 110 ... ..... ..... @rdn_pg_rm_ra +FNMLS_zpzzz 01100101 .. 1 ..... 111 ... ..... ..... @rdn_pg_rm_ra + +### SVE FP Unary Operations Predicated Group + +# SVE floating-point convert precision +FCVT_sh 01100101 10 0010 00 101 ... ..... ..... @rd_pg_rn_e0 +FCVT_hs 01100101 10 0010 01 101 ... ..... ..... @rd_pg_rn_e0 +FCVT_dh 01100101 11 0010 00 101 ... ..... ..... @rd_pg_rn_e0 +FCVT_hd 01100101 11 0010 01 101 ... ..... ..... @rd_pg_rn_e0 +FCVT_ds 01100101 11 0010 10 101 ... ..... ..... @rd_pg_rn_e0 +FCVT_sd 01100101 11 0010 11 101 ... ..... ..... @rd_pg_rn_e0 + +# SVE floating-point convert to integer +FCVTZS_hh 01100101 01 011 01 0 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZU_hh 01100101 01 011 01 1 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZS_hs 01100101 01 011 10 0 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZU_hs 01100101 01 011 10 1 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZS_hd 01100101 01 011 11 0 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZU_hd 01100101 01 011 11 1 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZS_ss 01100101 10 011 10 0 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZU_ss 01100101 10 011 10 1 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZS_ds 01100101 11 011 00 0 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZU_ds 01100101 11 011 00 1 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZS_sd 01100101 11 011 10 0 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZU_sd 01100101 11 011 10 1 101 ... ..... ..... @rd_pg_rn_e0 +FCVTZS_dd 01100101 11 011 11 0 101 ... ..... ..... 
@rd_pg_rn_e0 +FCVTZU_dd 01100101 11 011 11 1 101 ... ..... ..... @rd_pg_rn_e0 + +# SVE floating-point round to integral value +FRINTN 01100101 .. 000 000 101 ... ..... ..... @rd_pg_rn +FRINTP 01100101 .. 000 001 101 ... ..... ..... @rd_pg_rn +FRINTM 01100101 .. 000 010 101 ... ..... ..... @rd_pg_rn +FRINTZ 01100101 .. 000 011 101 ... ..... ..... @rd_pg_rn +FRINTA 01100101 .. 000 100 101 ... ..... ..... @rd_pg_rn +FRINTX 01100101 .. 000 110 101 ... ..... ..... @rd_pg_rn +FRINTI 01100101 .. 000 111 101 ... ..... ..... @rd_pg_rn + +# SVE floating-point unary operations +FRECPX 01100101 .. 001 100 101 ... ..... ..... @rd_pg_rn +FSQRT 01100101 .. 001 101 101 ... ..... ..... @rd_pg_rn + +# SVE integer convert to floating-point +SCVTF_hh 01100101 01 010 01 0 101 ... ..... ..... @rd_pg_rn_e0 +SCVTF_sh 01100101 01 010 10 0 101 ... ..... ..... @rd_pg_rn_e0 +SCVTF_dh 01100101 01 010 11 0 101 ... ..... ..... @rd_pg_rn_e0 +SCVTF_ss 01100101 10 010 10 0 101 ... ..... ..... @rd_pg_rn_e0 +SCVTF_sd 01100101 11 010 00 0 101 ... ..... ..... @rd_pg_rn_e0 +SCVTF_ds 01100101 11 010 10 0 101 ... ..... ..... @rd_pg_rn_e0 +SCVTF_dd 01100101 11 010 11 0 101 ... ..... ..... @rd_pg_rn_e0 + +UCVTF_hh 01100101 01 010 01 1 101 ... ..... ..... @rd_pg_rn_e0 +UCVTF_sh 01100101 01 010 10 1 101 ... ..... ..... @rd_pg_rn_e0 +UCVTF_dh 01100101 01 010 11 1 101 ... ..... ..... @rd_pg_rn_e0 +UCVTF_ss 01100101 10 010 10 1 101 ... ..... ..... @rd_pg_rn_e0 +UCVTF_sd 01100101 11 010 00 1 101 ... ..... ..... @rd_pg_rn_e0 +UCVTF_ds 01100101 11 010 10 1 101 ... ..... ..... @rd_pg_rn_e0 +UCVTF_dd 01100101 11 010 11 1 101 ... ..... ..... @rd_pg_rn_e0 + ### SVE Memory - 32-bit Gather and Unsized Contiguous Group # SVE load predicate register @@ -665,3 +912,183 @@ LDR_pri 10000101 10 ...... 000 ... ..... 0 .... @pd_rn_i9 # SVE load vector register LDR_zri 10000101 10 ...... 010 ... ..... ..... @rd_rn_i9 + +# SVE load and broadcast element +LD1R_zpri 1000010 .. 1 imm:6 1.. pg:3 rn:5 rd:5 \ + &rpri_load dtype=%dtype_23_13 nreg=0 + +# SVE 32-bit gather load (scalar plus 32-bit unscaled offsets) +# SVE 32-bit gather load (scalar plus 32-bit scaled offsets) +LD1_zprz 1000010 00 .0 ..... 0.. ... ..... ..... \ + @rprr_g_load_xs_u esz=2 msz=0 scale=0 +LD1_zprz 1000010 01 .. ..... 0.. ... ..... ..... \ + @rprr_g_load_xs_u_sc esz=2 msz=1 +LD1_zprz 1000010 10 .. ..... 01. ... ..... ..... \ + @rprr_g_load_xs_sc esz=2 msz=2 u=1 + +# SVE 32-bit gather load (vector plus immediate) +LD1_zpiz 1000010 .. 01 ..... 1.. ... ..... ..... \ + @rpri_g_load esz=2 + +### SVE Memory Contiguous Load Group + +# SVE contiguous load (scalar plus scalar) +LD_zprr 1010010 .... ..... 010 ... ..... ..... @rprr_load_dt nreg=0 + +# SVE contiguous first-fault load (scalar plus scalar) +LDFF1_zprr 1010010 .... ..... 011 ... ..... ..... @rprr_load_dt nreg=0 + +# SVE contiguous load (scalar plus immediate) +LD_zpri 1010010 .... 0.... 101 ... ..... ..... @rpri_load_dt nreg=0 + +# SVE contiguous non-fault load (scalar plus immediate) +LDNF1_zpri 1010010 .... 1.... 101 ... ..... ..... @rpri_load_dt nreg=0 + +# SVE contiguous non-temporal load (scalar plus scalar) +# LDNT1B, LDNT1H, LDNT1W, LDNT1D +# SVE load multiple structures (scalar plus scalar) +# LD2B, LD2H, LD2W, LD2D; etc. +LD_zprr 1010010 .. nreg:2 ..... 110 ... ..... ..... @rprr_load_msz + +# SVE contiguous non-temporal load (scalar plus immediate) +# LDNT1B, LDNT1H, LDNT1W, LDNT1D +# SVE load multiple structures (scalar plus immediate) +# LD2B, LD2H, LD2W, LD2D; etc. +LD_zpri 1010010 .. nreg:2 0.... 111 ... ..... ..... 
@rpri_load_msz + +# SVE load and broadcast quadword (scalar plus scalar) +LD1RQ_zprr 1010010 .. 00 ..... 000 ... ..... ..... \ + @rprr_load_msz nreg=0 + +# SVE load and broadcast quadword (scalar plus immediate) +# LD1RQB, LD1RQH, LD1RQS, LD1RQD +LD1RQ_zpri 1010010 .. 00 0.... 001 ... ..... ..... \ + @rpri_load_msz nreg=0 + +# SVE 32-bit gather prefetch (scalar plus 32-bit scaled offsets) +PRF 1000010 00 -1 ----- 0-- --- ----- 0 ---- + +# SVE 32-bit gather prefetch (vector plus immediate) +PRF 1000010 -- 00 ----- 111 --- ----- 0 ---- + +# SVE contiguous prefetch (scalar plus immediate) +PRF 1000010 11 1- ----- 0-- --- ----- 0 ---- + +# SVE contiguous prefetch (scalar plus scalar) +PRF_rr 1000010 -- 00 rm:5 110 --- ----- 0 ---- + +### SVE Memory 64-bit Gather Group + +# SVE 64-bit gather load (scalar plus 32-bit unpacked unscaled offsets) +# SVE 64-bit gather load (scalar plus 32-bit unpacked scaled offsets) +LD1_zprz 1100010 00 .0 ..... 0.. ... ..... ..... \ + @rprr_g_load_xs_u esz=3 msz=0 scale=0 +LD1_zprz 1100010 01 .. ..... 0.. ... ..... ..... \ + @rprr_g_load_xs_u_sc esz=3 msz=1 +LD1_zprz 1100010 10 .. ..... 0.. ... ..... ..... \ + @rprr_g_load_xs_u_sc esz=3 msz=2 +LD1_zprz 1100010 11 .. ..... 01. ... ..... ..... \ + @rprr_g_load_xs_sc esz=3 msz=3 u=1 + +# SVE 64-bit gather load (scalar plus 64-bit unscaled offsets) +# SVE 64-bit gather load (scalar plus 64-bit scaled offsets) +LD1_zprz 1100010 00 10 ..... 1.. ... ..... ..... \ + @rprr_g_load_u esz=3 msz=0 scale=0 +LD1_zprz 1100010 01 1. ..... 1.. ... ..... ..... \ + @rprr_g_load_u_sc esz=3 msz=1 +LD1_zprz 1100010 10 1. ..... 1.. ... ..... ..... \ + @rprr_g_load_u_sc esz=3 msz=2 +LD1_zprz 1100010 11 1. ..... 11. ... ..... ..... \ + @rprr_g_load_sc esz=3 msz=3 u=1 + +# SVE 64-bit gather load (vector plus immediate) +LD1_zpiz 1100010 .. 01 ..... 1.. ... ..... ..... \ + @rpri_g_load esz=3 + +# SVE 64-bit gather prefetch (scalar plus 64-bit scaled offsets) +PRF 1100010 00 11 ----- 1-- --- ----- 0 ---- + +# SVE 64-bit gather prefetch (scalar plus unpacked 32-bit scaled offsets) +PRF 1100010 00 -1 ----- 0-- --- ----- 0 ---- + +# SVE 64-bit gather prefetch (vector plus immediate) +PRF 1100010 -- 00 ----- 111 --- ----- 0 ---- + +### SVE Memory Store Group + +# SVE store predicate register +STR_pri 1110010 11 0. ..... 000 ... ..... 0 .... @pd_rn_i9 + +# SVE store vector register +STR_zri 1110010 11 0. ..... 010 ... ..... ..... @rd_rn_i9 + +# SVE contiguous store (scalar plus immediate) +# ST1B, ST1H, ST1W, ST1D; require msz <= esz +ST_zpri 1110010 .. esz:2 0.... 111 ... ..... ..... \ + @rpri_store_msz nreg=0 + +# SVE contiguous store (scalar plus scalar) +# ST1B, ST1H, ST1W, ST1D; require msz <= esz +# Enumerate msz lest we conflict with STR_zri. +ST_zprr 1110010 00 .. ..... 010 ... ..... ..... \ + @rprr_store_esz_n0 msz=0 +ST_zprr 1110010 01 .. ..... 010 ... ..... ..... \ + @rprr_store_esz_n0 msz=1 +ST_zprr 1110010 10 .. ..... 010 ... ..... ..... \ + @rprr_store_esz_n0 msz=2 +ST_zprr 1110010 11 11 ..... 010 ... ..... ..... \ + @rprr_store msz=3 esz=3 nreg=0 + +# SVE contiguous non-temporal store (scalar plus immediate) (nreg == 0) +# SVE store multiple structures (scalar plus immediate) (nreg != 0) +ST_zpri 1110010 .. nreg:2 1.... 111 ... ..... ..... \ + @rpri_store_msz esz=%size_23 + +# SVE contiguous non-temporal store (scalar plus scalar) (nreg == 0) +# SVE store multiple structures (scalar plus scalar) (nreg != 0) +ST_zprr 1110010 msz:2 nreg:2 ..... 011 ... ..... ..... 
\ + @rprr_store esz=%size_23 + +# SVE 32-bit scatter store (scalar plus 32-bit scaled offsets) +# Require msz > 0 && msz <= esz. +ST1_zprz 1110010 .. 11 ..... 100 ... ..... ..... \ + @rprr_scatter_store xs=0 esz=2 scale=1 +ST1_zprz 1110010 .. 11 ..... 110 ... ..... ..... \ + @rprr_scatter_store xs=1 esz=2 scale=1 + +# SVE 32-bit scatter store (scalar plus 32-bit unscaled offsets) +# Require msz <= esz. +ST1_zprz 1110010 .. 10 ..... 100 ... ..... ..... \ + @rprr_scatter_store xs=0 esz=2 scale=0 +ST1_zprz 1110010 .. 10 ..... 110 ... ..... ..... \ + @rprr_scatter_store xs=1 esz=2 scale=0 + +# SVE 64-bit scatter store (scalar plus 64-bit scaled offset) +# Require msz > 0 +ST1_zprz 1110010 .. 01 ..... 101 ... ..... ..... \ + @rprr_scatter_store xs=2 esz=3 scale=1 + +# SVE 64-bit scatter store (scalar plus 64-bit unscaled offset) +ST1_zprz 1110010 .. 00 ..... 101 ... ..... ..... \ + @rprr_scatter_store xs=2 esz=3 scale=0 + +# SVE 64-bit scatter store (vector plus immediate) +ST1_zpiz 1110010 .. 10 ..... 101 ... ..... ..... \ + @rpri_scatter_store esz=3 + +# SVE 32-bit scatter store (vector plus immediate) +ST1_zpiz 1110010 .. 11 ..... 101 ... ..... ..... \ + @rpri_scatter_store esz=2 + +# SVE 64-bit scatter store (scalar plus unpacked 32-bit scaled offset) +# Require msz > 0 +ST1_zprz 1110010 .. 01 ..... 100 ... ..... ..... \ + @rprr_scatter_store xs=0 esz=3 scale=1 +ST1_zprz 1110010 .. 01 ..... 110 ... ..... ..... \ + @rprr_scatter_store xs=1 esz=3 scale=1 + +# SVE 64-bit scatter store (scalar plus unpacked 32-bit unscaled offset) +ST1_zprz 1110010 .. 00 ..... 100 ... ..... ..... \ + @rprr_scatter_store xs=0 esz=3 scale=0 +ST1_zprz 1110010 .. 00 ..... 110 ... ..... ..... \ + @rprr_scatter_store xs=1 esz=3 scale=0 diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c index 128bbf9b04..a03ca77354 100644 --- a/target/arm/sve_helper.c +++ b/target/arm/sve_helper.c @@ -369,7 +369,17 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N)) #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N)) #define DO_MUL(N, M) (N * M) -#define DO_DIV(N, M) (M ? N / M : 0) + + +/* + * We must avoid the C undefined behaviour cases: division by + * zero and signed division of INT_MIN by -1. Both of these + * have architecturally defined required results for Arm. + * We special case all signed divisions by -1 to avoid having + * to deduce the minimum integer for the type involved. + */ +#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M) +#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M) DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND) DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND) @@ -477,11 +487,11 @@ DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h) DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s) DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d) -DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_DIV) -DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_DIV) +DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV) +DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV) -DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_DIV) -DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_DIV) +DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV) +DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV) /* Note that all bits of the shift are significant and not modulo the element size. */ @@ -995,6 +1005,47 @@ void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc) } } +/* Copy Zn into Zd, and store zero into inactive elements. 
*/
+void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
+{
+    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+    uint64_t *d = vd, *n = vn;
+    uint8_t *pg = vg;
+    for (i = 0; i < opr_sz; i += 1) {
+        d[i] = n[i] & expand_pred_b(pg[H1(i)]);
+    }
+}
+
+void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
+{
+    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+    uint64_t *d = vd, *n = vn;
+    uint8_t *pg = vg;
+    for (i = 0; i < opr_sz; i += 1) {
+        d[i] = n[i] & expand_pred_h(pg[H1(i)]);
+    }
+}
+
+void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
+{
+    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+    uint64_t *d = vd, *n = vn;
+    uint8_t *pg = vg;
+    for (i = 0; i < opr_sz; i += 1) {
+        d[i] = n[i] & expand_pred_s(pg[H1(i)]);
+    }
+}
+
+void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
+{
+    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+    uint64_t *d = vd, *n = vn;
+    uint8_t *pg = vg;
+    for (i = 0; i < opr_sz; i += 1) {
+        d[i] = n[i] & -(uint64_t)(pg[H1(i)] & 1);
+    }
+}
+
 /* Three-operand expander, immediate operand, controlled by a predicate. */
 #define DO_ZPZI(NAME, TYPE, H, OP) \
@@ -2810,3 +2861,1817 @@ uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
     return predtest_ones(d, oprsz, esz_mask);
 }
+
+/* Recursive reduction on a function;
+ * C.f. the ARM ARM function ReducePredicated.
+ *
+ * While it would be possible to write this without the DATA temporary,
+ * it is much simpler to process the predicate register this way.
+ * The recursion is bounded to depth 7 (128 fp16 elements), so there's
+ * little to gain with a more complex non-recursive form.
+ */
+#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
+static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
+{ \
+    if (n == 1) { \
+        return *data; \
+    } else { \
+        uintptr_t half = n / 2; \
+        TYPE lo = NAME##_reduce(data, status, half); \
+        TYPE hi = NAME##_reduce(data + half, status, half); \
+        return TYPE##_##FUNC(lo, hi, status); \
+    } \
+} \
+uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
+{ \
+    uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc); \
+    TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
+    for (i = 0; i < oprsz; ) { \
+        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+        do { \
+            TYPE nn = *(TYPE *)(vn + H(i)); \
+            *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
+            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
+        } while (i & 15); \
+    } \
+    for (; i < maxsz; i += sizeof(TYPE)) { \
+        *(TYPE *)((void *)data + i) = IDENT; \
+    } \
+    return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
+}
+
+DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
+DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
+DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
+
+/* Identity is floatN_default_nan, without the function call.
*/ +DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00) +DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000) +DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL) + +DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00) +DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000) +DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL) + +DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity) +DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity) +DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity) + +DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity)) +DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity)) +DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity)) + +#undef DO_REDUCE + +uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg, + void *status, uint32_t desc) +{ + intptr_t i = 0, opr_sz = simd_oprsz(desc); + float16 result = nn; + + do { + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); + do { + if (pg & 1) { + float16 mm = *(float16 *)(vm + H1_2(i)); + result = float16_add(result, mm, status); + } + i += sizeof(float16), pg >>= sizeof(float16); + } while (i & 15); + } while (i < opr_sz); + + return result; +} + +uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg, + void *status, uint32_t desc) +{ + intptr_t i = 0, opr_sz = simd_oprsz(desc); + float32 result = nn; + + do { + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); + do { + if (pg & 1) { + float32 mm = *(float32 *)(vm + H1_2(i)); + result = float32_add(result, mm, status); + } + i += sizeof(float32), pg >>= sizeof(float32); + } while (i & 15); + } while (i < opr_sz); + + return result; +} + +uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg, + void *status, uint32_t desc) +{ + intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8; + uint64_t *m = vm; + uint8_t *pg = vg; + + for (i = 0; i < opr_sz; i++) { + if (pg[H1(i)] & 1) { + nn = float64_add(nn, m[i], status); + } + } + + return nn; +} + +/* Fully general three-operand expander, controlled by a predicate, + * With the extra float_status parameter. 
+ */ +#define DO_ZPZZ_FP(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ + void *status, uint32_t desc) \ +{ \ + intptr_t i = simd_oprsz(desc); \ + uint64_t *g = vg; \ + do { \ + uint64_t pg = g[(i - 1) >> 6]; \ + do { \ + i -= sizeof(TYPE); \ + if (likely((pg >> (i & 63)) & 1)) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + TYPE mm = *(TYPE *)(vm + H(i)); \ + *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ + } \ + } while (i & 63); \ + } while (i != 0); \ +} + +DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add) +DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add) +DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add) + +DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub) +DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub) +DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub) + +DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul) +DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul) +DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul) + +DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div) +DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div) +DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div) + +DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min) +DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min) +DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min) + +DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max) +DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max) +DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max) + +DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum) +DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum) +DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum) + +DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum) +DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum) +DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum) + +static inline float16 abd_h(float16 a, float16 b, float_status *s) +{ + return float16_abs(float16_sub(a, b, s)); +} + +static inline float32 abd_s(float32 a, float32 b, float_status *s) +{ + return float32_abs(float32_sub(a, b, s)); +} + +static inline float64 abd_d(float64 a, float64 b, float_status *s) +{ + return float64_abs(float64_sub(a, b, s)); +} + +DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h) +DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s) +DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d) + +static inline float64 scalbn_d(float64 a, int64_t b, float_status *s) +{ + int b_int = MIN(MAX(b, INT_MIN), INT_MAX); + return float64_scalbn(a, b_int, s); +} + +DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn) +DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn) +DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d) + +DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh) +DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs) +DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd) + +#undef DO_ZPZZ_FP + +/* Three-operand expander, with one scalar operand, controlled by + * a predicate, with the extra float_status parameter. 
+ */ +#define DO_ZPZS_FP(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \ + void *status, uint32_t desc) \ +{ \ + intptr_t i = simd_oprsz(desc); \ + uint64_t *g = vg; \ + TYPE mm = scalar; \ + do { \ + uint64_t pg = g[(i - 1) >> 6]; \ + do { \ + i -= sizeof(TYPE); \ + if (likely((pg >> (i & 63)) & 1)) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \ + } \ + } while (i & 63); \ + } while (i != 0); \ +} + +DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add) +DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add) +DO_ZPZS_FP(sve_fadds_d, float64, , float64_add) + +DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub) +DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub) +DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub) + +DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul) +DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul) +DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul) + +static inline float16 subr_h(float16 a, float16 b, float_status *s) +{ + return float16_sub(b, a, s); +} + +static inline float32 subr_s(float32 a, float32 b, float_status *s) +{ + return float32_sub(b, a, s); +} + +static inline float64 subr_d(float64 a, float64 b, float_status *s) +{ + return float64_sub(b, a, s); +} + +DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h) +DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s) +DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d) + +DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum) +DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum) +DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum) + +DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum) +DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum) +DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum) + +DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max) +DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max) +DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max) + +DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min) +DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min) +DO_ZPZS_FP(sve_fmins_d, float64, , float64_min) + +/* Fully general two-operand expander, controlled by a predicate, + * With the extra float_status parameter. + */ +#define DO_ZPZ_FP(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \ +{ \ + intptr_t i = simd_oprsz(desc); \ + uint64_t *g = vg; \ + do { \ + uint64_t pg = g[(i - 1) >> 6]; \ + do { \ + i -= sizeof(TYPE); \ + if (likely((pg >> (i & 63)) & 1)) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + *(TYPE *)(vd + H(i)) = OP(nn, status); \ + } \ + } while (i & 63); \ + } while (i != 0); \ +} + +/* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore + * FZ16. When converting from fp16, this affects flushing input denormals; + * when converting to fp16, this affects flushing output denormals. 
+ */ +static inline float32 sve_f16_to_f32(float16 f, float_status *fpst) +{ + flag save = get_flush_inputs_to_zero(fpst); + float32 ret; + + set_flush_inputs_to_zero(false, fpst); + ret = float16_to_float32(f, true, fpst); + set_flush_inputs_to_zero(save, fpst); + return ret; +} + +static inline float64 sve_f16_to_f64(float16 f, float_status *fpst) +{ + flag save = get_flush_inputs_to_zero(fpst); + float64 ret; + + set_flush_inputs_to_zero(false, fpst); + ret = float16_to_float64(f, true, fpst); + set_flush_inputs_to_zero(save, fpst); + return ret; +} + +static inline float16 sve_f32_to_f16(float32 f, float_status *fpst) +{ + flag save = get_flush_to_zero(fpst); + float16 ret; + + set_flush_to_zero(false, fpst); + ret = float32_to_float16(f, true, fpst); + set_flush_to_zero(save, fpst); + return ret; +} + +static inline float16 sve_f64_to_f16(float64 f, float_status *fpst) +{ + flag save = get_flush_to_zero(fpst); + float16 ret; + + set_flush_to_zero(false, fpst); + ret = float64_to_float16(f, true, fpst); + set_flush_to_zero(save, fpst); + return ret; +} + +static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s) +{ + if (float16_is_any_nan(f)) { + float_raise(float_flag_invalid, s); + return 0; + } + return float16_to_int16_round_to_zero(f, s); +} + +static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s) +{ + if (float16_is_any_nan(f)) { + float_raise(float_flag_invalid, s); + return 0; + } + return float16_to_int64_round_to_zero(f, s); +} + +static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s) +{ + if (float32_is_any_nan(f)) { + float_raise(float_flag_invalid, s); + return 0; + } + return float32_to_int64_round_to_zero(f, s); +} + +static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s) +{ + if (float64_is_any_nan(f)) { + float_raise(float_flag_invalid, s); + return 0; + } + return float64_to_int64_round_to_zero(f, s); +} + +static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s) +{ + if (float16_is_any_nan(f)) { + float_raise(float_flag_invalid, s); + return 0; + } + return float16_to_uint16_round_to_zero(f, s); +} + +static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s) +{ + if (float16_is_any_nan(f)) { + float_raise(float_flag_invalid, s); + return 0; + } + return float16_to_uint64_round_to_zero(f, s); +} + +static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s) +{ + if (float32_is_any_nan(f)) { + float_raise(float_flag_invalid, s); + return 0; + } + return float32_to_uint64_round_to_zero(f, s); +} + +static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s) +{ + if (float64_is_any_nan(f)) { + float_raise(float_flag_invalid, s); + return 0; + } + return float64_to_uint64_round_to_zero(f, s); +} + +DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16) +DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32) +DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16) +DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64) +DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32) +DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64) + +DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz) +DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh) +DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs) +DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz) +DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz) +DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd) +DO_ZPZ_FP(sve_fcvtzs_dd, 
uint64_t, , vfp_float64_to_int64_rtz) + +DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz) +DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh) +DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs) +DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz) +DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz) +DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd) +DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz) + +DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth) +DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints) +DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd) + +DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int) +DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int) +DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int) + +DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16) +DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32) +DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64) + +DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt) +DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt) +DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt) + +DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16) +DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16) +DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32) +DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64) +DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16) +DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32) +DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64) + +DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16) +DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16) +DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32) +DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64) +DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16) +DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32) +DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64) + +#undef DO_ZPZ_FP + +/* 4-operand predicated multiply-add. This requires 7 operands to pass + * "properly", so we need to encode some of the registers into DESC. 
+ */ +QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 20 > 32); + +static void do_fmla_zpzzz_h(CPUARMState *env, void *vg, uint32_t desc, + uint16_t neg1, uint16_t neg3) +{ + intptr_t i = simd_oprsz(desc); + unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5); + unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5); + unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5); + unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5); + void *vd = &env->vfp.zregs[rd]; + void *vn = &env->vfp.zregs[rn]; + void *vm = &env->vfp.zregs[rm]; + void *va = &env->vfp.zregs[ra]; + uint64_t *g = vg; + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + i -= 2; + if (likely((pg >> (i & 63)) & 1)) { + float16 e1, e2, e3, r; + + e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1; + e2 = *(uint16_t *)(vm + H1_2(i)); + e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3; + r = float16_muladd(e1, e2, e3, 0, &env->vfp.fp_status); + *(uint16_t *)(vd + H1_2(i)) = r; + } + } while (i & 63); + } while (i != 0); +} + +void HELPER(sve_fmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc) +{ + do_fmla_zpzzz_h(env, vg, desc, 0, 0); +} + +void HELPER(sve_fmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc) +{ + do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0); +} + +void HELPER(sve_fnmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc) +{ + do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0x8000); +} + +void HELPER(sve_fnmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc) +{ + do_fmla_zpzzz_h(env, vg, desc, 0, 0x8000); +} + +static void do_fmla_zpzzz_s(CPUARMState *env, void *vg, uint32_t desc, + uint32_t neg1, uint32_t neg3) +{ + intptr_t i = simd_oprsz(desc); + unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5); + unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5); + unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5); + unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5); + void *vd = &env->vfp.zregs[rd]; + void *vn = &env->vfp.zregs[rn]; + void *vm = &env->vfp.zregs[rm]; + void *va = &env->vfp.zregs[ra]; + uint64_t *g = vg; + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + i -= 4; + if (likely((pg >> (i & 63)) & 1)) { + float32 e1, e2, e3, r; + + e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1; + e2 = *(uint32_t *)(vm + H1_4(i)); + e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3; + r = float32_muladd(e1, e2, e3, 0, &env->vfp.fp_status); + *(uint32_t *)(vd + H1_4(i)) = r; + } + } while (i & 63); + } while (i != 0); +} + +void HELPER(sve_fmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc) +{ + do_fmla_zpzzz_s(env, vg, desc, 0, 0); +} + +void HELPER(sve_fmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc) +{ + do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0); +} + +void HELPER(sve_fnmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc) +{ + do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0x80000000); +} + +void HELPER(sve_fnmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc) +{ + do_fmla_zpzzz_s(env, vg, desc, 0, 0x80000000); +} + +static void do_fmla_zpzzz_d(CPUARMState *env, void *vg, uint32_t desc, + uint64_t neg1, uint64_t neg3) +{ + intptr_t i = simd_oprsz(desc); + unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5); + unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5); + unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5); + unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5); + void *vd = &env->vfp.zregs[rd]; + void *vn = &env->vfp.zregs[rn]; + void *vm = &env->vfp.zregs[rm]; + void *va = &env->vfp.zregs[ra]; + uint64_t *g = vg; + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + i -= 8; + if (likely((pg >> (i & 63)) & 1)) { + float64 e1, e2, e3, r; 
+ + e1 = *(uint64_t *)(vn + i) ^ neg1; + e2 = *(uint64_t *)(vm + i); + e3 = *(uint64_t *)(va + i) ^ neg3; + r = float64_muladd(e1, e2, e3, 0, &env->vfp.fp_status); + *(uint64_t *)(vd + i) = r; + } + } while (i & 63); + } while (i != 0); +} + +void HELPER(sve_fmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc) +{ + do_fmla_zpzzz_d(env, vg, desc, 0, 0); +} + +void HELPER(sve_fmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc) +{ + do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, 0); +} + +void HELPER(sve_fnmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc) +{ + do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, INT64_MIN); +} + +void HELPER(sve_fnmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc) +{ + do_fmla_zpzzz_d(env, vg, desc, 0, INT64_MIN); +} + +/* Two operand floating-point comparison controlled by a predicate. + * Unlike the integer version, we are not allowed to optimistically + * compare operands, since the comparison may have side effects wrt + * the FPSR. + */ +#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \ + void *status, uint32_t desc) \ +{ \ + intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ + uint64_t *d = vd, *g = vg; \ + do { \ + uint64_t out = 0, pg = g[j]; \ + do { \ + i -= sizeof(TYPE), out <<= sizeof(TYPE); \ + if (likely((pg >> (i & 63)) & 1)) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + TYPE mm = *(TYPE *)(vm + H(i)); \ + out |= OP(TYPE, nn, mm, status); \ + } \ + } while (i & 63); \ + d[j--] = out; \ + } while (i > 0); \ +} + +#define DO_FPCMP_PPZZ_H(NAME, OP) \ + DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP) +#define DO_FPCMP_PPZZ_S(NAME, OP) \ + DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP) +#define DO_FPCMP_PPZZ_D(NAME, OP) \ + DO_FPCMP_PPZZ(NAME##_d, float64, , OP) + +#define DO_FPCMP_PPZZ_ALL(NAME, OP) \ + DO_FPCMP_PPZZ_H(NAME, OP) \ + DO_FPCMP_PPZZ_S(NAME, OP) \ + DO_FPCMP_PPZZ_D(NAME, OP) + +#define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0 +#define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0 +#define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0 +#define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0 +#define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0 +#define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0 +#define DO_FCMUO(TYPE, X, Y, ST) \ + TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered +#define DO_FACGE(TYPE, X, Y, ST) \ + TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0 +#define DO_FACGT(TYPE, X, Y, ST) \ + TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0 + +DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE) +DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT) +DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ) +DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE) +DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO) +DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE) +DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT) + +#undef DO_FPCMP_PPZZ_ALL +#undef DO_FPCMP_PPZZ_D +#undef DO_FPCMP_PPZZ_S +#undef DO_FPCMP_PPZZ_H +#undef DO_FPCMP_PPZZ + +/* One operand floating-point comparison against zero, controlled + * by a predicate. 
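+ *
+ * As in the two-operand forms above, the result is assembled one
+ * predicate bit per vector byte: e.g. for float32 the flag for
+ * element k of each 64-byte chunk lands in bit 4*k of the output
+ * word, matching the SVE predicate layout.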
+ */ +#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vg, \ + void *status, uint32_t desc) \ +{ \ + intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \ + uint64_t *d = vd, *g = vg; \ + do { \ + uint64_t out = 0, pg = g[j]; \ + do { \ + i -= sizeof(TYPE), out <<= sizeof(TYPE); \ + if ((pg >> (i & 63)) & 1) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + out |= OP(TYPE, nn, 0, status); \ + } \ + } while (i & 63); \ + d[j--] = out; \ + } while (i > 0); \ +} + +#define DO_FPCMP_PPZ0_H(NAME, OP) \ + DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP) +#define DO_FPCMP_PPZ0_S(NAME, OP) \ + DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP) +#define DO_FPCMP_PPZ0_D(NAME, OP) \ + DO_FPCMP_PPZ0(NAME##_d, float64, , OP) + +#define DO_FPCMP_PPZ0_ALL(NAME, OP) \ + DO_FPCMP_PPZ0_H(NAME, OP) \ + DO_FPCMP_PPZ0_S(NAME, OP) \ + DO_FPCMP_PPZ0_D(NAME, OP) + +DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE) +DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT) +DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE) +DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT) +DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ) +DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE) + +/* FP Trig Multiply-Add. */ + +void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) +{ + static const float16 coeff[16] = { + 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + }; + intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16); + intptr_t x = simd_data(desc); + float16 *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i++) { + float16 mm = m[i]; + intptr_t xx = x; + if (float16_is_neg(mm)) { + mm = float16_abs(mm); + xx += 8; + } + d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs); + } +} + +void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) +{ + static const float32 coeff[16] = { + 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9, + 0x36369d6d, 0x00000000, 0x00000000, 0x00000000, + 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705, + 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000, + }; + intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32); + intptr_t x = simd_data(desc); + float32 *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i++) { + float32 mm = m[i]; + intptr_t xx = x; + if (float32_is_neg(mm)) { + mm = float32_abs(mm); + xx += 8; + } + d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs); + } +} + +void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc) +{ + static const float64 coeff[16] = { + 0x3ff0000000000000ull, 0xbfc5555555555543ull, + 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull, + 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull, + 0x3de5d8408868552full, 0x0000000000000000ull, + 0x3ff0000000000000ull, 0xbfe0000000000000ull, + 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull, + 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull, + 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull, + }; + intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64); + intptr_t x = simd_data(desc); + float64 *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i++) { + float64 mm = m[i]; + intptr_t xx = x; + if (float64_is_neg(mm)) { + mm = float64_abs(mm); + xx += 8; + } + d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs); + } +} + +/* + * FP Complex Add + */ + +void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg, + void *vs, uint32_t desc) +{ + intptr_t j, i = simd_oprsz(desc); + uint64_t *g = vg; + float16 neg_imag = float16_set_sign(0, simd_data(desc)); + float16 neg_real = float16_chs(neg_imag); + + do { + uint64_t pg = g[(i - 1) >> 
6]; + do { + float16 e0, e1, e2, e3; + + /* I holds the real index; J holds the imag index. */ + j = i - sizeof(float16); + i -= 2 * sizeof(float16); + + e0 = *(float16 *)(vn + H1_2(i)); + e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real; + e2 = *(float16 *)(vn + H1_2(j)); + e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag; + + if (likely((pg >> (i & 63)) & 1)) { + *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs); + } + if (likely((pg >> (j & 63)) & 1)) { + *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs); + } + } while (i & 63); + } while (i != 0); +} + +void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg, + void *vs, uint32_t desc) +{ + intptr_t j, i = simd_oprsz(desc); + uint64_t *g = vg; + float32 neg_imag = float32_set_sign(0, simd_data(desc)); + float32 neg_real = float32_chs(neg_imag); + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + float32 e0, e1, e2, e3; + + /* I holds the real index; J holds the imag index. */ + j = i - sizeof(float32); + i -= 2 * sizeof(float32); + + e0 = *(float32 *)(vn + H1_2(i)); + e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real; + e2 = *(float32 *)(vn + H1_2(j)); + e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag; + + if (likely((pg >> (i & 63)) & 1)) { + *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs); + } + if (likely((pg >> (j & 63)) & 1)) { + *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs); + } + } while (i & 63); + } while (i != 0); +} + +void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg, + void *vs, uint32_t desc) +{ + intptr_t j, i = simd_oprsz(desc); + uint64_t *g = vg; + float64 neg_imag = float64_set_sign(0, simd_data(desc)); + float64 neg_real = float64_chs(neg_imag); + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + float64 e0, e1, e2, e3; + + /* I holds the real index; J holds the imag index. */ + j = i - sizeof(float64); + i -= 2 * sizeof(float64); + + e0 = *(float64 *)(vn + H1_2(i)); + e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real; + e2 = *(float64 *)(vn + H1_2(j)); + e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag; + + if (likely((pg >> (i & 63)) & 1)) { + *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs); + } + if (likely((pg >> (j & 63)) & 1)) { + *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs); + } + } while (i & 63); + } while (i != 0); +} + +/* + * FP Complex Multiply + */ + +QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 22 > 32); + +void HELPER(sve_fcmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc) +{ + intptr_t j, i = simd_oprsz(desc); + unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5); + unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5); + unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5); + unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5); + unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2); + bool flip = rot & 1; + float16 neg_imag, neg_real; + void *vd = &env->vfp.zregs[rd]; + void *vn = &env->vfp.zregs[rn]; + void *vm = &env->vfp.zregs[rm]; + void *va = &env->vfp.zregs[ra]; + uint64_t *g = vg; + + neg_imag = float16_set_sign(0, (rot & 2) != 0); + neg_real = float16_set_sign(0, rot == 1 || rot == 2); + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + float16 e1, e2, e3, e4, nr, ni, mr, mi, d; + + /* I holds the real index; J holds the imag index. */ + j = i - sizeof(float16); + i -= 2 * sizeof(float16); + + nr = *(float16 *)(vn + H1_2(i)); + ni = *(float16 *)(vn + H1_2(j)); + mr = *(float16 *)(vm + H1_2(i)); + mi = *(float16 *)(vm + H1_2(j)); + + e2 = (flip ? ni : nr); + e1 = (flip ? mi : mr) ^ neg_real; + e4 = e2; + e3 = (flip ? 
mr : mi) ^ neg_imag; + + if (likely((pg >> (i & 63)) & 1)) { + d = *(float16 *)(va + H1_2(i)); + d = float16_muladd(e2, e1, d, 0, &env->vfp.fp_status_f16); + *(float16 *)(vd + H1_2(i)) = d; + } + if (likely((pg >> (j & 63)) & 1)) { + d = *(float16 *)(va + H1_2(j)); + d = float16_muladd(e4, e3, d, 0, &env->vfp.fp_status_f16); + *(float16 *)(vd + H1_2(j)) = d; + } + } while (i & 63); + } while (i != 0); +} + +void HELPER(sve_fcmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc) +{ + intptr_t j, i = simd_oprsz(desc); + unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5); + unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5); + unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5); + unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5); + unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2); + bool flip = rot & 1; + float32 neg_imag, neg_real; + void *vd = &env->vfp.zregs[rd]; + void *vn = &env->vfp.zregs[rn]; + void *vm = &env->vfp.zregs[rm]; + void *va = &env->vfp.zregs[ra]; + uint64_t *g = vg; + + neg_imag = float32_set_sign(0, (rot & 2) != 0); + neg_real = float32_set_sign(0, rot == 1 || rot == 2); + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + float32 e1, e2, e3, e4, nr, ni, mr, mi, d; + + /* I holds the real index; J holds the imag index. */ + j = i - sizeof(float32); + i -= 2 * sizeof(float32); + + nr = *(float32 *)(vn + H1_2(i)); + ni = *(float32 *)(vn + H1_2(j)); + mr = *(float32 *)(vm + H1_2(i)); + mi = *(float32 *)(vm + H1_2(j)); + + e2 = (flip ? ni : nr); + e1 = (flip ? mi : mr) ^ neg_real; + e4 = e2; + e3 = (flip ? mr : mi) ^ neg_imag; + + if (likely((pg >> (i & 63)) & 1)) { + d = *(float32 *)(va + H1_2(i)); + d = float32_muladd(e2, e1, d, 0, &env->vfp.fp_status); + *(float32 *)(vd + H1_2(i)) = d; + } + if (likely((pg >> (j & 63)) & 1)) { + d = *(float32 *)(va + H1_2(j)); + d = float32_muladd(e4, e3, d, 0, &env->vfp.fp_status); + *(float32 *)(vd + H1_2(j)) = d; + } + } while (i & 63); + } while (i != 0); +} + +void HELPER(sve_fcmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc) +{ + intptr_t j, i = simd_oprsz(desc); + unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5); + unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5); + unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5); + unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5); + unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2); + bool flip = rot & 1; + float64 neg_imag, neg_real; + void *vd = &env->vfp.zregs[rd]; + void *vn = &env->vfp.zregs[rn]; + void *vm = &env->vfp.zregs[rm]; + void *va = &env->vfp.zregs[ra]; + uint64_t *g = vg; + + neg_imag = float64_set_sign(0, (rot & 2) != 0); + neg_real = float64_set_sign(0, rot == 1 || rot == 2); + + do { + uint64_t pg = g[(i - 1) >> 6]; + do { + float64 e1, e2, e3, e4, nr, ni, mr, mi, d; + + /* I holds the real index; J holds the imag index. */ + j = i - sizeof(float64); + i -= 2 * sizeof(float64); + + nr = *(float64 *)(vn + H1_2(i)); + ni = *(float64 *)(vn + H1_2(j)); + mr = *(float64 *)(vm + H1_2(i)); + mi = *(float64 *)(vm + H1_2(j)); + + e2 = (flip ? ni : nr); + e1 = (flip ? mi : mr) ^ neg_real; + e4 = e2; + e3 = (flip ? 
mr : mi) ^ neg_imag; + + if (likely((pg >> (i & 63)) & 1)) { + d = *(float64 *)(va + H1_2(i)); + d = float64_muladd(e2, e1, d, 0, &env->vfp.fp_status); + *(float64 *)(vd + H1_2(i)) = d; + } + if (likely((pg >> (j & 63)) & 1)) { + d = *(float64 *)(va + H1_2(j)); + d = float64_muladd(e4, e3, d, 0, &env->vfp.fp_status); + *(float64 *)(vd + H1_2(j)) = d; + } + } while (i & 63); + } while (i != 0); +} + +/* + * Load contiguous data, protected by a governing predicate. + */ +#define DO_LD1(NAME, FN, TYPEE, TYPEM, H) \ +static void do_##NAME(CPUARMState *env, void *vd, void *vg, \ + target_ulong addr, intptr_t oprsz, \ + uintptr_t ra) \ +{ \ + intptr_t i = 0; \ + do { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + TYPEM m = 0; \ + if (pg & 1) { \ + m = FN(env, addr, ra); \ + } \ + *(TYPEE *)(vd + H(i)) = m; \ + i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \ + addr += sizeof(TYPEM); \ + } while (i & 15); \ + } while (i < oprsz); \ +} \ +void HELPER(NAME)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + do_##NAME(env, &env->vfp.zregs[simd_data(desc)], vg, \ + addr, simd_oprsz(desc), GETPC()); \ +} + +#define DO_LD2(NAME, FN, TYPEE, TYPEM, H) \ +void HELPER(NAME)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + intptr_t ra = GETPC(); \ + unsigned rd = simd_data(desc); \ + void *d1 = &env->vfp.zregs[rd]; \ + void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \ + for (i = 0; i < oprsz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + TYPEM m1 = 0, m2 = 0; \ + if (pg & 1) { \ + m1 = FN(env, addr, ra); \ + m2 = FN(env, addr + sizeof(TYPEM), ra); \ + } \ + *(TYPEE *)(d1 + H(i)) = m1; \ + *(TYPEE *)(d2 + H(i)) = m2; \ + i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \ + addr += 2 * sizeof(TYPEM); \ + } while (i & 15); \ + } \ +} + +#define DO_LD3(NAME, FN, TYPEE, TYPEM, H) \ +void HELPER(NAME)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + intptr_t ra = GETPC(); \ + unsigned rd = simd_data(desc); \ + void *d1 = &env->vfp.zregs[rd]; \ + void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \ + void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \ + for (i = 0; i < oprsz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + TYPEM m1 = 0, m2 = 0, m3 = 0; \ + if (pg & 1) { \ + m1 = FN(env, addr, ra); \ + m2 = FN(env, addr + sizeof(TYPEM), ra); \ + m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \ + } \ + *(TYPEE *)(d1 + H(i)) = m1; \ + *(TYPEE *)(d2 + H(i)) = m2; \ + *(TYPEE *)(d3 + H(i)) = m3; \ + i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \ + addr += 3 * sizeof(TYPEM); \ + } while (i & 15); \ + } \ +} + +#define DO_LD4(NAME, FN, TYPEE, TYPEM, H) \ +void HELPER(NAME)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + intptr_t ra = GETPC(); \ + unsigned rd = simd_data(desc); \ + void *d1 = &env->vfp.zregs[rd]; \ + void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \ + void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \ + void *d4 = &env->vfp.zregs[(rd + 3) & 31]; \ + for (i = 0; i < oprsz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + TYPEM m1 = 0, m2 = 0, m3 = 0, m4 = 0; \ + if (pg & 1) { \ + m1 = FN(env, addr, ra); \ + m2 = FN(env, addr + sizeof(TYPEM), ra); \ + m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \ + m4 = FN(env, addr + 3 * sizeof(TYPEM), ra); \ + } \ + *(TYPEE *)(d1 + H(i)) = m1; \ + *(TYPEE *)(d2 + H(i)) = m2; \ + *(TYPEE *)(d3 + H(i)) = m3; \ + 
            *(TYPEE *)(d4 + H(i)) = m4; \
+            i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
+            addr += 4 * sizeof(TYPEM); \
+        } while (i & 15); \
+    } \
+}
+
+DO_LD1(sve_ld1bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2)
+DO_LD1(sve_ld1bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2)
+DO_LD1(sve_ld1bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4)
+DO_LD1(sve_ld1bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4)
+DO_LD1(sve_ld1bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, )
+DO_LD1(sve_ld1bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, )
+
+DO_LD1(sve_ld1hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4)
+DO_LD1(sve_ld1hss_r, cpu_ldsw_data_ra, uint32_t, int16_t, H1_4)
+DO_LD1(sve_ld1hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, )
+DO_LD1(sve_ld1hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, )
+
+DO_LD1(sve_ld1sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, )
+DO_LD1(sve_ld1sds_r, cpu_ldl_data_ra, uint64_t, int32_t, )
+
+DO_LD1(sve_ld1bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
+DO_LD2(sve_ld2bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
+DO_LD3(sve_ld3bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
+DO_LD4(sve_ld4bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
+
+DO_LD1(sve_ld1hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
+DO_LD2(sve_ld2hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
+DO_LD3(sve_ld3hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
+DO_LD4(sve_ld4hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
+
+DO_LD1(sve_ld1ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
+DO_LD2(sve_ld2ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
+DO_LD3(sve_ld3ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
+DO_LD4(sve_ld4ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
+
+DO_LD1(sve_ld1dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
+DO_LD2(sve_ld2dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
+DO_LD3(sve_ld3dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
+DO_LD4(sve_ld4dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
+
+#undef DO_LD1
+#undef DO_LD2
+#undef DO_LD3
+#undef DO_LD4
+
+/*
+ * Load contiguous data, first-fault and no-fault.
+ */
+
+#ifdef CONFIG_USER_ONLY
+
+/* Fault on byte I. All bits in FFR from I are cleared. The vector
+ * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
+ * option, which leaves subsequent data unchanged.
+ */
+static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
+{
+    uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
+
+    if (i & 63) {
+        ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
+        i = ROUND_UP(i, 64);
+    }
+    for (; i < oprsz; i += 64) {
+        ffr[i / 64] = 0;
+    }
+}
+
+/* Hold the mmap lock during the operation so that there is no race
+ * between page_check_range and the load operation. We expect the
+ * usual case to have no faults at all, so we check the whole range
+ * first and if successful defer to the normal load operation.
+ *
+ * TODO: Change mmap_lock to a rwlock so that multiple readers
+ * can run simultaneously. This will probably help other uses
+ * within QEMU as well.
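+ *
+ * Worked example (hypothetical 256-bit vector of 64-bit elements, all
+ * predicate bits set): if loading element 2 would fault, elements 0
+ * and 1 keep their loaded data, elements 2 and 3 are left unchanged
+ * (the MERGE choice above), and record_fault() clears FFR from vector
+ * byte 16 upward.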
+ */
+#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
+static void do_sve_ldff1##PART(CPUARMState *env, void *vd, void *vg, \
+                               target_ulong addr, intptr_t oprsz, \
+                               bool first, uintptr_t ra) \
+{ \
+    intptr_t i = 0; \
+    do { \
+        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+        do { \
+            TYPEM m = 0; \
+            if (pg & 1) { \
+                if (!first && \
+                    unlikely(page_check_range(addr, sizeof(TYPEM), \
+                                              PAGE_READ))) { \
+                    record_fault(env, i, oprsz); \
+                    return; \
+                } \
+                m = FN(env, addr, ra); \
+                first = false; \
+            } \
+            *(TYPEE *)(vd + H(i)) = m; \
+            i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
+            addr += sizeof(TYPEM); \
+        } while (i & 15); \
+    } while (i < oprsz); \
+} \
+void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \
+                             target_ulong addr, uint32_t desc) \
+{ \
+    intptr_t oprsz = simd_oprsz(desc); \
+    unsigned rd = simd_data(desc); \
+    void *vd = &env->vfp.zregs[rd]; \
+    mmap_lock(); \
+    if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) { \
+        do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC()); \
+    } else { \
+        do_sve_ldff1##PART(env, vd, vg, addr, oprsz, true, GETPC()); \
+    } \
+    mmap_unlock(); \
+}
+
+/* No-fault loads are like first-fault loads without the
+ * first faulting special case.
+ */
+#define DO_LDNF1(PART) \
+void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg, \
+                             target_ulong addr, uint32_t desc) \
+{ \
+    intptr_t oprsz = simd_oprsz(desc); \
+    unsigned rd = simd_data(desc); \
+    void *vd = &env->vfp.zregs[rd]; \
+    mmap_lock(); \
+    if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) { \
+        do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC()); \
+    } else { \
+        do_sve_ldff1##PART(env, vd, vg, addr, oprsz, false, GETPC()); \
+    } \
+    mmap_unlock(); \
+}
+
+#else
+
+/* TODO: System mode is not yet supported.
+ * This would probably use tlb_vaddr_to_host.
+ */
+#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
+void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \
+                             target_ulong addr, uint32_t desc) \
+{ \
+    g_assert_not_reached(); \
+}
+
+#define DO_LDNF1(PART) \
+void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg, \
+                             target_ulong addr, uint32_t desc) \
+{ \
+    g_assert_not_reached(); \
+}
+
+#endif
+
+DO_LDFF1(bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
+DO_LDFF1(bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2)
+DO_LDFF1(bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2)
+DO_LDFF1(bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4)
+DO_LDFF1(bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4)
+DO_LDFF1(bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, )
+DO_LDFF1(bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, )
+
+DO_LDFF1(hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
+DO_LDFF1(hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4)
+DO_LDFF1(hss_r, cpu_ldsw_data_ra, uint32_t, int16_t, H1_4)
+DO_LDFF1(hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, )
+DO_LDFF1(hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, )
+
+DO_LDFF1(ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
+DO_LDFF1(sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, )
+DO_LDFF1(sds_r, cpu_ldl_data_ra, uint64_t, int32_t, )
+
+DO_LDFF1(dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
+
+#undef DO_LDFF1
+
+DO_LDNF1(bb_r)
+DO_LDNF1(bhu_r)
+DO_LDNF1(bhs_r)
+DO_LDNF1(bsu_r)
+DO_LDNF1(bss_r)
+DO_LDNF1(bdu_r)
+DO_LDNF1(bds_r)
+
+DO_LDNF1(hh_r)
+DO_LDNF1(hsu_r)
+DO_LDNF1(hss_r)
+DO_LDNF1(hdu_r)
+DO_LDNF1(hds_r)
+
+DO_LDNF1(ss_r)
+DO_LDNF1(sdu_r)
+DO_LDNF1(sds_r)
+
+DO_LDNF1(dd_r)
+
+#undef DO_LDNF1
+
+/*
+ * Store contiguous data, protected by a governing predicate.
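+ *
+ * The predicate is consumed sixteen vector bytes at a time (one
+ * uint16_t of predicate bits); only the low bit of each element's
+ * field is tested, so e.g. a 32-bit element is stored when bit 4*k of
+ * that halfword is set and is skipped, without touching memory,
+ * otherwise.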
+ */ +#define DO_ST1(NAME, FN, TYPEE, TYPEM, H) \ +void HELPER(NAME)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + intptr_t ra = GETPC(); \ + unsigned rd = simd_data(desc); \ + void *vd = &env->vfp.zregs[rd]; \ + for (i = 0; i < oprsz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + if (pg & 1) { \ + TYPEM m = *(TYPEE *)(vd + H(i)); \ + FN(env, addr, m, ra); \ + } \ + i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \ + addr += sizeof(TYPEM); \ + } while (i & 15); \ + } \ +} + +#define DO_ST1_D(NAME, FN, TYPEM) \ +void HELPER(NAME)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc) / 8; \ + intptr_t ra = GETPC(); \ + unsigned rd = simd_data(desc); \ + uint64_t *d = &env->vfp.zregs[rd].d[0]; \ + uint8_t *pg = vg; \ + for (i = 0; i < oprsz; i += 1) { \ + if (pg[H1(i)] & 1) { \ + FN(env, addr, d[i], ra); \ + } \ + addr += sizeof(TYPEM); \ + } \ +} + +#define DO_ST2(NAME, FN, TYPEE, TYPEM, H) \ +void HELPER(NAME)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + intptr_t ra = GETPC(); \ + unsigned rd = simd_data(desc); \ + void *d1 = &env->vfp.zregs[rd]; \ + void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \ + for (i = 0; i < oprsz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + if (pg & 1) { \ + TYPEM m1 = *(TYPEE *)(d1 + H(i)); \ + TYPEM m2 = *(TYPEE *)(d2 + H(i)); \ + FN(env, addr, m1, ra); \ + FN(env, addr + sizeof(TYPEM), m2, ra); \ + } \ + i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \ + addr += 2 * sizeof(TYPEM); \ + } while (i & 15); \ + } \ +} + +#define DO_ST3(NAME, FN, TYPEE, TYPEM, H) \ +void HELPER(NAME)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + intptr_t ra = GETPC(); \ + unsigned rd = simd_data(desc); \ + void *d1 = &env->vfp.zregs[rd]; \ + void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \ + void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \ + for (i = 0; i < oprsz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + if (pg & 1) { \ + TYPEM m1 = *(TYPEE *)(d1 + H(i)); \ + TYPEM m2 = *(TYPEE *)(d2 + H(i)); \ + TYPEM m3 = *(TYPEE *)(d3 + H(i)); \ + FN(env, addr, m1, ra); \ + FN(env, addr + sizeof(TYPEM), m2, ra); \ + FN(env, addr + 2 * sizeof(TYPEM), m3, ra); \ + } \ + i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \ + addr += 3 * sizeof(TYPEM); \ + } while (i & 15); \ + } \ +} + +#define DO_ST4(NAME, FN, TYPEE, TYPEM, H) \ +void HELPER(NAME)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + intptr_t ra = GETPC(); \ + unsigned rd = simd_data(desc); \ + void *d1 = &env->vfp.zregs[rd]; \ + void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \ + void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \ + void *d4 = &env->vfp.zregs[(rd + 3) & 31]; \ + for (i = 0; i < oprsz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + if (pg & 1) { \ + TYPEM m1 = *(TYPEE *)(d1 + H(i)); \ + TYPEM m2 = *(TYPEE *)(d2 + H(i)); \ + TYPEM m3 = *(TYPEE *)(d3 + H(i)); \ + TYPEM m4 = *(TYPEE *)(d4 + H(i)); \ + FN(env, addr, m1, ra); \ + FN(env, addr + sizeof(TYPEM), m2, ra); \ + FN(env, addr + 2 * sizeof(TYPEM), m3, ra); \ + FN(env, addr + 3 * sizeof(TYPEM), m4, ra); \ + } \ + i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \ + addr += 4 * sizeof(TYPEM); \ + } while (i & 15); \ + } \ +} + +DO_ST1(sve_st1bh_r, cpu_stb_data_ra, uint16_t, uint8_t, H1_2) 
+DO_ST1(sve_st1bs_r, cpu_stb_data_ra, uint32_t, uint8_t, H1_4) +DO_ST1_D(sve_st1bd_r, cpu_stb_data_ra, uint8_t) + +DO_ST1(sve_st1hs_r, cpu_stw_data_ra, uint32_t, uint16_t, H1_4) +DO_ST1_D(sve_st1hd_r, cpu_stw_data_ra, uint16_t) + +DO_ST1_D(sve_st1sd_r, cpu_stl_data_ra, uint32_t) + +DO_ST1(sve_st1bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1) +DO_ST2(sve_st2bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1) +DO_ST3(sve_st3bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1) +DO_ST4(sve_st4bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1) + +DO_ST1(sve_st1hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2) +DO_ST2(sve_st2hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2) +DO_ST3(sve_st3hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2) +DO_ST4(sve_st4hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2) + +DO_ST1(sve_st1ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4) +DO_ST2(sve_st2ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4) +DO_ST3(sve_st3ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4) +DO_ST4(sve_st4ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4) + +DO_ST1_D(sve_st1dd_r, cpu_stq_data_ra, uint64_t) + +void HELPER(sve_st2dd_r)(CPUARMState *env, void *vg, + target_ulong addr, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc) / 8; + intptr_t ra = GETPC(); + unsigned rd = simd_data(desc); + uint64_t *d1 = &env->vfp.zregs[rd].d[0]; + uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0]; + uint8_t *pg = vg; + + for (i = 0; i < oprsz; i += 1) { + if (pg[H1(i)] & 1) { + cpu_stq_data_ra(env, addr, d1[i], ra); + cpu_stq_data_ra(env, addr + 8, d2[i], ra); + } + addr += 2 * 8; + } +} + +void HELPER(sve_st3dd_r)(CPUARMState *env, void *vg, + target_ulong addr, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc) / 8; + intptr_t ra = GETPC(); + unsigned rd = simd_data(desc); + uint64_t *d1 = &env->vfp.zregs[rd].d[0]; + uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0]; + uint64_t *d3 = &env->vfp.zregs[(rd + 2) & 31].d[0]; + uint8_t *pg = vg; + + for (i = 0; i < oprsz; i += 1) { + if (pg[H1(i)] & 1) { + cpu_stq_data_ra(env, addr, d1[i], ra); + cpu_stq_data_ra(env, addr + 8, d2[i], ra); + cpu_stq_data_ra(env, addr + 16, d3[i], ra); + } + addr += 3 * 8; + } +} + +void HELPER(sve_st4dd_r)(CPUARMState *env, void *vg, + target_ulong addr, uint32_t desc) +{ + intptr_t i, oprsz = simd_oprsz(desc) / 8; + intptr_t ra = GETPC(); + unsigned rd = simd_data(desc); + uint64_t *d1 = &env->vfp.zregs[rd].d[0]; + uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0]; + uint64_t *d3 = &env->vfp.zregs[(rd + 2) & 31].d[0]; + uint64_t *d4 = &env->vfp.zregs[(rd + 3) & 31].d[0]; + uint8_t *pg = vg; + + for (i = 0; i < oprsz; i += 1) { + if (pg[H1(i)] & 1) { + cpu_stq_data_ra(env, addr, d1[i], ra); + cpu_stq_data_ra(env, addr + 8, d2[i], ra); + cpu_stq_data_ra(env, addr + 16, d3[i], ra); + cpu_stq_data_ra(env, addr + 24, d4[i], ra); + } + addr += 4 * 8; + } +} + +/* Loads with a vector index. 
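+ *
+ * The offsets come from vector Zm rather than from a scalar stride:
+ * very roughly, active element k is loaded from
+ * base + ((target_ulong)zm[k] << scale), with scale equal to 0 for the
+ * unscaled forms.  (Sketch only; the loops below also apply the
+ * governing predicate and the H*() host-endian adjustments.)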
+ */
+
+#define DO_LD1_ZPZ_S(NAME, TYPEI, TYPEM, FN) \
+void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
+                  target_ulong base, uint32_t desc) \
+{ \
+    intptr_t i, oprsz = simd_oprsz(desc); \
+    unsigned scale = simd_data(desc); \
+    uintptr_t ra = GETPC(); \
+    for (i = 0; i < oprsz; ) { \
+        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+        do { \
+            TYPEM m = 0; \
+            if (pg & 1) { \
+                target_ulong off = *(TYPEI *)(vm + H1_4(i)); \
+                m = FN(env, base + (off << scale), ra); \
+            } \
+            *(uint32_t *)(vd + H1_4(i)) = m; \
+            i += 4, pg >>= 4; \
+        } while (i & 15); \
+    } \
+}
+
+#define DO_LD1_ZPZ_D(NAME, TYPEI, TYPEM, FN) \
+void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
+                  target_ulong base, uint32_t desc) \
+{ \
+    intptr_t i, oprsz = simd_oprsz(desc) / 8; \
+    unsigned scale = simd_data(desc); \
+    uintptr_t ra = GETPC(); \
+    uint64_t *d = vd, *m = vm; uint8_t *pg = vg; \
+    for (i = 0; i < oprsz; i++) { \
+        TYPEM mm = 0; \
+        if (pg[H1(i)] & 1) { \
+            target_ulong off = (TYPEI)m[i]; \
+            mm = FN(env, base + (off << scale), ra); \
+        } \
+        d[i] = mm; \
+    } \
+}
+
+DO_LD1_ZPZ_S(sve_ldbsu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
+DO_LD1_ZPZ_S(sve_ldhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
+DO_LD1_ZPZ_S(sve_ldssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
+DO_LD1_ZPZ_S(sve_ldbss_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
+DO_LD1_ZPZ_S(sve_ldhss_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
+
+DO_LD1_ZPZ_S(sve_ldbsu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
+DO_LD1_ZPZ_S(sve_ldhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
+DO_LD1_ZPZ_S(sve_ldssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
+DO_LD1_ZPZ_S(sve_ldbss_zss, int32_t, int8_t, cpu_ldub_data_ra)
+DO_LD1_ZPZ_S(sve_ldhss_zss, int32_t, int16_t, cpu_lduw_data_ra)
+
+DO_LD1_ZPZ_D(sve_ldbdu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
+DO_LD1_ZPZ_D(sve_ldhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
+DO_LD1_ZPZ_D(sve_ldsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
+DO_LD1_ZPZ_D(sve_ldddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
+DO_LD1_ZPZ_D(sve_ldbds_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
+DO_LD1_ZPZ_D(sve_ldhds_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
+DO_LD1_ZPZ_D(sve_ldsds_zsu, uint32_t, int32_t, cpu_ldl_data_ra)
+
+DO_LD1_ZPZ_D(sve_ldbdu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
+DO_LD1_ZPZ_D(sve_ldhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
+DO_LD1_ZPZ_D(sve_ldsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
+DO_LD1_ZPZ_D(sve_ldddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
+DO_LD1_ZPZ_D(sve_ldbds_zss, int32_t, int8_t, cpu_ldub_data_ra)
+DO_LD1_ZPZ_D(sve_ldhds_zss, int32_t, int16_t, cpu_lduw_data_ra)
+DO_LD1_ZPZ_D(sve_ldsds_zss, int32_t, int32_t, cpu_ldl_data_ra)
+
+DO_LD1_ZPZ_D(sve_ldbdu_zd, uint64_t, uint8_t, cpu_ldub_data_ra)
+DO_LD1_ZPZ_D(sve_ldhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
+DO_LD1_ZPZ_D(sve_ldsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
+DO_LD1_ZPZ_D(sve_ldddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
+DO_LD1_ZPZ_D(sve_ldbds_zd, uint64_t, int8_t, cpu_ldub_data_ra)
+DO_LD1_ZPZ_D(sve_ldhds_zd, uint64_t, int16_t, cpu_lduw_data_ra)
+DO_LD1_ZPZ_D(sve_ldsds_zd, uint64_t, int32_t, cpu_ldl_data_ra)
+
+/* First fault loads with a vector index.
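+ *
+ * Only the first active element may take a real fault; later active
+ * elements are probed with page_check_range() and, if the access
+ * would fault, record_fault() updates the FFR instead, just as for
+ * the contiguous first-fault loads above.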
+ */
+
+#ifdef CONFIG_USER_ONLY
+
+#define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H) \
+void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
+                  target_ulong base, uint32_t desc) \
+{ \
+    intptr_t i, oprsz = simd_oprsz(desc); \
+    unsigned scale = simd_data(desc); \
+    uintptr_t ra = GETPC(); \
+    bool first = true; \
+    mmap_lock(); \
+    for (i = 0; i < oprsz; ) { \
+        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+        do { \
+            TYPEM m = 0; \
+            if (pg & 1) { \
+                target_ulong off = *(TYPEI *)(vm + H(i)); \
+                target_ulong addr = base + (off << scale); \
+                if (!first && \
+                    page_check_range(addr, sizeof(TYPEM), PAGE_READ)) { \
+                    record_fault(env, i, oprsz); \
+                    goto exit; \
+                } \
+                m = FN(env, addr, ra); \
+                first = false; \
+            } \
+            *(TYPEE *)(vd + H(i)) = m; \
+            i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
+        } while (i & 15); \
+    } \
+ exit: \
+    mmap_unlock(); \
+}
+
+#else
+
+#define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H) \
+void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
+                  target_ulong base, uint32_t desc) \
+{ \
+    g_assert_not_reached(); \
+}
+
+#endif
+
+#define DO_LDFF1_ZPZ_S(NAME, TYPEI, TYPEM, FN) \
+    DO_LDFF1_ZPZ(NAME, uint32_t, TYPEI, TYPEM, FN, H1_4)
+#define DO_LDFF1_ZPZ_D(NAME, TYPEI, TYPEM, FN) \
+    DO_LDFF1_ZPZ(NAME, uint64_t, TYPEI, TYPEM, FN, )
+
+DO_LDFF1_ZPZ_S(sve_ldffbsu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
+DO_LDFF1_ZPZ_S(sve_ldffhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
+DO_LDFF1_ZPZ_S(sve_ldffssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
+DO_LDFF1_ZPZ_S(sve_ldffbss_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
+DO_LDFF1_ZPZ_S(sve_ldffhss_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
+
+DO_LDFF1_ZPZ_S(sve_ldffbsu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
+DO_LDFF1_ZPZ_S(sve_ldffhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
+DO_LDFF1_ZPZ_S(sve_ldffssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
+DO_LDFF1_ZPZ_S(sve_ldffbss_zss, int32_t, int8_t, cpu_ldub_data_ra)
+DO_LDFF1_ZPZ_S(sve_ldffhss_zss, int32_t, int16_t, cpu_lduw_data_ra)
+
+DO_LDFF1_ZPZ_D(sve_ldffbdu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffbds_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffhds_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffsds_zsu, uint32_t, int32_t, cpu_ldl_data_ra)
+
+DO_LDFF1_ZPZ_D(sve_ldffbdu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffbds_zss, int32_t, int8_t, cpu_ldub_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffhds_zss, int32_t, int16_t, cpu_lduw_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffsds_zss, int32_t, int32_t, cpu_ldl_data_ra)
+
+DO_LDFF1_ZPZ_D(sve_ldffbdu_zd, uint64_t, uint8_t, cpu_ldub_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffbds_zd, uint64_t, int8_t, cpu_ldub_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffhds_zd, uint64_t, int16_t, cpu_lduw_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffsds_zd, uint64_t, int32_t, cpu_ldl_data_ra)
+
+/* Stores with a vector index.
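+ *
+ * There is no first-fault form of a scatter store: each active element
+ * performs an ordinary (possibly faulting) store.  In rough terms, for
+ * the 64-bit-offset case:
+ *
+ *     for (k = 0; k < elements; k++) {
+ *         if (pred[k]) {
+ *             store(base + ((target_ulong)zm[k] << scale), zd[k]);
+ *         }
+ *     }
+ *
+ * where pred/zm/zd/store are placeholders for the predicate, offset
+ * vector, data vector and per-size store function.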
*/ + +#define DO_ST1_ZPZ_S(NAME, TYPEI, FN) \ +void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \ + target_ulong base, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + unsigned scale = simd_data(desc); \ + uintptr_t ra = GETPC(); \ + for (i = 0; i < oprsz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + if (likely(pg & 1)) { \ + target_ulong off = *(TYPEI *)(vm + H1_4(i)); \ + uint32_t d = *(uint32_t *)(vd + H1_4(i)); \ + FN(env, base + (off << scale), d, ra); \ + } \ + i += sizeof(uint32_t), pg >>= sizeof(uint32_t); \ + } while (i & 15); \ + } \ +} + +#define DO_ST1_ZPZ_D(NAME, TYPEI, FN) \ +void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \ + target_ulong base, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc) / 8; \ + unsigned scale = simd_data(desc); \ + uintptr_t ra = GETPC(); \ + uint64_t *d = vd, *m = vm; uint8_t *pg = vg; \ + for (i = 0; i < oprsz; i++) { \ + if (likely(pg[H1(i)] & 1)) { \ + target_ulong off = (target_ulong)(TYPEI)m[i] << scale; \ + FN(env, base + off, d[i], ra); \ + } \ + } \ +} + +DO_ST1_ZPZ_S(sve_stbs_zsu, uint32_t, cpu_stb_data_ra) +DO_ST1_ZPZ_S(sve_sths_zsu, uint32_t, cpu_stw_data_ra) +DO_ST1_ZPZ_S(sve_stss_zsu, uint32_t, cpu_stl_data_ra) + +DO_ST1_ZPZ_S(sve_stbs_zss, int32_t, cpu_stb_data_ra) +DO_ST1_ZPZ_S(sve_sths_zss, int32_t, cpu_stw_data_ra) +DO_ST1_ZPZ_S(sve_stss_zss, int32_t, cpu_stl_data_ra) + +DO_ST1_ZPZ_D(sve_stbd_zsu, uint32_t, cpu_stb_data_ra) +DO_ST1_ZPZ_D(sve_sthd_zsu, uint32_t, cpu_stw_data_ra) +DO_ST1_ZPZ_D(sve_stsd_zsu, uint32_t, cpu_stl_data_ra) +DO_ST1_ZPZ_D(sve_stdd_zsu, uint32_t, cpu_stq_data_ra) + +DO_ST1_ZPZ_D(sve_stbd_zss, int32_t, cpu_stb_data_ra) +DO_ST1_ZPZ_D(sve_sthd_zss, int32_t, cpu_stw_data_ra) +DO_ST1_ZPZ_D(sve_stsd_zss, int32_t, cpu_stl_data_ra) +DO_ST1_ZPZ_D(sve_stdd_zss, int32_t, cpu_stq_data_ra) + +DO_ST1_ZPZ_D(sve_stbd_zd, uint64_t, cpu_stb_data_ra) +DO_ST1_ZPZ_D(sve_sthd_zd, uint64_t, cpu_stw_data_ra) +DO_ST1_ZPZ_D(sve_stsd_zd, uint64_t, cpu_stl_data_ra) +DO_ST1_ZPZ_D(sve_stdd_zd, uint64_t, cpu_stq_data_ra) diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c index 8d8a4cecb0..45a6c2a3aa 100644 --- a/target/arm/translate-a64.c +++ b/target/arm/translate-a64.c @@ -640,6 +640,16 @@ static void gen_gvec_op3(DisasContext *s, bool is_q, int rd, vec_full_reg_size(s), gvec_op); } +/* Expand a 3-operand operation using an out-of-line helper. */ +static void gen_gvec_op3_ool(DisasContext *s, bool is_q, int rd, + int rn, int rm, int data, gen_helper_gvec_3 *fn) +{ + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + is_q ? 16 : 8, vec_full_reg_size(s), data, fn); +} + /* Expand a 3-operand + env pointer operation using * an out-of-line helper. 
*/ @@ -1623,11 +1633,10 @@ static void handle_sys(DisasContext *s, uint32_t insn, bool isread, default: break; } - if ((ri->type & ARM_CP_SVE) && !sve_access_check(s)) { - return; - } if ((ri->type & ARM_CP_FPU) && !fp_access_check(s)) { return; + } else if ((ri->type & ARM_CP_SVE) && !sve_access_check(s)) { + return; } if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) { @@ -11336,6 +11345,14 @@ static void disas_simd_three_reg_same_extra(DisasContext *s, uint32_t insn) } feature = ARM_FEATURE_V8_RDM; break; + case 0x02: /* SDOT (vector) */ + case 0x12: /* UDOT (vector) */ + if (size != MO_32) { + unallocated_encoding(s); + return; + } + feature = ARM_FEATURE_V8_DOTPROD; + break; case 0x8: /* FCMLA, #0 */ case 0x9: /* FCMLA, #90 */ case 0xa: /* FCMLA, #180 */ @@ -11389,6 +11406,11 @@ static void disas_simd_three_reg_same_extra(DisasContext *s, uint32_t insn) } return; + case 0x2: /* SDOT / UDOT */ + gen_gvec_op3_ool(s, is_q, rd, rn, rm, 0, + u ? gen_helper_gvec_udot_b : gen_helper_gvec_sdot_b); + return; + case 0x8: /* FCMLA, #0 */ case 0x9: /* FCMLA, #90 */ case 0xa: /* FCMLA, #180 */ @@ -12568,6 +12590,13 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn) return; } break; + case 0x0e: /* SDOT */ + case 0x1e: /* UDOT */ + if (size != MO_32 || !arm_dc_feature(s, ARM_FEATURE_V8_DOTPROD)) { + unallocated_encoding(s); + return; + } + break; case 0x11: /* FCMLA #0 */ case 0x13: /* FCMLA #90 */ case 0x15: /* FCMLA #180 */ @@ -12665,19 +12694,28 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn) } switch (16 * u + opcode) { + case 0x0e: /* SDOT */ + case 0x1e: /* UDOT */ + gen_gvec_op3_ool(s, is_q, rd, rn, rm, index, + u ? gen_helper_gvec_udot_idx_b + : gen_helper_gvec_sdot_idx_b); + return; case 0x11: /* FCMLA #0 */ case 0x13: /* FCMLA #90 */ case 0x15: /* FCMLA #180 */ case 0x17: /* FCMLA #270 */ - tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), - vec_full_reg_offset(s, rn), - vec_reg_offset(s, rm, index, size), fpst, - is_q ? 16 : 8, vec_full_reg_size(s), - extract32(insn, 13, 2), /* rot */ - size == MO_64 - ? gen_helper_gvec_fcmlas_idx - : gen_helper_gvec_fcmlah_idx); - tcg_temp_free_ptr(fpst); + { + int rot = extract32(insn, 13, 2); + int data = (index << 2) | rot; + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), fpst, + is_q ? 16 : 8, vec_full_reg_size(s), data, + size == MO_64 + ? gen_helper_gvec_fcmlas_idx + : gen_helper_gvec_fcmlah_idx); + tcg_temp_free_ptr(fpst); + } return; } diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c index 226c97579c..c080345b9c 100644 --- a/target/arm/translate-sve.c +++ b/target/arm/translate-sve.c @@ -32,6 +32,7 @@ #include "exec/log.h" #include "trace-tcg.h" #include "translate-a64.h" +#include "fpu/softfloat.h" typedef void GVecGen2sFn(unsigned, uint32_t, uint32_t, @@ -42,6 +43,10 @@ typedef void gen_helper_gvec_flags_3(TCGv_i32, TCGv_ptr, TCGv_ptr, typedef void gen_helper_gvec_flags_4(TCGv_i32, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); +typedef void gen_helper_gvec_mem(TCGv_env, TCGv_ptr, TCGv_i64, TCGv_i32); +typedef void gen_helper_gvec_mem_scatter(TCGv_env, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_i64, TCGv_i32); + /* * Helpers for extracting complex instruction fields. */ @@ -82,6 +87,15 @@ static inline int expand_imm_sh8u(int x) return (uint8_t)x << (x & 0x100 ? 8 : 0); } +/* Convert a 2-bit memory size (msz) to a 4-bit data type (dtype) + * with unsigned data. C.f. SVE Memory Contiguous Load Group. 
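+ *
+ * The unsigned "memory size equals element size" encodings are 0
+ * (LD1B), 5 (LD1H), 10 (LD1W) and 15 (LD1D), which is exactly the
+ * table below: msz 0..3 maps to dtype 0, 5, 10, 15.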
+ */ +static inline int msz_dtype(int msz) +{ + static const uint8_t dtype[4] = { 0, 5, 10, 15 }; + return dtype[msz]; +} + /* * Include the generated decoder. */ @@ -337,6 +351,23 @@ static bool do_zpzz_ool(DisasContext *s, arg_rprr_esz *a, gen_helper_gvec_4 *fn) return true; } +/* Select active elememnts from Zn and inactive elements from Zm, + * storing the result in Zd. + */ +static void do_sel_z(DisasContext *s, int rd, int rn, int rm, int pg, int esz) +{ + static gen_helper_gvec_4 * const fns[4] = { + gen_helper_sve_sel_zpzz_b, gen_helper_sve_sel_zpzz_h, + gen_helper_sve_sel_zpzz_s, gen_helper_sve_sel_zpzz_d + }; + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_4_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + pred_full_reg_offset(s, pg), + vsz, vsz, 0, fns[esz]); +} + #define DO_ZPZZ(NAME, name) \ static bool trans_##NAME##_zpzz(DisasContext *s, arg_rprr_esz *a, \ uint32_t insn) \ @@ -387,7 +418,13 @@ static bool trans_UDIV_zpzz(DisasContext *s, arg_rprr_esz *a, uint32_t insn) return do_zpzz_ool(s, a, fns[a->esz]); } -DO_ZPZZ(SEL, sel) +static bool trans_SEL_zpzz(DisasContext *s, arg_rprr_esz *a, uint32_t insn) +{ + if (sve_access_check(s)) { + do_sel_z(s, a->rd, a->rn, a->rm, a->pg, a->esz); + } + return true; +} #undef DO_ZPZZ @@ -595,6 +632,20 @@ static bool do_clr_zp(DisasContext *s, int rd, int pg, int esz) return true; } +/* Copy Zn into Zd, storing zeros into inactive elements. */ +static void do_movz_zpz(DisasContext *s, int rd, int rn, int pg, int esz) +{ + static gen_helper_gvec_3 * const fns[4] = { + gen_helper_sve_movz_b, gen_helper_sve_movz_h, + gen_helper_sve_movz_s, gen_helper_sve_movz_d, + }; + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + pred_full_reg_offset(s, pg), + vsz, vsz, 0, fns[esz]); +} + static bool do_zpzi_ool(DisasContext *s, arg_rpri_esz *a, gen_helper_gvec_3 *fn) { @@ -3372,6 +3423,310 @@ DO_ZZI(UMIN, umin) #undef DO_ZZI +static bool trans_DOT_zzz(DisasContext *s, arg_DOT_zzz *a, uint32_t insn) +{ + static gen_helper_gvec_3 * const fns[2][2] = { + { gen_helper_gvec_sdot_b, gen_helper_gvec_sdot_h }, + { gen_helper_gvec_udot_b, gen_helper_gvec_udot_h } + }; + + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + vsz, vsz, 0, fns[a->u][a->sz]); + } + return true; +} + +static bool trans_DOT_zzx(DisasContext *s, arg_DOT_zzx *a, uint32_t insn) +{ + static gen_helper_gvec_3 * const fns[2][2] = { + { gen_helper_gvec_sdot_idx_b, gen_helper_gvec_sdot_idx_h }, + { gen_helper_gvec_udot_idx_b, gen_helper_gvec_udot_idx_h } + }; + + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + vsz, vsz, a->index, fns[a->u][a->sz]); + } + return true; +} + + +/* + *** SVE Floating Point Multiply-Add Indexed Group + */ + +static bool trans_FMLA_zzxz(DisasContext *s, arg_FMLA_zzxz *a, uint32_t insn) +{ + static gen_helper_gvec_4_ptr * const fns[3] = { + gen_helper_gvec_fmla_idx_h, + gen_helper_gvec_fmla_idx_s, + gen_helper_gvec_fmla_idx_d, + }; + + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16); + tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + 
vec_full_reg_offset(s, a->ra), + status, vsz, vsz, (a->index << 1) | a->sub, + fns[a->esz - 1]); + tcg_temp_free_ptr(status); + } + return true; +} + +/* + *** SVE Floating Point Multiply Indexed Group + */ + +static bool trans_FMUL_zzx(DisasContext *s, arg_FMUL_zzx *a, uint32_t insn) +{ + static gen_helper_gvec_3_ptr * const fns[3] = { + gen_helper_gvec_fmul_idx_h, + gen_helper_gvec_fmul_idx_s, + gen_helper_gvec_fmul_idx_d, + }; + + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16); + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + status, vsz, vsz, a->index, fns[a->esz - 1]); + tcg_temp_free_ptr(status); + } + return true; +} + +/* + *** SVE Floating Point Fast Reduction Group + */ + +typedef void gen_helper_fp_reduce(TCGv_i64, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_i32); + +static void do_reduce(DisasContext *s, arg_rpr_esz *a, + gen_helper_fp_reduce *fn) +{ + unsigned vsz = vec_full_reg_size(s); + unsigned p2vsz = pow2ceil(vsz); + TCGv_i32 t_desc = tcg_const_i32(simd_desc(vsz, p2vsz, 0)); + TCGv_ptr t_zn, t_pg, status; + TCGv_i64 temp; + + temp = tcg_temp_new_i64(); + t_zn = tcg_temp_new_ptr(); + t_pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, a->pg)); + status = get_fpstatus_ptr(a->esz == MO_16); + + fn(temp, t_zn, t_pg, status, t_desc); + tcg_temp_free_ptr(t_zn); + tcg_temp_free_ptr(t_pg); + tcg_temp_free_ptr(status); + tcg_temp_free_i32(t_desc); + + write_fp_dreg(s, a->rd, temp); + tcg_temp_free_i64(temp); +} + +#define DO_VPZ(NAME, name) \ +static bool trans_##NAME(DisasContext *s, arg_rpr_esz *a, uint32_t insn) \ +{ \ + static gen_helper_fp_reduce * const fns[3] = { \ + gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, \ + gen_helper_sve_##name##_d, \ + }; \ + if (a->esz == 0) { \ + return false; \ + } \ + if (sve_access_check(s)) { \ + do_reduce(s, a, fns[a->esz - 1]); \ + } \ + return true; \ +} + +DO_VPZ(FADDV, faddv) +DO_VPZ(FMINNMV, fminnmv) +DO_VPZ(FMAXNMV, fmaxnmv) +DO_VPZ(FMINV, fminv) +DO_VPZ(FMAXV, fmaxv) + +/* + *** SVE Floating Point Unary Operations - Unpredicated Group + */ + +static void do_zz_fp(DisasContext *s, arg_rr_esz *a, gen_helper_gvec_2_ptr *fn) +{ + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16); + + tcg_gen_gvec_2_ptr(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + status, vsz, vsz, 0, fn); + tcg_temp_free_ptr(status); +} + +static bool trans_FRECPE(DisasContext *s, arg_rr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_2_ptr * const fns[3] = { + gen_helper_gvec_frecpe_h, + gen_helper_gvec_frecpe_s, + gen_helper_gvec_frecpe_d, + }; + if (a->esz == 0) { + return false; + } + if (sve_access_check(s)) { + do_zz_fp(s, a, fns[a->esz - 1]); + } + return true; +} + +static bool trans_FRSQRTE(DisasContext *s, arg_rr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_2_ptr * const fns[3] = { + gen_helper_gvec_frsqrte_h, + gen_helper_gvec_frsqrte_s, + gen_helper_gvec_frsqrte_d, + }; + if (a->esz == 0) { + return false; + } + if (sve_access_check(s)) { + do_zz_fp(s, a, fns[a->esz - 1]); + } + return true; +} + +/* + *** SVE Floating Point Compare with Zero Group + */ + +static void do_ppz_fp(DisasContext *s, arg_rpr_esz *a, + gen_helper_gvec_3_ptr *fn) +{ + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16); + + 
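+    /* The destination here is a predicate register, so the first
+     * offset passed below is pred_full_reg_offset(), not
+     * vec_full_reg_offset().
+     */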
tcg_gen_gvec_3_ptr(pred_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + pred_full_reg_offset(s, a->pg), + status, vsz, vsz, 0, fn); + tcg_temp_free_ptr(status); +} + +#define DO_PPZ(NAME, name) \ +static bool trans_##NAME(DisasContext *s, arg_rpr_esz *a, uint32_t insn) \ +{ \ + static gen_helper_gvec_3_ptr * const fns[3] = { \ + gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, \ + gen_helper_sve_##name##_d, \ + }; \ + if (a->esz == 0) { \ + return false; \ + } \ + if (sve_access_check(s)) { \ + do_ppz_fp(s, a, fns[a->esz - 1]); \ + } \ + return true; \ +} + +DO_PPZ(FCMGE_ppz0, fcmge0) +DO_PPZ(FCMGT_ppz0, fcmgt0) +DO_PPZ(FCMLE_ppz0, fcmle0) +DO_PPZ(FCMLT_ppz0, fcmlt0) +DO_PPZ(FCMEQ_ppz0, fcmeq0) +DO_PPZ(FCMNE_ppz0, fcmne0) + +#undef DO_PPZ + +/* + *** SVE floating-point trig multiply-add coefficient + */ + +static bool trans_FTMAD(DisasContext *s, arg_FTMAD *a, uint32_t insn) +{ + static gen_helper_gvec_3_ptr * const fns[3] = { + gen_helper_sve_ftmad_h, + gen_helper_sve_ftmad_s, + gen_helper_sve_ftmad_d, + }; + + if (a->esz == 0) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16); + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + status, vsz, vsz, a->imm, fns[a->esz - 1]); + tcg_temp_free_ptr(status); + } + return true; +} + +/* + *** SVE Floating Point Accumulating Reduction Group + */ + +static bool trans_FADDA(DisasContext *s, arg_rprr_esz *a, uint32_t insn) +{ + typedef void fadda_fn(TCGv_i64, TCGv_i64, TCGv_ptr, + TCGv_ptr, TCGv_ptr, TCGv_i32); + static fadda_fn * const fns[3] = { + gen_helper_sve_fadda_h, + gen_helper_sve_fadda_s, + gen_helper_sve_fadda_d, + }; + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr t_rm, t_pg, t_fpst; + TCGv_i64 t_val; + TCGv_i32 t_desc; + + if (a->esz == 0) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + t_val = load_esz(cpu_env, vec_reg_offset(s, a->rn, 0, a->esz), a->esz); + t_rm = tcg_temp_new_ptr(); + t_pg = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(t_rm, cpu_env, vec_full_reg_offset(s, a->rm)); + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, a->pg)); + t_fpst = get_fpstatus_ptr(a->esz == MO_16); + t_desc = tcg_const_i32(simd_desc(vsz, vsz, 0)); + + fns[a->esz - 1](t_val, t_val, t_rm, t_pg, t_fpst, t_desc); + + tcg_temp_free_i32(t_desc); + tcg_temp_free_ptr(t_fpst); + tcg_temp_free_ptr(t_pg); + tcg_temp_free_ptr(t_rm); + + write_fp_dreg(s, a->rd, t_val); + tcg_temp_free_i64(t_val); + return true; +} + /* *** SVE Floating Point Arithmetic - Unpredicated Group */ @@ -3415,6 +3770,592 @@ DO_FP3(FRSQRTS, rsqrts) #undef DO_FP3 /* + *** SVE Floating Point Arithmetic - Predicated Group + */ + +static bool do_zpzz_fp(DisasContext *s, arg_rprr_esz *a, + gen_helper_gvec_4_ptr *fn) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16); + tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + pred_full_reg_offset(s, a->pg), + status, vsz, vsz, 0, fn); + tcg_temp_free_ptr(status); + } + return true; +} + +#define DO_FP3(NAME, name) \ +static bool trans_##NAME(DisasContext *s, arg_rprr_esz *a, uint32_t insn) \ +{ \ + static gen_helper_gvec_4_ptr * const fns[4] = { \ + NULL, gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, gen_helper_sve_##name##_d \ + }; \ + return 
do_zpzz_fp(s, a, fns[a->esz]); \ +} + +DO_FP3(FADD_zpzz, fadd) +DO_FP3(FSUB_zpzz, fsub) +DO_FP3(FMUL_zpzz, fmul) +DO_FP3(FMIN_zpzz, fmin) +DO_FP3(FMAX_zpzz, fmax) +DO_FP3(FMINNM_zpzz, fminnum) +DO_FP3(FMAXNM_zpzz, fmaxnum) +DO_FP3(FABD, fabd) +DO_FP3(FSCALE, fscalbn) +DO_FP3(FDIV, fdiv) +DO_FP3(FMULX, fmulx) + +#undef DO_FP3 + +typedef void gen_helper_sve_fp2scalar(TCGv_ptr, TCGv_ptr, TCGv_ptr, + TCGv_i64, TCGv_ptr, TCGv_i32); + +static void do_fp_scalar(DisasContext *s, int zd, int zn, int pg, bool is_fp16, + TCGv_i64 scalar, gen_helper_sve_fp2scalar *fn) +{ + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr t_zd, t_zn, t_pg, status; + TCGv_i32 desc; + + t_zd = tcg_temp_new_ptr(); + t_zn = tcg_temp_new_ptr(); + t_pg = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(t_zd, cpu_env, vec_full_reg_offset(s, zd)); + tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, zn)); + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); + + status = get_fpstatus_ptr(is_fp16); + desc = tcg_const_i32(simd_desc(vsz, vsz, 0)); + fn(t_zd, t_zn, t_pg, scalar, status, desc); + + tcg_temp_free_i32(desc); + tcg_temp_free_ptr(status); + tcg_temp_free_ptr(t_pg); + tcg_temp_free_ptr(t_zn); + tcg_temp_free_ptr(t_zd); +} + +static void do_fp_imm(DisasContext *s, arg_rpri_esz *a, uint64_t imm, + gen_helper_sve_fp2scalar *fn) +{ + TCGv_i64 temp = tcg_const_i64(imm); + do_fp_scalar(s, a->rd, a->rn, a->pg, a->esz == MO_16, temp, fn); + tcg_temp_free_i64(temp); +} + +#define DO_FP_IMM(NAME, name, const0, const1) \ +static bool trans_##NAME##_zpzi(DisasContext *s, arg_rpri_esz *a, \ + uint32_t insn) \ +{ \ + static gen_helper_sve_fp2scalar * const fns[3] = { \ + gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, \ + gen_helper_sve_##name##_d \ + }; \ + static uint64_t const val[3][2] = { \ + { float16_##const0, float16_##const1 }, \ + { float32_##const0, float32_##const1 }, \ + { float64_##const0, float64_##const1 }, \ + }; \ + if (a->esz == 0) { \ + return false; \ + } \ + if (sve_access_check(s)) { \ + do_fp_imm(s, a, val[a->esz - 1][a->imm], fns[a->esz - 1]); \ + } \ + return true; \ +} + +#define float16_two make_float16(0x4000) +#define float32_two make_float32(0x40000000) +#define float64_two make_float64(0x4000000000000000ULL) + +DO_FP_IMM(FADD, fadds, half, one) +DO_FP_IMM(FSUB, fsubs, half, one) +DO_FP_IMM(FMUL, fmuls, half, two) +DO_FP_IMM(FSUBR, fsubrs, half, one) +DO_FP_IMM(FMAXNM, fmaxnms, zero, one) +DO_FP_IMM(FMINNM, fminnms, zero, one) +DO_FP_IMM(FMAX, fmaxs, zero, one) +DO_FP_IMM(FMIN, fmins, zero, one) + +#undef DO_FP_IMM + +static bool do_fp_cmp(DisasContext *s, arg_rprr_esz *a, + gen_helper_gvec_4_ptr *fn) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16); + tcg_gen_gvec_4_ptr(pred_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + pred_full_reg_offset(s, a->pg), + status, vsz, vsz, 0, fn); + tcg_temp_free_ptr(status); + } + return true; +} + +#define DO_FPCMP(NAME, name) \ +static bool trans_##NAME##_ppzz(DisasContext *s, arg_rprr_esz *a, \ + uint32_t insn) \ +{ \ + static gen_helper_gvec_4_ptr * const fns[4] = { \ + NULL, gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, gen_helper_sve_##name##_d \ + }; \ + return do_fp_cmp(s, a, fns[a->esz]); \ +} + +DO_FPCMP(FCMGE, fcmge) +DO_FPCMP(FCMGT, fcmgt) +DO_FPCMP(FCMEQ, fcmeq) +DO_FPCMP(FCMNE, fcmne) +DO_FPCMP(FCMUO, fcmuo) +DO_FPCMP(FACGE, facge) +DO_FPCMP(FACGT, facgt) + +#undef 
DO_FPCMP + +static bool trans_FCADD(DisasContext *s, arg_FCADD *a, uint32_t insn) +{ + static gen_helper_gvec_4_ptr * const fns[3] = { + gen_helper_sve_fcadd_h, + gen_helper_sve_fcadd_s, + gen_helper_sve_fcadd_d + }; + + if (a->esz == 0) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16); + tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + pred_full_reg_offset(s, a->pg), + status, vsz, vsz, a->rot, fns[a->esz - 1]); + tcg_temp_free_ptr(status); + } + return true; +} + +typedef void gen_helper_sve_fmla(TCGv_env, TCGv_ptr, TCGv_i32); + +static bool do_fmla(DisasContext *s, arg_rprrr_esz *a, gen_helper_sve_fmla *fn) +{ + if (fn == NULL) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = vec_full_reg_size(s); + unsigned desc; + TCGv_i32 t_desc; + TCGv_ptr pg = tcg_temp_new_ptr(); + + /* We would need 7 operands to pass these arguments "properly". + * So we encode all the register numbers into the descriptor. + */ + desc = deposit32(a->rd, 5, 5, a->rn); + desc = deposit32(desc, 10, 5, a->rm); + desc = deposit32(desc, 15, 5, a->ra); + desc = simd_desc(vsz, vsz, desc); + + t_desc = tcg_const_i32(desc); + tcg_gen_addi_ptr(pg, cpu_env, pred_full_reg_offset(s, a->pg)); + fn(cpu_env, pg, t_desc); + tcg_temp_free_i32(t_desc); + tcg_temp_free_ptr(pg); + return true; +} + +#define DO_FMLA(NAME, name) \ +static bool trans_##NAME(DisasContext *s, arg_rprrr_esz *a, uint32_t insn) \ +{ \ + static gen_helper_sve_fmla * const fns[4] = { \ + NULL, gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, gen_helper_sve_##name##_d \ + }; \ + return do_fmla(s, a, fns[a->esz]); \ +} + +DO_FMLA(FMLA_zpzzz, fmla_zpzzz) +DO_FMLA(FMLS_zpzzz, fmls_zpzzz) +DO_FMLA(FNMLA_zpzzz, fnmla_zpzzz) +DO_FMLA(FNMLS_zpzzz, fnmls_zpzzz) + +#undef DO_FMLA + +static bool trans_FCMLA_zpzzz(DisasContext *s, + arg_FCMLA_zpzzz *a, uint32_t insn) +{ + static gen_helper_sve_fmla * const fns[3] = { + gen_helper_sve_fcmla_zpzzz_h, + gen_helper_sve_fcmla_zpzzz_s, + gen_helper_sve_fcmla_zpzzz_d, + }; + + if (a->esz == 0) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + unsigned desc; + TCGv_i32 t_desc; + TCGv_ptr pg = tcg_temp_new_ptr(); + + /* We would need 7 operands to pass these arguments "properly". + * So we encode all the register numbers into the descriptor. 
+ */ + desc = deposit32(a->rd, 5, 5, a->rn); + desc = deposit32(desc, 10, 5, a->rm); + desc = deposit32(desc, 15, 5, a->ra); + desc = deposit32(desc, 20, 2, a->rot); + desc = sextract32(desc, 0, 22); + desc = simd_desc(vsz, vsz, desc); + + t_desc = tcg_const_i32(desc); + tcg_gen_addi_ptr(pg, cpu_env, pred_full_reg_offset(s, a->pg)); + fns[a->esz - 1](cpu_env, pg, t_desc); + tcg_temp_free_i32(t_desc); + tcg_temp_free_ptr(pg); + } + return true; +} + +static bool trans_FCMLA_zzxz(DisasContext *s, arg_FCMLA_zzxz *a, uint32_t insn) +{ + static gen_helper_gvec_3_ptr * const fns[2] = { + gen_helper_gvec_fcmlah_idx, + gen_helper_gvec_fcmlas_idx, + }; + + tcg_debug_assert(a->esz == 1 || a->esz == 2); + tcg_debug_assert(a->rd == a->ra); + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16); + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + status, vsz, vsz, + a->index * 4 + a->rot, + fns[a->esz - 1]); + tcg_temp_free_ptr(status); + } + return true; +} + +/* + *** SVE Floating Point Unary Operations Predicated Group + */ + +static bool do_zpz_ptr(DisasContext *s, int rd, int rn, int pg, + bool is_fp16, gen_helper_gvec_3_ptr *fn) +{ + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = get_fpstatus_ptr(is_fp16); + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + pred_full_reg_offset(s, pg), + status, vsz, vsz, 0, fn); + tcg_temp_free_ptr(status); + } + return true; +} + +static bool trans_FCVT_sh(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvt_sh); +} + +static bool trans_FCVT_hs(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvt_hs); +} + +static bool trans_FCVT_dh(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvt_dh); +} + +static bool trans_FCVT_hd(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvt_hd); +} + +static bool trans_FCVT_ds(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvt_ds); +} + +static bool trans_FCVT_sd(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvt_sd); +} + +static bool trans_FCVTZS_hh(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvtzs_hh); +} + +static bool trans_FCVTZU_hh(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvtzu_hh); +} + +static bool trans_FCVTZS_hs(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvtzs_hs); +} + +static bool trans_FCVTZU_hs(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvtzu_hs); +} + +static bool trans_FCVTZS_hd(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvtzs_hd); +} + +static bool trans_FCVTZU_hd(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvtzu_hd); +} + +static bool trans_FCVTZS_ss(DisasContext *s, 
arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzs_ss); +} + +static bool trans_FCVTZU_ss(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzu_ss); +} + +static bool trans_FCVTZS_sd(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzs_sd); +} + +static bool trans_FCVTZU_sd(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzu_sd); +} + +static bool trans_FCVTZS_ds(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzs_ds); +} + +static bool trans_FCVTZU_ds(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzu_ds); +} + +static bool trans_FCVTZS_dd(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzs_dd); +} + +static bool trans_FCVTZU_dd(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzu_dd); +} + +static gen_helper_gvec_3_ptr * const frint_fns[3] = { + gen_helper_sve_frint_h, + gen_helper_sve_frint_s, + gen_helper_sve_frint_d +}; + +static bool trans_FRINTI(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + if (a->esz == 0) { + return false; + } + return do_zpz_ptr(s, a->rd, a->rn, a->pg, a->esz == MO_16, + frint_fns[a->esz - 1]); +} + +static bool trans_FRINTX(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_3_ptr * const fns[3] = { + gen_helper_sve_frintx_h, + gen_helper_sve_frintx_s, + gen_helper_sve_frintx_d + }; + if (a->esz == 0) { + return false; + } + return do_zpz_ptr(s, a->rd, a->rn, a->pg, a->esz == MO_16, fns[a->esz - 1]); +} + +static bool do_frint_mode(DisasContext *s, arg_rpr_esz *a, int mode) +{ + if (a->esz == 0) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_i32 tmode = tcg_const_i32(mode); + TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16); + + gen_helper_set_rmode(tmode, tmode, status); + + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + pred_full_reg_offset(s, a->pg), + status, vsz, vsz, 0, frint_fns[a->esz - 1]); + + gen_helper_set_rmode(tmode, tmode, status); + tcg_temp_free_i32(tmode); + tcg_temp_free_ptr(status); + } + return true; +} + +static bool trans_FRINTN(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_frint_mode(s, a, float_round_nearest_even); +} + +static bool trans_FRINTP(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_frint_mode(s, a, float_round_up); +} + +static bool trans_FRINTM(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_frint_mode(s, a, float_round_down); +} + +static bool trans_FRINTZ(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_frint_mode(s, a, float_round_to_zero); +} + +static bool trans_FRINTA(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_frint_mode(s, a, float_round_ties_away); +} + +static bool trans_FRECPX(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_3_ptr * const fns[3] = { + gen_helper_sve_frecpx_h, + gen_helper_sve_frecpx_s, + gen_helper_sve_frecpx_d + }; + if (a->esz == 0) { + return false; + } + return do_zpz_ptr(s, a->rd, a->rn, a->pg, a->esz == MO_16, fns[a->esz 
- 1]); +} + +static bool trans_FSQRT(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_3_ptr * const fns[3] = { + gen_helper_sve_fsqrt_h, + gen_helper_sve_fsqrt_s, + gen_helper_sve_fsqrt_d + }; + if (a->esz == 0) { + return false; + } + return do_zpz_ptr(s, a->rd, a->rn, a->pg, a->esz == MO_16, fns[a->esz - 1]); +} + +static bool trans_SCVTF_hh(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_scvt_hh); +} + +static bool trans_SCVTF_sh(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_scvt_sh); +} + +static bool trans_SCVTF_dh(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_scvt_dh); +} + +static bool trans_SCVTF_ss(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_scvt_ss); +} + +static bool trans_SCVTF_ds(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_scvt_ds); +} + +static bool trans_SCVTF_sd(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_scvt_sd); +} + +static bool trans_SCVTF_dd(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_scvt_dd); +} + +static bool trans_UCVTF_hh(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_ucvt_hh); +} + +static bool trans_UCVTF_sh(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_ucvt_sh); +} + +static bool trans_UCVTF_dh(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_ucvt_dh); +} + +static bool trans_UCVTF_ss(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_ucvt_ss); +} + +static bool trans_UCVTF_ds(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_ucvt_ds); +} + +static bool trans_UCVTF_sd(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_ucvt_sd); +} + +static bool trans_UCVTF_dd(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_ucvt_dd); +} + +/* *** SVE Memory - 32-bit Gather and Unsized Contiguous Group */ @@ -3507,6 +4448,89 @@ static void do_ldr(DisasContext *s, uint32_t vofs, uint32_t len, tcg_temp_free_i64(t0); } +/* Similarly for stores. */ +static void do_str(DisasContext *s, uint32_t vofs, uint32_t len, + int rn, int imm) +{ + uint32_t len_align = QEMU_ALIGN_DOWN(len, 8); + uint32_t len_remain = len % 8; + uint32_t nparts = len / 8 + ctpop8(len_remain); + int midx = get_mem_index(s); + TCGv_i64 addr, t0; + + addr = tcg_temp_new_i64(); + t0 = tcg_temp_new_i64(); + + /* Note that unpredicated load/store of vector/predicate registers + * are defined as a stream of bytes, which equates to little-endian + * operations on larger quantities. There is no nice way to force + * a little-endian store for aarch64_be-linux-user out of line. + * + * Attempt to keep code expansion to a minimum by limiting the + * amount of unrolling done. 
+ */ + if (nparts <= 4) { + int i; + + for (i = 0; i < len_align; i += 8) { + tcg_gen_ld_i64(t0, cpu_env, vofs + i); + tcg_gen_addi_i64(addr, cpu_reg_sp(s, rn), imm + i); + tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEQ); + } + } else { + TCGLabel *loop = gen_new_label(); + TCGv_ptr t2, i = tcg_const_local_ptr(0); + + gen_set_label(loop); + + t2 = tcg_temp_new_ptr(); + tcg_gen_add_ptr(t2, cpu_env, i); + tcg_gen_ld_i64(t0, t2, vofs); + + /* Minimize the number of local temps that must be re-read from + * the stack each iteration. Instead, re-compute values other + * than the loop counter. + */ + tcg_gen_addi_ptr(t2, i, imm); + tcg_gen_extu_ptr_i64(addr, t2); + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, rn)); + tcg_temp_free_ptr(t2); + + tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEQ); + + tcg_gen_addi_ptr(i, i, 8); + + tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop); + tcg_temp_free_ptr(i); + } + + /* Predicate register stores can be any multiple of 2. */ + if (len_remain) { + tcg_gen_ld_i64(t0, cpu_env, vofs + len_align); + tcg_gen_addi_i64(addr, cpu_reg_sp(s, rn), imm + len_align); + + switch (len_remain) { + case 2: + case 4: + case 8: + tcg_gen_qemu_st_i64(t0, addr, midx, MO_LE | ctz32(len_remain)); + break; + + case 6: + tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEUL); + tcg_gen_addi_i64(addr, addr, 4); + tcg_gen_shri_i64(t0, t0, 32); + tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEUW); + break; + + default: + g_assert_not_reached(); + } + } + tcg_temp_free_i64(addr); + tcg_temp_free_i64(t0); +} + static bool trans_LDR_zri(DisasContext *s, arg_rri *a, uint32_t insn) { if (sve_access_check(s)) { @@ -3526,3 +4550,665 @@ static bool trans_LDR_pri(DisasContext *s, arg_rri *a, uint32_t insn) } return true; } + +static bool trans_STR_zri(DisasContext *s, arg_rri *a, uint32_t insn) +{ + if (sve_access_check(s)) { + int size = vec_full_reg_size(s); + int off = vec_full_reg_offset(s, a->rd); + do_str(s, off, size, a->rn, a->imm * size); + } + return true; +} + +static bool trans_STR_pri(DisasContext *s, arg_rri *a, uint32_t insn) +{ + if (sve_access_check(s)) { + int size = pred_full_reg_size(s); + int off = pred_full_reg_offset(s, a->rd); + do_str(s, off, size, a->rn, a->imm * size); + } + return true; +} + +/* + *** SVE Memory - Contiguous Load Group + */ + +/* The memory mode of the dtype. */ +static const TCGMemOp dtype_mop[16] = { + MO_UB, MO_UB, MO_UB, MO_UB, + MO_SL, MO_UW, MO_UW, MO_UW, + MO_SW, MO_SW, MO_UL, MO_UL, + MO_SB, MO_SB, MO_SB, MO_Q +}; + +#define dtype_msz(x) (dtype_mop[x] & MO_SIZE) + +/* The vector element size of dtype. */ +static const uint8_t dtype_esz[16] = { + 0, 1, 2, 3, + 3, 1, 2, 3, + 3, 2, 2, 3, + 3, 2, 1, 3 +}; + +static void do_mem_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, + gen_helper_gvec_mem *fn) +{ + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr t_pg; + TCGv_i32 desc; + + /* For e.g. LD4, there are not enough arguments to pass all 4 + * registers as pointers, so encode the regno into the data field. + * For consistency, do this even for LD1. 
+ */ + desc = tcg_const_i32(simd_desc(vsz, vsz, zt)); + t_pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); + fn(cpu_env, t_pg, addr, desc); + + tcg_temp_free_ptr(t_pg); + tcg_temp_free_i32(desc); +} + +static void do_ld_zpa(DisasContext *s, int zt, int pg, + TCGv_i64 addr, int dtype, int nreg) +{ + static gen_helper_gvec_mem * const fns[16][4] = { + { gen_helper_sve_ld1bb_r, gen_helper_sve_ld2bb_r, + gen_helper_sve_ld3bb_r, gen_helper_sve_ld4bb_r }, + { gen_helper_sve_ld1bhu_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bsu_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bdu_r, NULL, NULL, NULL }, + + { gen_helper_sve_ld1sds_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1hh_r, gen_helper_sve_ld2hh_r, + gen_helper_sve_ld3hh_r, gen_helper_sve_ld4hh_r }, + { gen_helper_sve_ld1hsu_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1hdu_r, NULL, NULL, NULL }, + + { gen_helper_sve_ld1hds_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1hss_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1ss_r, gen_helper_sve_ld2ss_r, + gen_helper_sve_ld3ss_r, gen_helper_sve_ld4ss_r }, + { gen_helper_sve_ld1sdu_r, NULL, NULL, NULL }, + + { gen_helper_sve_ld1bds_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bss_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bhs_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1dd_r, gen_helper_sve_ld2dd_r, + gen_helper_sve_ld3dd_r, gen_helper_sve_ld4dd_r }, + }; + gen_helper_gvec_mem *fn = fns[dtype][nreg]; + + /* While there are holes in the table, they are not + * accessible via the instruction encoding. + */ + assert(fn != NULL); + do_mem_zpa(s, zt, pg, addr, fn); +} + +static bool trans_LD_zprr(DisasContext *s, arg_rprr_load *a, uint32_t insn) +{ + if (a->rm == 31) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_muli_i64(addr, cpu_reg(s, a->rm), + (a->nreg + 1) << dtype_msz(a->dtype)); + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); + do_ld_zpa(s, a->rd, a->pg, addr, a->dtype, a->nreg); + } + return true; +} + +static bool trans_LD_zpri(DisasContext *s, arg_rpri_load *a, uint32_t insn) +{ + if (sve_access_check(s)) { + int vsz = vec_full_reg_size(s); + int elements = vsz >> dtype_esz[a->dtype]; + TCGv_i64 addr = new_tmp_a64(s); + + tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), + (a->imm * elements * (a->nreg + 1)) + << dtype_msz(a->dtype)); + do_ld_zpa(s, a->rd, a->pg, addr, a->dtype, a->nreg); + } + return true; +} + +static bool trans_LDFF1_zprr(DisasContext *s, arg_rprr_load *a, uint32_t insn) +{ + static gen_helper_gvec_mem * const fns[16] = { + gen_helper_sve_ldff1bb_r, + gen_helper_sve_ldff1bhu_r, + gen_helper_sve_ldff1bsu_r, + gen_helper_sve_ldff1bdu_r, + + gen_helper_sve_ldff1sds_r, + gen_helper_sve_ldff1hh_r, + gen_helper_sve_ldff1hsu_r, + gen_helper_sve_ldff1hdu_r, + + gen_helper_sve_ldff1hds_r, + gen_helper_sve_ldff1hss_r, + gen_helper_sve_ldff1ss_r, + gen_helper_sve_ldff1sdu_r, + + gen_helper_sve_ldff1bds_r, + gen_helper_sve_ldff1bss_r, + gen_helper_sve_ldff1bhs_r, + gen_helper_sve_ldff1dd_r, + }; + + if (sve_access_check(s)) { + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), dtype_msz(a->dtype)); + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); + do_mem_zpa(s, a->rd, a->pg, addr, fns[a->dtype]); + } + return true; +} + +static bool trans_LDNF1_zpri(DisasContext *s, arg_rpri_load *a, uint32_t insn) +{ + static gen_helper_gvec_mem * const fns[16] = { + gen_helper_sve_ldnf1bb_r, + gen_helper_sve_ldnf1bhu_r, + gen_helper_sve_ldnf1bsu_r, + gen_helper_sve_ldnf1bdu_r, + + 
gen_helper_sve_ldnf1sds_r, + gen_helper_sve_ldnf1hh_r, + gen_helper_sve_ldnf1hsu_r, + gen_helper_sve_ldnf1hdu_r, + + gen_helper_sve_ldnf1hds_r, + gen_helper_sve_ldnf1hss_r, + gen_helper_sve_ldnf1ss_r, + gen_helper_sve_ldnf1sdu_r, + + gen_helper_sve_ldnf1bds_r, + gen_helper_sve_ldnf1bss_r, + gen_helper_sve_ldnf1bhs_r, + gen_helper_sve_ldnf1dd_r, + }; + + if (sve_access_check(s)) { + int vsz = vec_full_reg_size(s); + int elements = vsz >> dtype_esz[a->dtype]; + int off = (a->imm * elements) << dtype_msz(a->dtype); + TCGv_i64 addr = new_tmp_a64(s); + + tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), off); + do_mem_zpa(s, a->rd, a->pg, addr, fns[a->dtype]); + } + return true; +} + +static void do_ldrq(DisasContext *s, int zt, int pg, TCGv_i64 addr, int msz) +{ + static gen_helper_gvec_mem * const fns[4] = { + gen_helper_sve_ld1bb_r, gen_helper_sve_ld1hh_r, + gen_helper_sve_ld1ss_r, gen_helper_sve_ld1dd_r, + }; + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr t_pg; + TCGv_i32 desc; + + /* Load the first quadword using the normal predicated load helpers. */ + desc = tcg_const_i32(simd_desc(16, 16, zt)); + t_pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); + fns[msz](cpu_env, t_pg, addr, desc); + + tcg_temp_free_ptr(t_pg); + tcg_temp_free_i32(desc); + + /* Replicate that first quadword. */ + if (vsz > 16) { + unsigned dofs = vec_full_reg_offset(s, zt); + tcg_gen_gvec_dup_mem(4, dofs + 16, dofs, vsz - 16, vsz - 16); + } +} + +static bool trans_LD1RQ_zprr(DisasContext *s, arg_rprr_load *a, uint32_t insn) +{ + if (a->rm == 31) { + return false; + } + if (sve_access_check(s)) { + int msz = dtype_msz(a->dtype); + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), msz); + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); + do_ldrq(s, a->rd, a->pg, addr, msz); + } + return true; +} + +static bool trans_LD1RQ_zpri(DisasContext *s, arg_rpri_load *a, uint32_t insn) +{ + if (sve_access_check(s)) { + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), a->imm * 16); + do_ldrq(s, a->rd, a->pg, addr, dtype_msz(a->dtype)); + } + return true; +} + +/* Load and broadcast element. */ +static bool trans_LD1R_zpri(DisasContext *s, arg_rpri_load *a, uint32_t insn) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = vec_full_reg_size(s); + unsigned psz = pred_full_reg_size(s); + unsigned esz = dtype_esz[a->dtype]; + TCGLabel *over = gen_new_label(); + TCGv_i64 temp; + + /* If the guarding predicate has no bits set, no load occurs. */ + if (psz <= 8) { + /* Reduce the pred_esz_masks value simply to reduce the + * size of the code generated here. + */ + uint64_t psz_mask = MAKE_64BIT_MASK(0, psz * 8); + temp = tcg_temp_new_i64(); + tcg_gen_ld_i64(temp, cpu_env, pred_full_reg_offset(s, a->pg)); + tcg_gen_andi_i64(temp, temp, pred_esz_masks[esz] & psz_mask); + tcg_gen_brcondi_i64(TCG_COND_EQ, temp, 0, over); + tcg_temp_free_i64(temp); + } else { + TCGv_i32 t32 = tcg_temp_new_i32(); + find_last_active(s, t32, esz, a->pg); + tcg_gen_brcondi_i32(TCG_COND_LT, t32, 0, over); + tcg_temp_free_i32(t32); + } + + /* Load the data. */ + temp = tcg_temp_new_i64(); + tcg_gen_addi_i64(temp, cpu_reg_sp(s, a->rn), a->imm << esz); + tcg_gen_qemu_ld_i64(temp, temp, get_mem_index(s), + s->be_data | dtype_mop[a->dtype]); + + /* Broadcast to *all* elements. */ + tcg_gen_gvec_dup_i64(esz, vec_full_reg_offset(s, a->rd), + vsz, vsz, temp); + tcg_temp_free_i64(temp); + + /* Zero the inactive elements. 
*/ + gen_set_label(over); + do_movz_zpz(s, a->rd, a->rd, a->pg, esz); + return true; +} + +static void do_st_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, + int msz, int esz, int nreg) +{ + static gen_helper_gvec_mem * const fn_single[4][4] = { + { gen_helper_sve_st1bb_r, gen_helper_sve_st1bh_r, + gen_helper_sve_st1bs_r, gen_helper_sve_st1bd_r }, + { NULL, gen_helper_sve_st1hh_r, + gen_helper_sve_st1hs_r, gen_helper_sve_st1hd_r }, + { NULL, NULL, + gen_helper_sve_st1ss_r, gen_helper_sve_st1sd_r }, + { NULL, NULL, NULL, gen_helper_sve_st1dd_r }, + }; + static gen_helper_gvec_mem * const fn_multiple[3][4] = { + { gen_helper_sve_st2bb_r, gen_helper_sve_st2hh_r, + gen_helper_sve_st2ss_r, gen_helper_sve_st2dd_r }, + { gen_helper_sve_st3bb_r, gen_helper_sve_st3hh_r, + gen_helper_sve_st3ss_r, gen_helper_sve_st3dd_r }, + { gen_helper_sve_st4bb_r, gen_helper_sve_st4hh_r, + gen_helper_sve_st4ss_r, gen_helper_sve_st4dd_r }, + }; + gen_helper_gvec_mem *fn; + + if (nreg == 0) { + /* ST1 */ + fn = fn_single[msz][esz]; + } else { + /* ST2, ST3, ST4 -- msz == esz, enforced by encoding */ + assert(msz == esz); + fn = fn_multiple[nreg - 1][msz]; + } + assert(fn != NULL); + do_mem_zpa(s, zt, pg, addr, fn); +} + +static bool trans_ST_zprr(DisasContext *s, arg_rprr_store *a, uint32_t insn) +{ + if (a->rm == 31 || a->msz > a->esz) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_muli_i64(addr, cpu_reg(s, a->rm), (a->nreg + 1) << a->msz); + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); + do_st_zpa(s, a->rd, a->pg, addr, a->msz, a->esz, a->nreg); + } + return true; +} + +static bool trans_ST_zpri(DisasContext *s, arg_rpri_store *a, uint32_t insn) +{ + if (a->msz > a->esz) { + return false; + } + if (sve_access_check(s)) { + int vsz = vec_full_reg_size(s); + int elements = vsz >> a->esz; + TCGv_i64 addr = new_tmp_a64(s); + + tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), + (a->imm * elements * (a->nreg + 1)) << a->msz); + do_st_zpa(s, a->rd, a->pg, addr, a->msz, a->esz, a->nreg); + } + return true; +} + +/* + *** SVE gather loads / scatter stores + */ + +static void do_mem_zpz(DisasContext *s, int zt, int pg, int zm, int scale, + TCGv_i64 scalar, gen_helper_gvec_mem_scatter *fn) +{ + unsigned vsz = vec_full_reg_size(s); + TCGv_i32 desc = tcg_const_i32(simd_desc(vsz, vsz, scale)); + TCGv_ptr t_zm = tcg_temp_new_ptr(); + TCGv_ptr t_pg = tcg_temp_new_ptr(); + TCGv_ptr t_zt = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); + tcg_gen_addi_ptr(t_zm, cpu_env, vec_full_reg_offset(s, zm)); + tcg_gen_addi_ptr(t_zt, cpu_env, vec_full_reg_offset(s, zt)); + fn(cpu_env, t_zt, t_pg, t_zm, scalar, desc); + + tcg_temp_free_ptr(t_zt); + tcg_temp_free_ptr(t_zm); + tcg_temp_free_ptr(t_pg); + tcg_temp_free_i32(desc); +} + +/* Indexed by [ff][xs][u][msz]. 
*/ +static gen_helper_gvec_mem_scatter * const gather_load_fn32[2][2][2][3] = { + { { { gen_helper_sve_ldbss_zsu, + gen_helper_sve_ldhss_zsu, + NULL, }, + { gen_helper_sve_ldbsu_zsu, + gen_helper_sve_ldhsu_zsu, + gen_helper_sve_ldssu_zsu, } }, + { { gen_helper_sve_ldbss_zss, + gen_helper_sve_ldhss_zss, + NULL, }, + { gen_helper_sve_ldbsu_zss, + gen_helper_sve_ldhsu_zss, + gen_helper_sve_ldssu_zss, } } }, + + { { { gen_helper_sve_ldffbss_zsu, + gen_helper_sve_ldffhss_zsu, + NULL, }, + { gen_helper_sve_ldffbsu_zsu, + gen_helper_sve_ldffhsu_zsu, + gen_helper_sve_ldffssu_zsu, } }, + { { gen_helper_sve_ldffbss_zss, + gen_helper_sve_ldffhss_zss, + NULL, }, + { gen_helper_sve_ldffbsu_zss, + gen_helper_sve_ldffhsu_zss, + gen_helper_sve_ldffssu_zss, } } } +}; + +/* Note that we overload xs=2 to indicate 64-bit offset. */ +static gen_helper_gvec_mem_scatter * const gather_load_fn64[2][3][2][4] = { + { { { gen_helper_sve_ldbds_zsu, + gen_helper_sve_ldhds_zsu, + gen_helper_sve_ldsds_zsu, + NULL, }, + { gen_helper_sve_ldbdu_zsu, + gen_helper_sve_ldhdu_zsu, + gen_helper_sve_ldsdu_zsu, + gen_helper_sve_ldddu_zsu, } }, + { { gen_helper_sve_ldbds_zss, + gen_helper_sve_ldhds_zss, + gen_helper_sve_ldsds_zss, + NULL, }, + { gen_helper_sve_ldbdu_zss, + gen_helper_sve_ldhdu_zss, + gen_helper_sve_ldsdu_zss, + gen_helper_sve_ldddu_zss, } }, + { { gen_helper_sve_ldbds_zd, + gen_helper_sve_ldhds_zd, + gen_helper_sve_ldsds_zd, + NULL, }, + { gen_helper_sve_ldbdu_zd, + gen_helper_sve_ldhdu_zd, + gen_helper_sve_ldsdu_zd, + gen_helper_sve_ldddu_zd, } } }, + + { { { gen_helper_sve_ldffbds_zsu, + gen_helper_sve_ldffhds_zsu, + gen_helper_sve_ldffsds_zsu, + NULL, }, + { gen_helper_sve_ldffbdu_zsu, + gen_helper_sve_ldffhdu_zsu, + gen_helper_sve_ldffsdu_zsu, + gen_helper_sve_ldffddu_zsu, } }, + { { gen_helper_sve_ldffbds_zss, + gen_helper_sve_ldffhds_zss, + gen_helper_sve_ldffsds_zss, + NULL, }, + { gen_helper_sve_ldffbdu_zss, + gen_helper_sve_ldffhdu_zss, + gen_helper_sve_ldffsdu_zss, + gen_helper_sve_ldffddu_zss, } }, + { { gen_helper_sve_ldffbds_zd, + gen_helper_sve_ldffhds_zd, + gen_helper_sve_ldffsds_zd, + NULL, }, + { gen_helper_sve_ldffbdu_zd, + gen_helper_sve_ldffhdu_zd, + gen_helper_sve_ldffsdu_zd, + gen_helper_sve_ldffddu_zd, } } } +}; + +static bool trans_LD1_zprz(DisasContext *s, arg_LD1_zprz *a, uint32_t insn) +{ + gen_helper_gvec_mem_scatter *fn = NULL; + + if (!sve_access_check(s)) { + return true; + } + + switch (a->esz) { + case MO_32: + fn = gather_load_fn32[a->ff][a->xs][a->u][a->msz]; + break; + case MO_64: + fn = gather_load_fn64[a->ff][a->xs][a->u][a->msz]; + break; + } + assert(fn != NULL); + + do_mem_zpz(s, a->rd, a->pg, a->rm, a->scale * a->msz, + cpu_reg_sp(s, a->rn), fn); + return true; +} + +static bool trans_LD1_zpiz(DisasContext *s, arg_LD1_zpiz *a, uint32_t insn) +{ + gen_helper_gvec_mem_scatter *fn = NULL; + TCGv_i64 imm; + + if (a->esz < a->msz || (a->esz == a->msz && !a->u)) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + switch (a->esz) { + case MO_32: + fn = gather_load_fn32[a->ff][0][a->u][a->msz]; + break; + case MO_64: + fn = gather_load_fn64[a->ff][2][a->u][a->msz]; + break; + } + assert(fn != NULL); + + /* Treat LD1_zpiz (zn[x] + imm) the same way as LD1_zprz (rn + zm[x]) + * by loading the immediate into the scalar parameter. + */ + imm = tcg_const_i64(a->imm << a->msz); + do_mem_zpz(s, a->rd, a->pg, a->rn, 0, imm, fn); + tcg_temp_free_i64(imm); + return true; +} + +/* Indexed by [xs][msz]. 
*/ +static gen_helper_gvec_mem_scatter * const scatter_store_fn32[2][3] = { + { gen_helper_sve_stbs_zsu, + gen_helper_sve_sths_zsu, + gen_helper_sve_stss_zsu, }, + { gen_helper_sve_stbs_zss, + gen_helper_sve_sths_zss, + gen_helper_sve_stss_zss, }, +}; + +/* Note that we overload xs=2 to indicate 64-bit offset. */ +static gen_helper_gvec_mem_scatter * const scatter_store_fn64[3][4] = { + { gen_helper_sve_stbd_zsu, + gen_helper_sve_sthd_zsu, + gen_helper_sve_stsd_zsu, + gen_helper_sve_stdd_zsu, }, + { gen_helper_sve_stbd_zss, + gen_helper_sve_sthd_zss, + gen_helper_sve_stsd_zss, + gen_helper_sve_stdd_zss, }, + { gen_helper_sve_stbd_zd, + gen_helper_sve_sthd_zd, + gen_helper_sve_stsd_zd, + gen_helper_sve_stdd_zd, }, +}; + +static bool trans_ST1_zprz(DisasContext *s, arg_ST1_zprz *a, uint32_t insn) +{ + gen_helper_gvec_mem_scatter *fn; + + if (a->esz < a->msz || (a->msz == 0 && a->scale)) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + switch (a->esz) { + case MO_32: + fn = scatter_store_fn32[a->xs][a->msz]; + break; + case MO_64: + fn = scatter_store_fn64[a->xs][a->msz]; + break; + default: + g_assert_not_reached(); + } + do_mem_zpz(s, a->rd, a->pg, a->rm, a->scale * a->msz, + cpu_reg_sp(s, a->rn), fn); + return true; +} + +static bool trans_ST1_zpiz(DisasContext *s, arg_ST1_zpiz *a, uint32_t insn) +{ + gen_helper_gvec_mem_scatter *fn = NULL; + TCGv_i64 imm; + + if (a->esz < a->msz) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + switch (a->esz) { + case MO_32: + fn = scatter_store_fn32[0][a->msz]; + break; + case MO_64: + fn = scatter_store_fn64[2][a->msz]; + break; + } + assert(fn != NULL); + + /* Treat ST1_zpiz (zn[x] + imm) the same way as ST1_zprz (rn + zm[x]) + * by loading the immediate into the scalar parameter. + */ + imm = tcg_const_i64(a->imm << a->msz); + do_mem_zpz(s, a->rd, a->pg, a->rn, 0, imm, fn); + tcg_temp_free_i64(imm); + return true; +} + +/* + * Prefetches + */ + +static bool trans_PRF(DisasContext *s, arg_PRF *a, uint32_t insn) +{ + /* Prefetch is a nop within QEMU. */ + sve_access_check(s); + return true; +} + +static bool trans_PRF_rr(DisasContext *s, arg_PRF_rr *a, uint32_t insn) +{ + if (a->rm == 31) { + return false; + } + /* Prefetch is a nop within QEMU. */ + sve_access_check(s); + return true; +} + +/* + * Move Prefix + * + * TODO: The implementation so far could handle predicated merging movprfx. + * The helper functions as written take an extra source register to + * use in the operation, but the result is only written when predication + * succeeds. For unpredicated movprfx, we need to rearrange the helpers + * to allow the final write back to the destination to be unconditional. + * For predicated zeroing movprfx, we need to rearrange the helpers to + * allow the final write back to zero inactives. + * + * In the meantime, just emit the moves. 
+ */ + +static bool trans_MOVPRFX(DisasContext *s, arg_MOVPRFX *a, uint32_t insn) +{ + return do_mov_z(s, a->rd, a->rn); +} + +static bool trans_MOVPRFX_m(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + if (sve_access_check(s)) { + do_sel_z(s, a->rd, a->rn, a->rd, a->pg, a->esz); + } + return true; +} + +static bool trans_MOVPRFX_z(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + if (sve_access_check(s)) { + do_movz_zpz(s, a->rd, a->rn, a->pg, a->esz); + } + return true; +} diff --git a/target/arm/translate.c b/target/arm/translate.c index 2a3e4f5d4c..f845da7c63 100644 --- a/target/arm/translate.c +++ b/target/arm/translate.c @@ -7762,9 +7762,10 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) */ static int disas_neon_insn_3same_ext(DisasContext *s, uint32_t insn) { - gen_helper_gvec_3_ptr *fn_gvec_ptr; - int rd, rn, rm, rot, size, opr_sz; - TCGv_ptr fpst; + gen_helper_gvec_3 *fn_gvec = NULL; + gen_helper_gvec_3_ptr *fn_gvec_ptr = NULL; + int rd, rn, rm, opr_sz; + int data = 0; bool q; q = extract32(insn, 6, 1); @@ -7777,8 +7778,8 @@ static int disas_neon_insn_3same_ext(DisasContext *s, uint32_t insn) if ((insn & 0xfe200f10) == 0xfc200800) { /* VCMLA -- 1111 110R R.1S .... .... 1000 ...0 .... */ - size = extract32(insn, 20, 1); - rot = extract32(insn, 23, 2); + int size = extract32(insn, 20, 1); + data = extract32(insn, 23, 2); /* rot */ if (!arm_dc_feature(s, ARM_FEATURE_V8_FCMA) || (!size && !arm_dc_feature(s, ARM_FEATURE_V8_FP16))) { return 1; @@ -7786,13 +7787,20 @@ static int disas_neon_insn_3same_ext(DisasContext *s, uint32_t insn) fn_gvec_ptr = size ? gen_helper_gvec_fcmlas : gen_helper_gvec_fcmlah; } else if ((insn & 0xfea00f10) == 0xfc800800) { /* VCADD -- 1111 110R 1.0S .... .... 1000 ...0 .... */ - size = extract32(insn, 20, 1); - rot = extract32(insn, 24, 1); + int size = extract32(insn, 20, 1); + data = extract32(insn, 24, 1); /* rot */ if (!arm_dc_feature(s, ARM_FEATURE_V8_FCMA) || (!size && !arm_dc_feature(s, ARM_FEATURE_V8_FP16))) { return 1; } fn_gvec_ptr = size ? gen_helper_gvec_fcadds : gen_helper_gvec_fcaddh; + } else if ((insn & 0xfeb00f00) == 0xfc200d00) { + /* V[US]DOT -- 1111 1100 0.10 .... .... 1101 .Q.U .... */ + bool u = extract32(insn, 4, 1); + if (!arm_dc_feature(s, ARM_FEATURE_V8_DOTPROD)) { + return 1; + } + fn_gvec = u ? 
gen_helper_gvec_udot_b : gen_helper_gvec_sdot_b; } else { return 1; } @@ -7807,12 +7815,19 @@ static int disas_neon_insn_3same_ext(DisasContext *s, uint32_t insn) } opr_sz = (1 + q) * 8; - fpst = get_fpstatus_ptr(1); - tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd), - vfp_reg_offset(1, rn), - vfp_reg_offset(1, rm), fpst, - opr_sz, opr_sz, rot, fn_gvec_ptr); - tcg_temp_free_ptr(fpst); + if (fn_gvec_ptr) { + TCGv_ptr fpst = get_fpstatus_ptr(1); + tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd), + vfp_reg_offset(1, rn), + vfp_reg_offset(1, rm), fpst, + opr_sz, opr_sz, data, fn_gvec_ptr); + tcg_temp_free_ptr(fpst); + } else { + tcg_gen_gvec_3_ool(vfp_reg_offset(1, rd), + vfp_reg_offset(1, rn), + vfp_reg_offset(1, rm), + opr_sz, opr_sz, data, fn_gvec); + } return 0; } @@ -7826,26 +7841,52 @@ static int disas_neon_insn_3same_ext(DisasContext *s, uint32_t insn) static int disas_neon_insn_2reg_scalar_ext(DisasContext *s, uint32_t insn) { - int rd, rn, rm, rot, size, opr_sz; - TCGv_ptr fpst; + gen_helper_gvec_3 *fn_gvec = NULL; + gen_helper_gvec_3_ptr *fn_gvec_ptr = NULL; + int rd, rn, rm, opr_sz, data; bool q; q = extract32(insn, 6, 1); VFP_DREG_D(rd, insn); VFP_DREG_N(rn, insn); - VFP_DREG_M(rm, insn); if ((rd | rn) & q) { return 1; } if ((insn & 0xff000f10) == 0xfe000800) { /* VCMLA (indexed) -- 1111 1110 S.RR .... .... 1000 ...0 .... */ - rot = extract32(insn, 20, 2); - size = extract32(insn, 23, 1); - if (!arm_dc_feature(s, ARM_FEATURE_V8_FCMA) - || (!size && !arm_dc_feature(s, ARM_FEATURE_V8_FP16))) { + int rot = extract32(insn, 20, 2); + int size = extract32(insn, 23, 1); + int index; + + if (!arm_dc_feature(s, ARM_FEATURE_V8_FCMA)) { + return 1; + } + if (size == 0) { + if (!arm_dc_feature(s, ARM_FEATURE_V8_FP16)) { + return 1; + } + /* For fp16, rm is just Vm, and index is M. */ + rm = extract32(insn, 0, 4); + index = extract32(insn, 5, 1); + } else { + /* For fp32, rm is the usual M:Vm, and index is 0. */ + VFP_DREG_M(rm, insn); + index = 0; + } + data = (index << 2) | rot; + fn_gvec_ptr = (size ? gen_helper_gvec_fcmlas_idx + : gen_helper_gvec_fcmlah_idx); + } else if ((insn & 0xffb00f00) == 0xfe200d00) { + /* V[US]DOT -- 1111 1110 0.10 .... .... 1101 .Q.U .... */ + int u = extract32(insn, 4, 1); + if (!arm_dc_feature(s, ARM_FEATURE_V8_DOTPROD)) { return 1; } + fn_gvec = u ? gen_helper_gvec_udot_idx_b : gen_helper_gvec_sdot_idx_b; + /* rm is just Vm, and index is M. */ + data = extract32(insn, 5, 1); /* index */ + rm = extract32(insn, 0, 4); } else { return 1; } @@ -7860,14 +7901,19 @@ static int disas_neon_insn_2reg_scalar_ext(DisasContext *s, uint32_t insn) } opr_sz = (1 + q) * 8; - fpst = get_fpstatus_ptr(1); - tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd), - vfp_reg_offset(1, rn), - vfp_reg_offset(1, rm), fpst, - opr_sz, opr_sz, rot, - size ? 
gen_helper_gvec_fcmlas_idx - : gen_helper_gvec_fcmlah_idx); - tcg_temp_free_ptr(fpst); + if (fn_gvec_ptr) { + TCGv_ptr fpst = get_fpstatus_ptr(1); + tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd), + vfp_reg_offset(1, rn), + vfp_reg_offset(1, rm), fpst, + opr_sz, opr_sz, data, fn_gvec_ptr); + tcg_temp_free_ptr(fpst); + } else { + tcg_gen_gvec_3_ool(vfp_reg_offset(1, rd), + vfp_reg_offset(1, rn), + vfp_reg_offset(1, rm), + opr_sz, opr_sz, data, fn_gvec); + } return 0; } diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c index f504dd53c8..37f338732e 100644 --- a/target/arm/vec_helper.c +++ b/target/arm/vec_helper.c @@ -194,6 +194,197 @@ void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm, clear_tail(d, opr_sz, simd_maxsz(desc)); } +/* Integer 8 and 16-bit dot-product. + * + * Note that for the loops herein, host endianness does not matter + * with respect to the ordering of data within the 64-bit lanes. + * All elements are treated equally, no matter where they are. + */ + +void HELPER(gvec_sdot_b)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + uint32_t *d = vd; + int8_t *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] += n[i * 4 + 0] * m[i * 4 + 0] + + n[i * 4 + 1] * m[i * 4 + 1] + + n[i * 4 + 2] * m[i * 4 + 2] + + n[i * 4 + 3] * m[i * 4 + 3]; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_udot_b)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + uint32_t *d = vd; + uint8_t *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] += n[i * 4 + 0] * m[i * 4 + 0] + + n[i * 4 + 1] * m[i * 4 + 1] + + n[i * 4 + 2] * m[i * 4 + 2] + + n[i * 4 + 3] * m[i * 4 + 3]; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_sdot_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + uint64_t *d = vd; + int16_t *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 8; ++i) { + d[i] += (int64_t)n[i * 4 + 0] * m[i * 4 + 0] + + (int64_t)n[i * 4 + 1] * m[i * 4 + 1] + + (int64_t)n[i * 4 + 2] * m[i * 4 + 2] + + (int64_t)n[i * 4 + 3] * m[i * 4 + 3]; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_udot_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc); + uint64_t *d = vd; + uint16_t *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 8; ++i) { + d[i] += (uint64_t)n[i * 4 + 0] * m[i * 4 + 0] + + (uint64_t)n[i * 4 + 1] * m[i * 4 + 1] + + (uint64_t)n[i * 4 + 2] * m[i * 4 + 2] + + (uint64_t)n[i * 4 + 3] * m[i * 4 + 3]; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_sdot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4; + intptr_t index = simd_data(desc); + uint32_t *d = vd; + int8_t *n = vn; + int8_t *m_indexed = (int8_t *)vm + index * 4; + + /* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd. + * Otherwise opr_sz is a multiple of 16. 
+ */ + segend = MIN(4, opr_sz_4); + i = 0; + do { + int8_t m0 = m_indexed[i * 4 + 0]; + int8_t m1 = m_indexed[i * 4 + 1]; + int8_t m2 = m_indexed[i * 4 + 2]; + int8_t m3 = m_indexed[i * 4 + 3]; + + do { + d[i] += n[i * 4 + 0] * m0 + + n[i * 4 + 1] * m1 + + n[i * 4 + 2] * m2 + + n[i * 4 + 3] * m3; + } while (++i < segend); + segend = i + 4; + } while (i < opr_sz_4); + + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_udot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4; + intptr_t index = simd_data(desc); + uint32_t *d = vd; + uint8_t *n = vn; + uint8_t *m_indexed = (uint8_t *)vm + index * 4; + + /* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd. + * Otherwise opr_sz is a multiple of 16. + */ + segend = MIN(4, opr_sz_4); + i = 0; + do { + uint8_t m0 = m_indexed[i * 4 + 0]; + uint8_t m1 = m_indexed[i * 4 + 1]; + uint8_t m2 = m_indexed[i * 4 + 2]; + uint8_t m3 = m_indexed[i * 4 + 3]; + + do { + d[i] += n[i * 4 + 0] * m0 + + n[i * 4 + 1] * m1 + + n[i * 4 + 2] * m2 + + n[i * 4 + 3] * m3; + } while (++i < segend); + segend = i + 4; + } while (i < opr_sz_4); + + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_sdot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8; + intptr_t index = simd_data(desc); + uint64_t *d = vd; + int16_t *n = vn; + int16_t *m_indexed = (int16_t *)vm + index * 4; + + /* This is supported by SVE only, so opr_sz is always a multiple of 16. + * Process the entire segment all at once, writing back the results + * only after we've consumed all of the inputs. + */ + for (i = 0; i < opr_sz_8 ; i += 2) { + uint64_t d0, d1; + + d0 = n[i * 4 + 0] * (int64_t)m_indexed[i * 4 + 0]; + d0 += n[i * 4 + 1] * (int64_t)m_indexed[i * 4 + 1]; + d0 += n[i * 4 + 2] * (int64_t)m_indexed[i * 4 + 2]; + d0 += n[i * 4 + 3] * (int64_t)m_indexed[i * 4 + 3]; + d1 = n[i * 4 + 4] * (int64_t)m_indexed[i * 4 + 0]; + d1 += n[i * 4 + 5] * (int64_t)m_indexed[i * 4 + 1]; + d1 += n[i * 4 + 6] * (int64_t)m_indexed[i * 4 + 2]; + d1 += n[i * 4 + 7] * (int64_t)m_indexed[i * 4 + 3]; + + d[i + 0] += d0; + d[i + 1] += d1; + } + + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_udot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8; + intptr_t index = simd_data(desc); + uint64_t *d = vd; + uint16_t *n = vn; + uint16_t *m_indexed = (uint16_t *)vm + index * 4; + + /* This is supported by SVE only, so opr_sz is always a multiple of 16. + * Process the entire segment all at once, writing back the results + * only after we've consumed all of the inputs. 
+ */ + for (i = 0; i < opr_sz_8 ; i += 2) { + uint64_t d0, d1; + + d0 = n[i * 4 + 0] * (uint64_t)m_indexed[i * 4 + 0]; + d0 += n[i * 4 + 1] * (uint64_t)m_indexed[i * 4 + 1]; + d0 += n[i * 4 + 2] * (uint64_t)m_indexed[i * 4 + 2]; + d0 += n[i * 4 + 3] * (uint64_t)m_indexed[i * 4 + 3]; + d1 = n[i * 4 + 4] * (uint64_t)m_indexed[i * 4 + 0]; + d1 += n[i * 4 + 5] * (uint64_t)m_indexed[i * 4 + 1]; + d1 += n[i * 4 + 6] * (uint64_t)m_indexed[i * 4 + 2]; + d1 += n[i * 4 + 7] * (uint64_t)m_indexed[i * 4 + 3]; + + d[i + 0] += d0; + d[i + 1] += d1; + } + + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm, void *vfpst, uint32_t desc) { @@ -317,23 +508,29 @@ void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, float_status *fpst = vfpst; intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); uint32_t neg_real = flip ^ neg_imag; - uintptr_t i; - float16 e1 = m[H2(flip)]; - float16 e3 = m[H2(1 - flip)]; + intptr_t elements = opr_sz / sizeof(float16); + intptr_t eltspersegment = 16 / sizeof(float16); + intptr_t i, j; /* Shift boolean to the sign bit so we can xor to negate. */ neg_real <<= 15; neg_imag <<= 15; - e1 ^= neg_real; - e3 ^= neg_imag; - for (i = 0; i < opr_sz / 2; i += 2) { - float16 e2 = n[H2(i + flip)]; - float16 e4 = e2; + for (i = 0; i < elements; i += eltspersegment) { + float16 mr = m[H2(i + 2 * index + 0)]; + float16 mi = m[H2(i + 2 * index + 1)]; + float16 e1 = neg_real ^ (flip ? mi : mr); + float16 e3 = neg_imag ^ (flip ? mr : mi); - d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst); - d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst); + for (j = i; j < i + eltspersegment; j += 2) { + float16 e2 = n[H2(j + flip)]; + float16 e4 = e2; + + d[H2(j)] = float16_muladd(e2, e1, d[H2(j)], 0, fpst); + d[H2(j + 1)] = float16_muladd(e4, e3, d[H2(j + 1)], 0, fpst); + } } clear_tail(d, opr_sz, simd_maxsz(desc)); } @@ -377,23 +574,29 @@ void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, float_status *fpst = vfpst; intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); uint32_t neg_real = flip ^ neg_imag; - uintptr_t i; - float32 e1 = m[H4(flip)]; - float32 e3 = m[H4(1 - flip)]; + intptr_t elements = opr_sz / sizeof(float32); + intptr_t eltspersegment = 16 / sizeof(float32); + intptr_t i, j; /* Shift boolean to the sign bit so we can xor to negate. */ neg_real <<= 31; neg_imag <<= 31; - e1 ^= neg_real; - e3 ^= neg_imag; - for (i = 0; i < opr_sz / 4; i += 2) { - float32 e2 = n[H4(i + flip)]; - float32 e4 = e2; + for (i = 0; i < elements; i += eltspersegment) { + float32 mr = m[H4(i + 2 * index + 0)]; + float32 mi = m[H4(i + 2 * index + 1)]; + float32 e1 = neg_real ^ (flip ? mi : mr); + float32 e3 = neg_imag ^ (flip ? 
mr : mi); - d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst); - d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst); + for (j = i; j < i + eltspersegment; j += 2) { + float32 e2 = n[H4(j + flip)]; + float32 e4 = e2; + + d[H4(j)] = float32_muladd(e2, e1, d[H4(j)], 0, fpst); + d[H4(j + 1)] = float32_muladd(e4, e3, d[H4(j + 1)], 0, fpst); + } } clear_tail(d, opr_sz, simd_maxsz(desc)); } @@ -427,6 +630,26 @@ void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, clear_tail(d, opr_sz, simd_maxsz(desc)); } +#define DO_2OP(NAME, FUNC, TYPE) \ +void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ +{ \ + intptr_t i, oprsz = simd_oprsz(desc); \ + TYPE *d = vd, *n = vn; \ + for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ + d[i] = FUNC(n[i], stat); \ + } \ +} + +DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16) +DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32) +DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64) + +DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16) +DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32) +DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64) + +#undef DO_2OP + /* Floating-point trigonometric starting value. * See the ARM ARM pseudocode function FPTrigSMul. */ @@ -495,3 +718,51 @@ DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64) #endif #undef DO_3OP + +/* For the indexed ops, SVE applies the index per 128-bit vector segment. + * For AdvSIMD, there is of course only one such vector segment. + */ + +#define DO_MUL_IDX(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ +{ \ + intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \ + intptr_t idx = simd_data(desc); \ + TYPE *d = vd, *n = vn, *m = vm; \ + for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ + TYPE mm = m[H(i + idx)]; \ + for (j = 0; j < segment; j++) { \ + d[i + j] = TYPE##_mul(n[i + j], mm, stat); \ + } \ + } \ +} + +DO_MUL_IDX(gvec_fmul_idx_h, float16, H2) +DO_MUL_IDX(gvec_fmul_idx_s, float32, H4) +DO_MUL_IDX(gvec_fmul_idx_d, float64, ) + +#undef DO_MUL_IDX + +#define DO_FMLA_IDX(NAME, TYPE, H) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \ + void *stat, uint32_t desc) \ +{ \ + intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \ + TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \ + intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \ + TYPE *d = vd, *n = vn, *m = vm, *a = va; \ + op1_neg <<= (8 * sizeof(TYPE) - 1); \ + for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ + TYPE mm = m[H(i + idx)]; \ + for (j = 0; j < segment; j++) { \ + d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \ + mm, a[i + j], 0, stat); \ + } \ + } \ +} + +DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2) +DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4) +DO_FMLA_IDX(gvec_fmla_idx_d, float64, ) + +#undef DO_FMLA_IDX |
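A few recurring implementation tricks in this series are easy to miss inside the TCG plumbing. The short standalone sketches below restate them in plain C; buffer sizes, variable names and the main() harnesses are illustrative assumptions, not QEMU code. First, do_fmla and trans_FCMLA_zpzzz dodge the TCG argument limit by packing the register numbers (and rotation) into the simd descriptor's data field with deposit32; the matching helpers are then expected to unpack them with extract32, roughly like this:

    #include <stdint.h>
    #include <assert.h>

    /* Minimal local bitfield helpers with the same intent as QEMU's
     * deposit32/extract32, so the sketch is self-contained.
     */
    static uint32_t deposit32(uint32_t val, int start, int len, uint32_t field)
    {
        uint32_t mask = (~0u >> (32 - len)) << start;
        return (val & ~mask) | ((field << start) & mask);
    }
    static uint32_t extract32(uint32_t val, int start, int len)
    {
        return (val >> start) & (~0u >> (32 - len));
    }

    int main(void)
    {
        /* Pack rd/rn/rm/ra into consecutive 5-bit fields, as do_fmla does. */
        uint32_t rd = 1, rn = 2, rm = 3, ra = 4;
        uint32_t data = rd;
        data = deposit32(data, 5, 5, rn);
        data = deposit32(data, 10, 5, rm);
        data = deposit32(data, 15, 5, ra);

        /* A helper recovers the register numbers field by field. */
        assert(extract32(data, 0, 5) == rd);
        assert(extract32(data, 5, 5) == rn);
        assert(extract32(data, 10, 5) == rm);
        assert(extract32(data, 15, 5) == ra);
        return 0;
    }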
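do_str's comment points out that unpredicated register stores are architecturally a stream of bytes, i.e. little-endian regardless of host or guest order, and that a 6-byte predicate remainder has no single memop, so it is emitted as a 4-byte store plus a shifted 2-byte store. A sketch of that byte-level behaviour on an ordinary buffer (hypothetical helper, not TCG):

    #include <stdint.h>

    /* Write the low "len" bytes of val little-endian. For len 2/4/8 a single
     * store suffices; len 6 is split into 4 + 2, mirroring the MO_LEUL store,
     * the 32-bit shift, and the MO_LEUW store in do_str.
     */
    static void store_bytes_le(uint8_t *dst, uint64_t val, unsigned len)
    {
        unsigned first = (len == 6) ? 4 : len;
        for (unsigned i = 0; i < first; i++) {
            dst[i] = (uint8_t)(val >> (8 * i));
        }
        if (len == 6) {
            uint64_t hi = val >> 32;        /* tcg_gen_shri_i64(t0, t0, 32) */
            for (unsigned i = 0; i < 2; i++) {
                dst[4 + i] = (uint8_t)(hi >> (8 * i));
            }
        }
    }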
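trans_LD1R_zpri first tests whether the governing predicate has any active element at all: if not, no load is performed, and do_movz_zpz then zeroes the inactive elements either way. The same observable behaviour for 32-bit elements, with the predicate flattened to one flag per element purely for readability:

    #include <stdint.h>
    #include <stddef.h>

    /* LD1R sketch: if any element is active, load one word and broadcast it;
     * inactive elements end up zero. "pred" is one byte per element here.
     */
    static void ld1r_s(uint32_t *zd, const uint8_t *pred, size_t elems,
                       const uint32_t *mem)
    {
        int any_active = 0;
        for (size_t i = 0; i < elems; i++) {
            any_active |= pred[i];
        }
        uint32_t val = any_active ? *mem : 0;   /* no access if all inactive */
        for (size_t i = 0; i < elems; i++) {
            zd[i] = pred[i] ? val : 0;
        }
    }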
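Both gather/scatter addressing forms funnel into do_mem_zpz: the scalar-plus-vector form passes Xn as the scalar and scales the vector offsets, while the vector-plus-immediate form passes imm << msz as the scalar and uses the vector register unscaled. Every per-element address therefore reduces to one shape (names below are illustrative):

    #include <stdint.h>

    /* base + (offset << scale):
     *   LD1/ST1_zprz:  base = Xn,         offset = Zm[i], scale = msz or 0
     *   LD1/ST1_zpiz:  base = imm << msz, offset = Zn[i], scale = 0
     */
    static uint64_t gather_addr(uint64_t base, uint64_t offset, unsigned scale)
    {
        return base + (offset << scale);
    }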
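The new gvec_sdot/udot helpers define each 32-bit (or 64-bit) destination lane as an accumulator over four adjacent narrow products, with lane order irrelevant to host endianness. One signed 8-bit lane, and a whole hypothetical 16-byte vector, look like this:

    #include <stdint.h>
    #include <stddef.h>

    /* One 32-bit lane of SDOT: acc += sum of four signed 8x8->32 products,
     * matching the loop body of gvec_sdot_b for a single lane.
     */
    static uint32_t sdot_lane(uint32_t acc, const int8_t n[4], const int8_t m[4])
    {
        return acc + n[0] * m[0] + n[1] * m[1] + n[2] * m[2] + n[3] * m[3];
    }

    /* Apply it across a 16-byte vector: 4 lanes of 4 bytes each. */
    static void sdot_vec16(uint32_t d[4], const int8_t n[16], const int8_t m[16])
    {
        for (size_t i = 0; i < 4; i++) {
            d[i] = sdot_lane(d[i], &n[i * 4], &m[i * 4]);
        }
    }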
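For the indexed forms (FMUL_zzx, the dot-product index variants, DO_MUL_IDX), SVE selects the indexed element once per 128-bit segment rather than once for the whole vector, which is why these helpers loop in 16-byte strides. A sketch of that pattern with plain float in place of float32/float_status (an assumption purely for readability; elems is taken to be a multiple of 4):

    #include <stddef.h>

    /* d[i] = n[i] * m[segment_base + idx]: the indexed element is re-selected
     * at the start of every 16-byte (4 x float) segment, as SVE requires.
     */
    static void fmul_idx(float *d, const float *n, const float *m,
                         size_t elems, size_t idx)
    {
        const size_t segment = 4;          /* 16 bytes / sizeof(float) */
        for (size_t i = 0; i < elems; i += segment) {
            float mm = m[i + idx];         /* one scalar per segment */
            for (size_t j = 0; j < segment; j++) {
                d[i + j] = n[i + j] * mm;
            }
        }
    }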
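Finally, the gvec_fcmla helpers decode the rotation as flip = rot & 1 plus two sign bits, so each 0/90/180/270-degree step is just an element swap and sign flips ahead of the multiply-accumulate. The same decode on a single (re, im) pair, with ±1.0 factors standing in for the sign-bit xor used by the helpers:

    /* One rotated complex multiply-accumulate step, rot in {0,1,2,3}
     * selecting 0/90/180/270 degrees. d, n, m are (re, im) pairs.
     */
    static void fcmla_step(float d[2], const float n[2], const float m[2], int rot)
    {
        int flip = rot & 1;
        float neg_imag = (rot & 2) ? -1.0f : 1.0f;
        float neg_real = (flip ^ ((rot >> 1) & 1)) ? -1.0f : 1.0f;

        float e1 = neg_real * (flip ? m[1] : m[0]);
        float e2 = n[flip];
        float e3 = neg_imag * (flip ? m[0] : m[1]);
        float e4 = e2;

        d[0] += e2 * e1;
        d[1] += e4 * e3;
    }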