author     Peter Maydell <peter.maydell@linaro.org>  2018-06-30 11:55:32 +0100
committer  Peter Maydell <peter.maydell@linaro.org>  2018-06-30 11:55:32 +0100
commit     275845ae65fdfe1f84484fd1d2ca274ce80d7aaf (patch)
tree       c8fe8c6c211008a9335b656235ab9187ffa524b1
parent     ce59ecc411fcde327e35364411f9e9ebbf96cab1 (diff)
parent     802abf4024d23e48d45373ac3f2b580124b54b47 (diff)
download   qemu-275845ae65fdfe1f84484fd1d2ca274ce80d7aaf.zip
Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20180629' into staging
target-arm queue:
 * last of the SVE patches; SVE is now enabled for aarch64 linux-user
 * sd: Don't trace SDRequest crc field (coverity bugfix)
 * target/arm: Mark PMINTENSET accesses as possibly doing IO
 * clean up v7VE feature bit handling
 * i.mx7d: minor cleanups
 * target/arm: support reading of CNT[VCT|FRQ]_EL0 from user-space
 * target/arm: Implement ARMv8.2-DotProd
 * virt: add addresses to dt node names (which stops dtc from complaining
   that they're not correctly named)
 * cleanups: replace error_setg(&error_fatal) by error_report() + exit()

# gpg: Signature made Fri 29 Jun 2018 15:52:21 BST
# gpg:                using RSA key 3C2525ED14360CDE
# gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>"
# gpg:                 aka "Peter Maydell <pmaydell@gmail.com>"
# gpg:                 aka "Peter Maydell <pmaydell@chiark.greenend.org.uk>"
# Primary key fingerprint: E1A5 C593 CD41 9DE2 8E83 15CF 3C25 25ED 1436 0CDE

* remotes/pmaydell/tags/pull-target-arm-20180629: (55 commits)
  target/arm: Add ID_ISAR6
  target/arm: Prune a15 features from max
  target/arm: Prune a57 features from max
  target/arm: Fix SVE system register access checks
  target/arm: Fix SVE signed division vs x86 overflow exception
  sdcard: Use the ldst API
  sd: Don't trace SDRequest crc field
  target/arm: Mark PMINTENSET accesses as possibly doing IO
  target/arm: Remove redundant DIV detection for KVM
  target/arm: Add ARM_FEATURE_V7VE for v7 Virtualization Extensions
  i.mx7d: Change IRQ number type from hwaddr to int
  i.mx7d: Change SRC unimplemented device name from sdma to src
  i.mx7d: Remove unused header files
  target/arm: support reading of CNT[VCT|FRQ]_EL0 from user-space
  target/arm: Implement ARMv8.2-DotProd
  target/arm: Enable SVE for aarch64-linux-user
  target/arm: Implement SVE dot product (indexed)
  target/arm: Implement SVE dot product (vectors)
  target/arm: Implement SVE fp complex multiply add (indexed)
  target/arm: Pass index to AdvSIMD FCMLA (indexed)
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
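The error-handling cleanup in the queue is one mechanical pattern applied
throughout device_tree.c and hw/arm/sysbus-fdt.c; both forms appear in the
hunks below:

    /* before: abort via the error API */
    error_setg(&error_fatal, "%s cannot open %s", __func__, dirname);

    /* after: report and exit explicitly */
    error_report("%s cannot open %s", __func__, dirname);
    exit(1);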
-rw-r--r--  device_tree.c                  78
-rw-r--r--  hw/arm/boot.c                  41
-rw-r--r--  hw/arm/fsl-imx7.c               8
-rw-r--r--  hw/arm/mcimx7d-sabre.c          2
-rw-r--r--  hw/arm/sysbus-fdt.c            53
-rw-r--r--  hw/arm/virt.c                  70
-rw-r--r--  hw/block/fdc.c                  9
-rw-r--r--  hw/sd/bcm2835_sdhost.c         13
-rw-r--r--  hw/sd/core.c                    2
-rw-r--r--  hw/sd/milkymist-memcard.c       3
-rw-r--r--  hw/sd/omap_mmc.c                6
-rw-r--r--  hw/sd/pl181.c                  11
-rw-r--r--  hw/sd/sdhci.c                  15
-rw-r--r--  hw/sd/ssi-sd.c                  6
-rw-r--r--  hw/sd/trace-events              2
-rw-r--r--  include/sysemu/device_tree.h   16
-rw-r--r--  linux-user/elfload.c            2
-rw-r--r--  target/arm/cpu.c               36
-rw-r--r--  target/arm/cpu.h                3
-rw-r--r--  target/arm/cpu64.c             13
-rw-r--r--  target/arm/helper-sve.h       682
-rw-r--r--  target/arm/helper.c            44
-rw-r--r--  target/arm/helper.h            44
-rw-r--r--  target/arm/kvm32.c             27
-rw-r--r--  target/arm/sve.decode         427
-rw-r--r--  target/arm/sve_helper.c      1875
-rw-r--r--  target/arm/translate-a64.c     62
-rw-r--r--  target/arm/translate-sve.c   1688
-rw-r--r--  target/arm/translate.c        102
-rw-r--r--  target/arm/vec_helper.c       311
30 files changed, 5394 insertions, 257 deletions
diff --git a/device_tree.c b/device_tree.c
index 52c3358a55..6d9c9726f6 100644
--- a/device_tree.c
+++ b/device_tree.c
@@ -140,15 +140,16 @@ static void read_fstree(void *fdt, const char *dirname)
const char *parent_node;
if (strstr(dirname, root_dir) != dirname) {
- error_setg(&error_fatal, "%s: %s must be searched within %s",
- __func__, dirname, root_dir);
+ error_report("%s: %s must be searched within %s",
+ __func__, dirname, root_dir);
+ exit(1);
}
parent_node = &dirname[strlen(SYSFS_DT_BASEDIR)];
d = opendir(dirname);
if (!d) {
- error_setg(&error_fatal, "%s cannot open %s", __func__, dirname);
- return;
+ error_report("%s cannot open %s", __func__, dirname);
+ exit(1);
}
while ((de = readdir(d)) != NULL) {
@@ -162,7 +163,8 @@ static void read_fstree(void *fdt, const char *dirname)
tmpnam = g_strdup_printf("%s/%s", dirname, de->d_name);
if (lstat(tmpnam, &st) < 0) {
- error_setg(&error_fatal, "%s cannot lstat %s", __func__, tmpnam);
+ error_report("%s cannot lstat %s", __func__, tmpnam);
+ exit(1);
}
if (S_ISREG(st.st_mode)) {
@@ -170,8 +172,9 @@ static void read_fstree(void *fdt, const char *dirname)
gsize len;
if (!g_file_get_contents(tmpnam, &val, &len, NULL)) {
- error_setg(&error_fatal, "%s not able to extract info from %s",
- __func__, tmpnam);
+ error_report("%s not able to extract info from %s",
+ __func__, tmpnam);
+ exit(1);
}
if (strlen(parent_node) > 0) {
@@ -206,9 +209,9 @@ void *load_device_tree_from_sysfs(void)
host_fdt = create_device_tree(&host_fdt_size);
read_fstree(host_fdt, SYSFS_DT_BASEDIR);
if (fdt_check_header(host_fdt)) {
- error_setg(&error_fatal,
- "%s host device tree extracted into memory is invalid",
- __func__);
+ error_report("%s host device tree extracted into memory is invalid",
+ __func__);
+ exit(1);
}
return host_fdt;
}
@@ -229,6 +232,61 @@ static int findnode_nofail(void *fdt, const char *node_path)
return offset;
}
+char **qemu_fdt_node_unit_path(void *fdt, const char *name, Error **errp)
+{
+ char *prefix = g_strdup_printf("%s@", name);
+ unsigned int path_len = 16, n = 0;
+ GSList *path_list = NULL, *iter;
+ const char *iter_name;
+ int offset, len, ret;
+ char **path_array;
+
+ offset = fdt_next_node(fdt, -1, NULL);
+
+ while (offset >= 0) {
+ iter_name = fdt_get_name(fdt, offset, &len);
+ if (!iter_name) {
+ offset = len;
+ break;
+ }
+ if (!strcmp(iter_name, name) || g_str_has_prefix(iter_name, prefix)) {
+ char *path;
+
+ path = g_malloc(path_len);
+ while ((ret = fdt_get_path(fdt, offset, path, path_len))
+ == -FDT_ERR_NOSPACE) {
+ path_len += 16;
+ path = g_realloc(path, path_len);
+ }
+ path_list = g_slist_prepend(path_list, path);
+ n++;
+ }
+ offset = fdt_next_node(fdt, offset, NULL);
+ }
+ g_free(prefix);
+
+ if (offset < 0 && offset != -FDT_ERR_NOTFOUND) {
+ error_setg(errp, "%s: abort parsing dt for %s node units: %s",
+ __func__, name, fdt_strerror(offset));
+ for (iter = path_list; iter; iter = iter->next) {
+ g_free(iter->data);
+ }
+ g_slist_free(path_list);
+ return NULL;
+ }
+
+ path_array = g_new(char *, n + 1);
+ path_array[n--] = NULL;
+
+ for (iter = path_list; iter; iter = iter->next) {
+ path_array[n--] = iter->data;
+ }
+
+ g_slist_free(path_list);
+
+ return path_array;
+}
+
char **qemu_fdt_node_path(void *fdt, const char *name, char *compat,
Error **errp)
{
diff --git a/hw/arm/boot.c b/hw/arm/boot.c
index 1e481662ad..e09201cc97 100644
--- a/hw/arm/boot.c
+++ b/hw/arm/boot.c
@@ -490,11 +490,13 @@ int arm_load_dtb(hwaddr addr, const struct arm_boot_info *binfo,
hwaddr addr_limit, AddressSpace *as)
{
void *fdt = NULL;
- int size, rc;
+ int size, rc, n = 0;
uint32_t acells, scells;
char *nodename;
unsigned int i;
hwaddr mem_base, mem_len;
+ char **node_path;
+ Error *err = NULL;
if (binfo->dtb_filename) {
char *filename;
@@ -546,12 +548,21 @@ int arm_load_dtb(hwaddr addr, const struct arm_boot_info *binfo,
goto fail;
}
+ /* nop all root nodes matching /memory or /memory@unit-address */
+ node_path = qemu_fdt_node_unit_path(fdt, "memory", &err);
+ if (err) {
+ error_report_err(err);
+ goto fail;
+ }
+ while (node_path[n]) {
+ if (g_str_has_prefix(node_path[n], "/memory")) {
+ qemu_fdt_nop_node(fdt, node_path[n]);
+ }
+ n++;
+ }
+ g_strfreev(node_path);
+
if (nb_numa_nodes > 0) {
- /*
- * Turn the /memory node created before into a NOP node, then create
- * /memory@addr nodes for all numa nodes respectively.
- */
- qemu_fdt_nop_node(fdt, "/memory");
mem_base = binfo->loader_start;
for (i = 0; i < nb_numa_nodes; i++) {
mem_len = numa_info[i].node_mem;
@@ -572,24 +583,18 @@ int arm_load_dtb(hwaddr addr, const struct arm_boot_info *binfo,
g_free(nodename);
}
} else {
- Error *err = NULL;
-
- rc = fdt_path_offset(fdt, "/memory");
- if (rc < 0) {
- qemu_fdt_add_subnode(fdt, "/memory");
- }
-
- if (!qemu_fdt_getprop(fdt, "/memory", "device_type", NULL, &err)) {
- qemu_fdt_setprop_string(fdt, "/memory", "device_type", "memory");
- }
+ nodename = g_strdup_printf("/memory@%" PRIx64, binfo->loader_start);
+ qemu_fdt_add_subnode(fdt, nodename);
+ qemu_fdt_setprop_string(fdt, nodename, "device_type", "memory");
- rc = qemu_fdt_setprop_sized_cells(fdt, "/memory", "reg",
+ rc = qemu_fdt_setprop_sized_cells(fdt, nodename, "reg",
acells, binfo->loader_start,
scells, binfo->ram_size);
if (rc < 0) {
- fprintf(stderr, "couldn't set /memory/reg\n");
+ fprintf(stderr, "couldn't set %s reg\n", nodename);
goto fail;
}
+ g_free(nodename);
}
rc = fdt_path_offset(fdt, "/chosen");
diff --git a/hw/arm/fsl-imx7.c b/hw/arm/fsl-imx7.c
index 26c1d27f7c..44fde03cbe 100644
--- a/hw/arm/fsl-imx7.c
+++ b/hw/arm/fsl-imx7.c
@@ -324,7 +324,7 @@ static void fsl_imx7_realize(DeviceState *dev, Error **errp)
FSL_IMX7_ECSPI4_ADDR,
};
- static const hwaddr FSL_IMX7_SPIn_IRQ[FSL_IMX7_NUM_ECSPIS] = {
+ static const int FSL_IMX7_SPIn_IRQ[FSL_IMX7_NUM_ECSPIS] = {
FSL_IMX7_ECSPI1_IRQ,
FSL_IMX7_ECSPI2_IRQ,
FSL_IMX7_ECSPI3_IRQ,
@@ -349,7 +349,7 @@ static void fsl_imx7_realize(DeviceState *dev, Error **errp)
FSL_IMX7_I2C4_ADDR,
};
- static const hwaddr FSL_IMX7_I2Cn_IRQ[FSL_IMX7_NUM_I2CS] = {
+ static const int FSL_IMX7_I2Cn_IRQ[FSL_IMX7_NUM_I2CS] = {
FSL_IMX7_I2C1_IRQ,
FSL_IMX7_I2C2_IRQ,
FSL_IMX7_I2C3_IRQ,
@@ -459,7 +459,7 @@ static void fsl_imx7_realize(DeviceState *dev, Error **errp)
/*
* SRC
*/
- create_unimplemented_device("sdma", FSL_IMX7_SRC_ADDR, FSL_IMX7_SRC_SIZE);
+ create_unimplemented_device("src", FSL_IMX7_SRC_ADDR, FSL_IMX7_SRC_SIZE);
/*
* Watchdog
@@ -515,7 +515,7 @@ static void fsl_imx7_realize(DeviceState *dev, Error **errp)
FSL_IMX7_USB3_ADDR,
};
- static const hwaddr FSL_IMX7_USBn_IRQ[FSL_IMX7_NUM_USBS] = {
+ static const int FSL_IMX7_USBn_IRQ[FSL_IMX7_NUM_USBS] = {
FSL_IMX7_USB1_IRQ,
FSL_IMX7_USB2_IRQ,
FSL_IMX7_USB3_IRQ,
diff --git a/hw/arm/mcimx7d-sabre.c b/hw/arm/mcimx7d-sabre.c
index 95fb409d9c..9c5f0e70c3 100644
--- a/hw/arm/mcimx7d-sabre.c
+++ b/hw/arm/mcimx7d-sabre.c
@@ -18,10 +18,8 @@
#include "hw/arm/fsl-imx7.h"
#include "hw/boards.h"
#include "sysemu/sysemu.h"
-#include "sysemu/device_tree.h"
#include "qemu/error-report.h"
#include "sysemu/qtest.h"
-#include "net/net.h"
typedef struct {
FslIMX7State soc;
diff --git a/hw/arm/sysbus-fdt.c b/hw/arm/sysbus-fdt.c
index 277ed872e7..0d4c75702c 100644
--- a/hw/arm/sysbus-fdt.c
+++ b/hw/arm/sysbus-fdt.c
@@ -92,16 +92,20 @@ static void copy_properties_from_host(HostProperty *props, int nb_props,
r = qemu_fdt_getprop(host_fdt, node_path,
props[i].name,
&prop_len,
- props[i].optional ? &err : &error_fatal);
+ &err);
if (r) {
qemu_fdt_setprop(guest_fdt, nodename,
props[i].name, r, prop_len);
} else {
- if (prop_len != -FDT_ERR_NOTFOUND) {
- /* optional property not returned although property exists */
- error_report_err(err);
- } else {
+ if (props[i].optional && prop_len == -FDT_ERR_NOTFOUND) {
+ /* optional property does not exist */
error_free(err);
+ } else {
+ error_report_err(err);
+ }
+ if (!props[i].optional) {
+ /* mandatory property not found: bail out */
+ exit(1);
}
}
}
@@ -138,9 +142,9 @@ static void fdt_build_clock_node(void *host_fdt, void *guest_fdt,
node_offset = fdt_node_offset_by_phandle(host_fdt, host_phandle);
if (node_offset <= 0) {
- error_setg(&error_fatal,
- "not able to locate clock handle %d in host device tree",
- host_phandle);
+ error_report("not able to locate clock handle %d in host device tree",
+ host_phandle);
+ exit(1);
}
node_path = g_malloc(path_len);
while ((ret = fdt_get_path(host_fdt, node_offset, node_path, path_len))
@@ -149,16 +153,16 @@ static void fdt_build_clock_node(void *host_fdt, void *guest_fdt,
node_path = g_realloc(node_path, path_len);
}
if (ret < 0) {
- error_setg(&error_fatal,
- "not able to retrieve node path for clock handle %d",
- host_phandle);
+ error_report("not able to retrieve node path for clock handle %d",
+ host_phandle);
+ exit(1);
}
r = qemu_fdt_getprop(host_fdt, node_path, "compatible", &prop_len,
&error_fatal);
if (strcmp(r, "fixed-clock")) {
- error_setg(&error_fatal,
- "clock handle %d is not a fixed clock", host_phandle);
+ error_report("clock handle %d is not a fixed clock", host_phandle);
+ exit(1);
}
nodename = strrchr(node_path, '/');
@@ -301,34 +305,37 @@ static int add_amd_xgbe_fdt_node(SysBusDevice *sbdev, void *opaque)
dt_name = sysfs_to_dt_name(vbasedev->name);
if (!dt_name) {
- error_setg(&error_fatal, "%s incorrect sysfs device name %s",
- __func__, vbasedev->name);
+ error_report("%s incorrect sysfs device name %s",
+ __func__, vbasedev->name);
+ exit(1);
}
node_path = qemu_fdt_node_path(host_fdt, dt_name, vdev->compat,
&error_fatal);
if (!node_path || !node_path[0]) {
- error_setg(&error_fatal, "%s unable to retrieve node path for %s/%s",
- __func__, dt_name, vdev->compat);
+ error_report("%s unable to retrieve node path for %s/%s",
+ __func__, dt_name, vdev->compat);
+ exit(1);
}
if (node_path[1]) {
- error_setg(&error_fatal, "%s more than one node matching %s/%s!",
- __func__, dt_name, vdev->compat);
+ error_report("%s more than one node matching %s/%s!",
+ __func__, dt_name, vdev->compat);
+ exit(1);
}
g_free(dt_name);
if (vbasedev->num_regions != 5) {
- error_setg(&error_fatal, "%s Does the host dt node combine XGBE/PHY?",
- __func__);
+ error_report("%s Does the host dt node combine XGBE/PHY?", __func__);
+ exit(1);
}
/* generate nodes for DMA_CLK and PTP_CLK */
r = qemu_fdt_getprop(host_fdt, node_path[0], "clocks",
&prop_len, &error_fatal);
if (prop_len != 8) {
- error_setg(&error_fatal, "%s clocks property should contain 2 handles",
- __func__);
+ error_report("%s clocks property should contain 2 handles", __func__);
+ exit(1);
}
host_clock_phandles = (uint32_t *)r;
guest_clock_phandles[0] = qemu_fdt_alloc_phandle(guest_fdt);
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 742f68afca..281ddcdf6e 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -204,13 +204,8 @@ static void create_fdt(VirtMachineState *vms)
qemu_fdt_setprop_cell(fdt, "/", "#address-cells", 0x2);
qemu_fdt_setprop_cell(fdt, "/", "#size-cells", 0x2);
- /*
- * /chosen and /memory nodes must exist for load_dtb
- * to fill in necessary properties later
- */
+ /* /chosen must exist for load_dtb to fill in necessary properties later */
qemu_fdt_add_subnode(fdt, "/chosen");
- qemu_fdt_add_subnode(fdt, "/memory");
- qemu_fdt_setprop_string(fdt, "/memory", "device_type", "memory");
/* Clock node, for the benefit of the UART. The kernel device tree
* binding documentation claims the PL011 node clock properties are
@@ -369,58 +364,72 @@ static void fdt_add_cpu_nodes(const VirtMachineState *vms)
static void fdt_add_its_gic_node(VirtMachineState *vms)
{
+ char *nodename;
+
vms->msi_phandle = qemu_fdt_alloc_phandle(vms->fdt);
- qemu_fdt_add_subnode(vms->fdt, "/intc/its");
- qemu_fdt_setprop_string(vms->fdt, "/intc/its", "compatible",
+ nodename = g_strdup_printf("/intc/its@%" PRIx64,
+ vms->memmap[VIRT_GIC_ITS].base);
+ qemu_fdt_add_subnode(vms->fdt, nodename);
+ qemu_fdt_setprop_string(vms->fdt, nodename, "compatible",
"arm,gic-v3-its");
- qemu_fdt_setprop(vms->fdt, "/intc/its", "msi-controller", NULL, 0);
- qemu_fdt_setprop_sized_cells(vms->fdt, "/intc/its", "reg",
+ qemu_fdt_setprop(vms->fdt, nodename, "msi-controller", NULL, 0);
+ qemu_fdt_setprop_sized_cells(vms->fdt, nodename, "reg",
2, vms->memmap[VIRT_GIC_ITS].base,
2, vms->memmap[VIRT_GIC_ITS].size);
- qemu_fdt_setprop_cell(vms->fdt, "/intc/its", "phandle", vms->msi_phandle);
+ qemu_fdt_setprop_cell(vms->fdt, nodename, "phandle", vms->msi_phandle);
+ g_free(nodename);
}
static void fdt_add_v2m_gic_node(VirtMachineState *vms)
{
+ char *nodename;
+
+ nodename = g_strdup_printf("/intc/v2m@%" PRIx64,
+ vms->memmap[VIRT_GIC_V2M].base);
vms->msi_phandle = qemu_fdt_alloc_phandle(vms->fdt);
- qemu_fdt_add_subnode(vms->fdt, "/intc/v2m");
- qemu_fdt_setprop_string(vms->fdt, "/intc/v2m", "compatible",
+ qemu_fdt_add_subnode(vms->fdt, nodename);
+ qemu_fdt_setprop_string(vms->fdt, nodename, "compatible",
"arm,gic-v2m-frame");
- qemu_fdt_setprop(vms->fdt, "/intc/v2m", "msi-controller", NULL, 0);
- qemu_fdt_setprop_sized_cells(vms->fdt, "/intc/v2m", "reg",
+ qemu_fdt_setprop(vms->fdt, nodename, "msi-controller", NULL, 0);
+ qemu_fdt_setprop_sized_cells(vms->fdt, nodename, "reg",
2, vms->memmap[VIRT_GIC_V2M].base,
2, vms->memmap[VIRT_GIC_V2M].size);
- qemu_fdt_setprop_cell(vms->fdt, "/intc/v2m", "phandle", vms->msi_phandle);
+ qemu_fdt_setprop_cell(vms->fdt, nodename, "phandle", vms->msi_phandle);
+ g_free(nodename);
}
static void fdt_add_gic_node(VirtMachineState *vms)
{
+ char *nodename;
+
vms->gic_phandle = qemu_fdt_alloc_phandle(vms->fdt);
qemu_fdt_setprop_cell(vms->fdt, "/", "interrupt-parent", vms->gic_phandle);
- qemu_fdt_add_subnode(vms->fdt, "/intc");
- qemu_fdt_setprop_cell(vms->fdt, "/intc", "#interrupt-cells", 3);
- qemu_fdt_setprop(vms->fdt, "/intc", "interrupt-controller", NULL, 0);
- qemu_fdt_setprop_cell(vms->fdt, "/intc", "#address-cells", 0x2);
- qemu_fdt_setprop_cell(vms->fdt, "/intc", "#size-cells", 0x2);
- qemu_fdt_setprop(vms->fdt, "/intc", "ranges", NULL, 0);
+ nodename = g_strdup_printf("/intc@%" PRIx64,
+ vms->memmap[VIRT_GIC_DIST].base);
+ qemu_fdt_add_subnode(vms->fdt, nodename);
+ qemu_fdt_setprop_cell(vms->fdt, nodename, "#interrupt-cells", 3);
+ qemu_fdt_setprop(vms->fdt, nodename, "interrupt-controller", NULL, 0);
+ qemu_fdt_setprop_cell(vms->fdt, nodename, "#address-cells", 0x2);
+ qemu_fdt_setprop_cell(vms->fdt, nodename, "#size-cells", 0x2);
+ qemu_fdt_setprop(vms->fdt, nodename, "ranges", NULL, 0);
if (vms->gic_version == 3) {
int nb_redist_regions = virt_gicv3_redist_region_count(vms);
- qemu_fdt_setprop_string(vms->fdt, "/intc", "compatible",
+ qemu_fdt_setprop_string(vms->fdt, nodename, "compatible",
"arm,gic-v3");
- qemu_fdt_setprop_cell(vms->fdt, "/intc",
+ qemu_fdt_setprop_cell(vms->fdt, nodename,
"#redistributor-regions", nb_redist_regions);
if (nb_redist_regions == 1) {
- qemu_fdt_setprop_sized_cells(vms->fdt, "/intc", "reg",
+ qemu_fdt_setprop_sized_cells(vms->fdt, nodename, "reg",
2, vms->memmap[VIRT_GIC_DIST].base,
2, vms->memmap[VIRT_GIC_DIST].size,
2, vms->memmap[VIRT_GIC_REDIST].base,
2, vms->memmap[VIRT_GIC_REDIST].size);
} else {
- qemu_fdt_setprop_sized_cells(vms->fdt, "/intc", "reg",
+ qemu_fdt_setprop_sized_cells(vms->fdt, nodename, "reg",
2, vms->memmap[VIRT_GIC_DIST].base,
2, vms->memmap[VIRT_GIC_DIST].size,
2, vms->memmap[VIRT_GIC_REDIST].base,
@@ -430,22 +439,23 @@ static void fdt_add_gic_node(VirtMachineState *vms)
}
if (vms->virt) {
- qemu_fdt_setprop_cells(vms->fdt, "/intc", "interrupts",
+ qemu_fdt_setprop_cells(vms->fdt, nodename, "interrupts",
GIC_FDT_IRQ_TYPE_PPI, ARCH_GICV3_MAINT_IRQ,
GIC_FDT_IRQ_FLAGS_LEVEL_HI);
}
} else {
/* 'cortex-a15-gic' means 'GIC v2' */
- qemu_fdt_setprop_string(vms->fdt, "/intc", "compatible",
+ qemu_fdt_setprop_string(vms->fdt, nodename, "compatible",
"arm,cortex-a15-gic");
- qemu_fdt_setprop_sized_cells(vms->fdt, "/intc", "reg",
+ qemu_fdt_setprop_sized_cells(vms->fdt, nodename, "reg",
2, vms->memmap[VIRT_GIC_DIST].base,
2, vms->memmap[VIRT_GIC_DIST].size,
2, vms->memmap[VIRT_GIC_CPU].base,
2, vms->memmap[VIRT_GIC_CPU].size);
}
- qemu_fdt_setprop_cell(vms->fdt, "/intc", "phandle", vms->gic_phandle);
+ qemu_fdt_setprop_cell(vms->fdt, nodename, "phandle", vms->gic_phandle);
+ g_free(nodename);
}
static void fdt_add_pmu_nodes(const VirtMachineState *vms)
diff --git a/hw/block/fdc.c b/hw/block/fdc.c
index cd29e27d8f..7c1c57f57f 100644
--- a/hw/block/fdc.c
+++ b/hw/block/fdc.c
@@ -396,16 +396,9 @@ static int pick_geometry(FDrive *drv)
nb_sectors,
FloppyDriveType_str(parse->drive));
}
+ assert(type_match != -1 && "misconfigured fd_format");
match = type_match;
}
-
- /* No match of any kind found -- fd_format is misconfigured, abort. */
- if (match == -1) {
- error_setg(&error_abort, "No candidate geometries present in table "
- " for floppy drive type '%s'",
- FloppyDriveType_str(drv->drive));
- }
-
parse = &(fd_formats[match]);
out:
diff --git a/hw/sd/bcm2835_sdhost.c b/hw/sd/bcm2835_sdhost.c
index ebf3b926c2..4df4de7d67 100644
--- a/hw/sd/bcm2835_sdhost.c
+++ b/hw/sd/bcm2835_sdhost.c
@@ -118,8 +118,6 @@ static void bcm2835_sdhost_send_command(BCM2835SDHostState *s)
goto error;
}
if (!(s->cmd & SDCMD_NO_RESPONSE)) {
-#define RWORD(n) (((uint32_t)rsp[n] << 24) | (rsp[n + 1] << 16) \
- | (rsp[n + 2] << 8) | rsp[n + 3])
if (rlen == 0 || (rlen == 4 && (s->cmd & SDCMD_LONG_RESPONSE))) {
goto error;
}
@@ -127,15 +125,14 @@ static void bcm2835_sdhost_send_command(BCM2835SDHostState *s)
goto error;
}
if (rlen == 4) {
- s->rsp[0] = RWORD(0);
+ s->rsp[0] = ldl_be_p(&rsp[0]);
s->rsp[1] = s->rsp[2] = s->rsp[3] = 0;
} else {
- s->rsp[0] = RWORD(12);
- s->rsp[1] = RWORD(8);
- s->rsp[2] = RWORD(4);
- s->rsp[3] = RWORD(0);
+ s->rsp[0] = ldl_be_p(&rsp[12]);
+ s->rsp[1] = ldl_be_p(&rsp[8]);
+ s->rsp[2] = ldl_be_p(&rsp[4]);
+ s->rsp[3] = ldl_be_p(&rsp[0]);
}
-#undef RWORD
}
/* We never really delay commands, so if this was a 'busywait' command
* then we've completed it now and can raise the interrupt.
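The "sdcard: Use the ldst API" commit (this hunk and the hw/sd hunks below)
replaces open-coded big-endian byte assembly with ldl_be_p() from QEMU's
include/qemu/bswap.h. A standalone sketch of the equivalence, with a local
re-implementation of the helper so it compiles outside QEMU:

    #include <stdint.h>
    #include <stdio.h>

    /* Local stand-in for QEMU's ldl_be_p(): load a 32-bit big-endian
     * value from a (possibly unaligned) pointer. */
    static uint32_t ldl_be_p(const void *ptr)
    {
        const uint8_t *p = ptr;
        return ((uint32_t)p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
    }

    int main(void)
    {
        uint8_t rsp[4] = { 0xde, 0xad, 0xbe, 0xef };

        /* Open-coded form, as removed by the patch... */
        uint32_t manual = ((uint32_t)rsp[0] << 24) | (rsp[1] << 16)
                          | (rsp[2] << 8) | rsp[3];

        /* ...and the helper: both print deadbeef. */
        printf("%08x %08x\n", manual, ldl_be_p(rsp));
        return 0;
    }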
diff --git a/hw/sd/core.c b/hw/sd/core.c
index 820345f704..107e6d71dd 100644
--- a/hw/sd/core.c
+++ b/hw/sd/core.c
@@ -91,7 +91,7 @@ int sdbus_do_command(SDBus *sdbus, SDRequest *req, uint8_t *response)
{
SDState *card = get_card(sdbus);
- trace_sdbus_command(sdbus_name(sdbus), req->cmd, req->arg, req->crc);
+ trace_sdbus_command(sdbus_name(sdbus), req->cmd, req->arg);
if (card) {
SDCardClass *sc = SD_CARD_GET_CLASS(card);
diff --git a/hw/sd/milkymist-memcard.c b/hw/sd/milkymist-memcard.c
index fcbccf54ea..df42aa1c54 100644
--- a/hw/sd/milkymist-memcard.c
+++ b/hw/sd/milkymist-memcard.c
@@ -100,8 +100,7 @@ static void memcard_sd_command(MilkymistMemcardState *s)
SDRequest req;
req.cmd = s->command[0] & 0x3f;
- req.arg = (s->command[1] << 24) | (s->command[2] << 16)
- | (s->command[3] << 8) | s->command[4];
+ req.arg = ldl_be_p(s->command + 1);
req.crc = s->command[5];
s->response[0] = req.cmd;
diff --git a/hw/sd/omap_mmc.c b/hw/sd/omap_mmc.c
index aa2a816f76..671264b650 100644
--- a/hw/sd/omap_mmc.c
+++ b/hw/sd/omap_mmc.c
@@ -163,8 +163,7 @@ static void omap_mmc_command(struct omap_mmc_s *host, int cmd, int dir,
CID_CSD_OVERWRITE;
if (host->sdio & (1 << 13))
mask |= AKE_SEQ_ERROR;
- rspstatus = (response[0] << 24) | (response[1] << 16) |
- (response[2] << 8) | (response[3] << 0);
+ rspstatus = ldl_be_p(response);
break;
case sd_r2:
@@ -182,8 +181,7 @@ static void omap_mmc_command(struct omap_mmc_s *host, int cmd, int dir,
}
rsplen = 4;
- rspstatus = (response[0] << 24) | (response[1] << 16) |
- (response[2] << 8) | (response[3] << 0);
+ rspstatus = ldl_be_p(response);
if (rspstatus & 0x80000000)
host->status &= 0xe000;
else
diff --git a/hw/sd/pl181.c b/hw/sd/pl181.c
index 1cc94dbfdf..3ad7e925c5 100644
--- a/hw/sd/pl181.c
+++ b/hw/sd/pl181.c
@@ -182,23 +182,20 @@ static void pl181_send_command(PL181State *s)
if (rlen < 0)
goto error;
if (s->cmd & PL181_CMD_RESPONSE) {
-#define RWORD(n) (((uint32_t)response[n] << 24) | (response[n + 1] << 16) \
- | (response[n + 2] << 8) | response[n + 3])
if (rlen == 0 || (rlen == 4 && (s->cmd & PL181_CMD_LONGRESP)))
goto error;
if (rlen != 4 && rlen != 16)
goto error;
- s->response[0] = RWORD(0);
+ s->response[0] = ldl_be_p(&response[0]);
if (rlen == 4) {
s->response[1] = s->response[2] = s->response[3] = 0;
} else {
- s->response[1] = RWORD(4);
- s->response[2] = RWORD(8);
- s->response[3] = RWORD(12) & ~1;
+ s->response[1] = ldl_be_p(&response[4]);
+ s->response[2] = ldl_be_p(&response[8]);
+ s->response[3] = ldl_be_p(&response[12]) & ~1;
}
DPRINTF("Response received\n");
s->status |= PL181_STATUS_CMDRESPEND;
-#undef RWORD
} else {
DPRINTF("Command sent\n");
s->status |= PL181_STATUS_CMDSENT;
diff --git a/hw/sd/sdhci.c b/hw/sd/sdhci.c
index 3017e5a95a..321d02d75a 100644
--- a/hw/sd/sdhci.c
+++ b/hw/sd/sdhci.c
@@ -342,17 +342,13 @@ static void sdhci_send_command(SDHCIState *s)
if (s->cmdreg & SDHC_CMD_RESPONSE) {
if (rlen == 4) {
- s->rspreg[0] = (response[0] << 24) | (response[1] << 16) |
- (response[2] << 8) | response[3];
+ s->rspreg[0] = ldl_be_p(response);
s->rspreg[1] = s->rspreg[2] = s->rspreg[3] = 0;
trace_sdhci_response4(s->rspreg[0]);
} else if (rlen == 16) {
- s->rspreg[0] = (response[11] << 24) | (response[12] << 16) |
- (response[13] << 8) | response[14];
- s->rspreg[1] = (response[7] << 24) | (response[8] << 16) |
- (response[9] << 8) | response[10];
- s->rspreg[2] = (response[3] << 24) | (response[4] << 16) |
- (response[5] << 8) | response[6];
+ s->rspreg[0] = ldl_be_p(&response[11]);
+ s->rspreg[1] = ldl_be_p(&response[7]);
+ s->rspreg[2] = ldl_be_p(&response[3]);
s->rspreg[3] = (response[0] << 16) | (response[1] << 8) |
response[2];
trace_sdhci_response16(s->rspreg[3], s->rspreg[2],
@@ -396,8 +392,7 @@ static void sdhci_end_transfer(SDHCIState *s)
trace_sdhci_end_transfer(request.cmd, request.arg);
sdbus_do_command(&s->sdbus, &request, response);
/* Auto CMD12 response goes to the upper Response register */
- s->rspreg[3] = (response[0] << 24) | (response[1] << 16) |
- (response[2] << 8) | response[3];
+ s->rspreg[3] = ldl_be_p(response);
}
s->prnsts &= ~(SDHC_DOING_READ | SDHC_DOING_WRITE |
diff --git a/hw/sd/ssi-sd.c b/hw/sd/ssi-sd.c
index 96542ecd62..95a143bfba 100644
--- a/hw/sd/ssi-sd.c
+++ b/hw/sd/ssi-sd.c
@@ -96,8 +96,7 @@ static uint32_t ssi_sd_transfer(SSISlave *dev, uint32_t val)
uint8_t longresp[16];
/* FIXME: Check CRC. */
request.cmd = s->cmd;
- request.arg = (s->cmdarg[0] << 24) | (s->cmdarg[1] << 16)
- | (s->cmdarg[2] << 8) | s->cmdarg[3];
+ request.arg = ldl_be_p(s->cmdarg);
DPRINTF("CMD%d arg 0x%08x\n", s->cmd, request.arg);
s->arglen = sdbus_do_command(&s->sdbus, &request, longresp);
if (s->arglen <= 0) {
@@ -122,8 +121,7 @@ static uint32_t ssi_sd_transfer(SSISlave *dev, uint32_t val)
/* CMD13 returns a 2-byte status word. Other commands
only return the first byte. */
s->arglen = (s->cmd == 13) ? 2 : 1;
- cardstatus = (longresp[0] << 24) | (longresp[1] << 16)
- | (longresp[2] << 8) | longresp[3];
+ cardstatus = ldl_be_p(longresp);
status = 0;
if (((cardstatus >> 9) & 0xf) < 4)
status |= SSI_SDR_IDLE;
diff --git a/hw/sd/trace-events b/hw/sd/trace-events
index bfd1d62efc..43cffab8b1 100644
--- a/hw/sd/trace-events
+++ b/hw/sd/trace-events
@@ -7,7 +7,7 @@ bcm2835_sdhost_edm_change(const char *why, uint32_t edm) "(%s) EDM now 0x%x"
bcm2835_sdhost_update_irq(uint32_t irq) "IRQ bits 0x%x\n"
# hw/sd/core.c
-sdbus_command(const char *bus_name, uint8_t cmd, uint32_t arg, uint8_t crc) "@%s CMD%02d arg 0x%08x crc 0x%02x"
+sdbus_command(const char *bus_name, uint8_t cmd, uint32_t arg) "@%s CMD%02d arg 0x%08x"
sdbus_read(const char *bus_name, uint8_t value) "@%s value 0x%02x"
sdbus_write(const char *bus_name, uint8_t value) "@%s value 0x%02x"
sdbus_set_voltage(const char *bus_name, uint16_t millivolts) "@%s %u (mV)"
diff --git a/include/sysemu/device_tree.h b/include/sysemu/device_tree.h
index e22e5bec9c..c16fd69bc0 100644
--- a/include/sysemu/device_tree.h
+++ b/include/sysemu/device_tree.h
@@ -43,6 +43,22 @@ void *load_device_tree_from_sysfs(void);
char **qemu_fdt_node_path(void *fdt, const char *name, char *compat,
Error **errp);
+/**
+ * qemu_fdt_node_unit_path: return the paths of nodes matching a given
+ * node-name, i.e. node-name and node-name@unit-address
+ * @fdt: pointer to the dt blob
+ * @name: node name
+ * @errp: handle to an error object
+ *
+ * returns a newly allocated NULL-terminated array of node paths.
+ * Use g_strfreev() to free it. If one or more nodes were found, the
+ * array contains the path of each node and the last element is NULL.
+ * If there is no error but no matching node was found, the returned
+ * array contains a single element equal to NULL. If an error was
+ * encountered when parsing the blob, the function returns NULL.
+ */
+char **qemu_fdt_node_unit_path(void *fdt, const char *name, Error **errp);
+
int qemu_fdt_setprop(void *fdt, const char *node_path,
const char *property, const void *val, int size);
int qemu_fdt_setprop_cell(void *fdt, const char *node_path,
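
The expected calling pattern for the new helper mirrors what arm_load_dtb()
does in the hw/arm/boot.c hunk earlier in this diff. A minimal sketch,
assuming fdt is a valid blob and the usual QEMU error/glib APIs:

    /* NOP every root node named "memory" or "memory@<unit-address>". */
    Error *err = NULL;
    char **node_path = qemu_fdt_node_unit_path(fdt, "memory", &err);
    int n = 0;

    if (err) {
        error_report_err(err);     /* parse error: returned array is NULL */
        return -1;
    }
    while (node_path[n]) {         /* array is NULL-terminated */
        if (g_str_has_prefix(node_path[n], "/memory")) {
            qemu_fdt_nop_node(fdt, node_path[n]);
        }
        n++;
    }
    g_strfreev(node_path);         /* frees each path and the array itself */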
diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 13bc78d0c8..942a1b661f 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -583,7 +583,9 @@ static uint32_t get_elf_hwcap(void)
ARM_HWCAP_A64_FPHP | ARM_HWCAP_A64_ASIMDHP);
GET_FEATURE(ARM_FEATURE_V8_ATOMICS, ARM_HWCAP_A64_ATOMICS);
GET_FEATURE(ARM_FEATURE_V8_RDM, ARM_HWCAP_A64_ASIMDRDM);
+ GET_FEATURE(ARM_FEATURE_V8_DOTPROD, ARM_HWCAP_A64_ASIMDDP);
GET_FEATURE(ARM_FEATURE_V8_FCMA, ARM_HWCAP_A64_FCMA);
+ GET_FEATURE(ARM_FEATURE_SVE, ARM_HWCAP_A64_SVE);
#undef GET_FEATURE
return hwcaps;
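
GET_FEATURE is a local macro defined just above this hunk in
linux-user/elfload.c; its definition is outside the diff, but from the call
sites it has roughly this shape (a reconstruction, not verbatim source):

    /* Set an AT_HWCAP bit when the CPU model advertises the feature. */
    #define GET_FEATURE(feat, hwcap)                \
        do {                                        \
            if (arm_feature(&cpu->env, feat)) {     \
                hwcaps |= hwcap;                    \
            }                                       \
        } while (0)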
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index 2ae4fffafb..82ff450f9a 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -164,6 +164,13 @@ static void arm_cpu_reset(CPUState *s)
env->cp15.sctlr_el[1] |= SCTLR_UCT | SCTLR_UCI | SCTLR_DZE;
/* and to the FP/Neon instructions */
env->cp15.cpacr_el1 = deposit64(env->cp15.cpacr_el1, 20, 2, 3);
+ /* and to the SVE instructions */
+ env->cp15.cpacr_el1 = deposit64(env->cp15.cpacr_el1, 16, 2, 3);
+ env->cp15.cptr_el[3] |= CPTR_EZ;
+ /* with maximum vector length */
+ env->vfp.zcr_el[1] = ARM_MAX_VQ - 1;
+ env->vfp.zcr_el[2] = ARM_MAX_VQ - 1;
+ env->vfp.zcr_el[3] = ARM_MAX_VQ - 1;
#else
/* Reset into the highest available EL */
if (arm_feature(env, ARM_FEATURE_EL3)) {
@@ -793,9 +800,20 @@ static void arm_cpu_realizefn(DeviceState *dev, Error **errp)
/* Some features automatically imply others: */
if (arm_feature(env, ARM_FEATURE_V8)) {
- set_feature(env, ARM_FEATURE_V7);
+ set_feature(env, ARM_FEATURE_V7VE);
+ }
+ if (arm_feature(env, ARM_FEATURE_V7VE)) {
+ /* v7 Virtualization Extensions. In real hardware this implies
+ * EL2 and also the presence of the Security Extensions.
+ * For QEMU, for backwards-compatibility we implement some
+ * CPUs or CPU configs which have no actual EL2 or EL3 but do
+ * include the various other features that V7VE implies.
+ * Presence of EL2 itself is ARM_FEATURE_EL2, and of the
+ * Security Extensions is ARM_FEATURE_EL3.
+ */
set_feature(env, ARM_FEATURE_ARM_DIV);
set_feature(env, ARM_FEATURE_LPAE);
+ set_feature(env, ARM_FEATURE_V7);
}
if (arm_feature(env, ARM_FEATURE_V7)) {
set_feature(env, ARM_FEATURE_VAPA);
@@ -1255,6 +1273,7 @@ static void cortex_m3_initfn(Object *obj)
cpu->id_isar3 = 0x01111110;
cpu->id_isar4 = 0x01310102;
cpu->id_isar5 = 0x00000000;
+ cpu->id_isar6 = 0x00000000;
}
static void cortex_m4_initfn(Object *obj)
@@ -1281,6 +1300,7 @@ static void cortex_m4_initfn(Object *obj)
cpu->id_isar3 = 0x01111110;
cpu->id_isar4 = 0x01310102;
cpu->id_isar5 = 0x00000000;
+ cpu->id_isar6 = 0x00000000;
}
static void cortex_m33_initfn(Object *obj)
@@ -1309,6 +1329,7 @@ static void cortex_m33_initfn(Object *obj)
cpu->id_isar3 = 0x01111131;
cpu->id_isar4 = 0x01310132;
cpu->id_isar5 = 0x00000000;
+ cpu->id_isar6 = 0x00000000;
cpu->clidr = 0x00000000;
cpu->ctr = 0x8000c000;
}
@@ -1359,6 +1380,7 @@ static void cortex_r5_initfn(Object *obj)
cpu->id_isar3 = 0x01112131;
cpu->id_isar4 = 0x0010142;
cpu->id_isar5 = 0x0;
+ cpu->id_isar6 = 0x0;
cpu->mp_is_up = true;
cpu->pmsav7_dregion = 16;
define_arm_cp_regs(cpu, cortexr5_cp_reginfo);
@@ -1517,15 +1539,13 @@ static void cortex_a7_initfn(Object *obj)
ARMCPU *cpu = ARM_CPU(obj);
cpu->dtb_compatible = "arm,cortex-a7";
- set_feature(&cpu->env, ARM_FEATURE_V7);
+ set_feature(&cpu->env, ARM_FEATURE_V7VE);
set_feature(&cpu->env, ARM_FEATURE_VFP4);
set_feature(&cpu->env, ARM_FEATURE_NEON);
set_feature(&cpu->env, ARM_FEATURE_THUMB2EE);
- set_feature(&cpu->env, ARM_FEATURE_ARM_DIV);
set_feature(&cpu->env, ARM_FEATURE_GENERIC_TIMER);
set_feature(&cpu->env, ARM_FEATURE_DUMMY_C15_REGS);
set_feature(&cpu->env, ARM_FEATURE_CBAR_RO);
- set_feature(&cpu->env, ARM_FEATURE_LPAE);
set_feature(&cpu->env, ARM_FEATURE_EL3);
cpu->kvm_target = QEMU_KVM_ARM_TARGET_CORTEX_A7;
cpu->midr = 0x410fc075;
@@ -1562,15 +1582,13 @@ static void cortex_a15_initfn(Object *obj)
ARMCPU *cpu = ARM_CPU(obj);
cpu->dtb_compatible = "arm,cortex-a15";
- set_feature(&cpu->env, ARM_FEATURE_V7);
+ set_feature(&cpu->env, ARM_FEATURE_V7VE);
set_feature(&cpu->env, ARM_FEATURE_VFP4);
set_feature(&cpu->env, ARM_FEATURE_NEON);
set_feature(&cpu->env, ARM_FEATURE_THUMB2EE);
- set_feature(&cpu->env, ARM_FEATURE_ARM_DIV);
set_feature(&cpu->env, ARM_FEATURE_GENERIC_TIMER);
set_feature(&cpu->env, ARM_FEATURE_DUMMY_C15_REGS);
set_feature(&cpu->env, ARM_FEATURE_CBAR_RO);
- set_feature(&cpu->env, ARM_FEATURE_LPAE);
set_feature(&cpu->env, ARM_FEATURE_EL3);
cpu->kvm_target = QEMU_KVM_ARM_TARGET_CORTEX_A15;
cpu->midr = 0x412fc0f1;
@@ -1789,15 +1807,13 @@ static void arm_max_initfn(Object *obj)
* since we don't correctly set the ID registers to advertise them,
*/
set_feature(&cpu->env, ARM_FEATURE_V8);
- set_feature(&cpu->env, ARM_FEATURE_VFP4);
- set_feature(&cpu->env, ARM_FEATURE_NEON);
- set_feature(&cpu->env, ARM_FEATURE_THUMB2EE);
set_feature(&cpu->env, ARM_FEATURE_V8_AES);
set_feature(&cpu->env, ARM_FEATURE_V8_SHA1);
set_feature(&cpu->env, ARM_FEATURE_V8_SHA256);
set_feature(&cpu->env, ARM_FEATURE_V8_PMULL);
set_feature(&cpu->env, ARM_FEATURE_CRC);
set_feature(&cpu->env, ARM_FEATURE_V8_RDM);
+ set_feature(&cpu->env, ARM_FEATURE_V8_DOTPROD);
set_feature(&cpu->env, ARM_FEATURE_V8_FCMA);
#endif
}
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index a4507a2d6f..e310ffc29d 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -813,6 +813,7 @@ struct ARMCPU {
uint32_t id_isar3;
uint32_t id_isar4;
uint32_t id_isar5;
+ uint32_t id_isar6;
uint64_t id_aa64pfr0;
uint64_t id_aa64pfr1;
uint64_t id_aa64dfr0;
@@ -1442,6 +1443,7 @@ enum arm_features {
ARM_FEATURE_OMAPCP, /* OMAP specific CP15 ops handling. */
ARM_FEATURE_THUMB2EE,
ARM_FEATURE_V7MP, /* v7 Multiprocessing Extensions */
+ ARM_FEATURE_V7VE, /* v7 Virtualization Extensions (non-EL2 parts) */
ARM_FEATURE_V4T,
ARM_FEATURE_V5,
ARM_FEATURE_STRONGARM,
@@ -1480,6 +1482,7 @@ enum arm_features {
ARM_FEATURE_V8_SM4, /* implements SM4 part of v8 Crypto Extensions */
ARM_FEATURE_V8_ATOMICS, /* ARMv8.1-Atomics feature */
ARM_FEATURE_V8_RDM, /* implements v8.1 simd round multiply */
+ ARM_FEATURE_V8_DOTPROD, /* implements v8.2 simd dot product */
ARM_FEATURE_V8_FP16, /* implements v8.2 half-precision float */
ARM_FEATURE_V8_FCMA, /* has complex number part of v8.3 extensions. */
ARM_FEATURE_M_MAIN, /* M profile Main Extension */
diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
index c50dcd4077..d0581d59d8 100644
--- a/target/arm/cpu64.c
+++ b/target/arm/cpu64.c
@@ -139,6 +139,7 @@ static void aarch64_a57_initfn(Object *obj)
cpu->id_isar3 = 0x01112131;
cpu->id_isar4 = 0x00011142;
cpu->id_isar5 = 0x00011121;
+ cpu->id_isar6 = 0;
cpu->id_aa64pfr0 = 0x00002222;
cpu->id_aa64dfr0 = 0x10305106;
cpu->pmceid0 = 0x00000000;
@@ -199,6 +200,7 @@ static void aarch64_a53_initfn(Object *obj)
cpu->id_isar3 = 0x01112131;
cpu->id_isar4 = 0x00011142;
cpu->id_isar5 = 0x00011121;
+ cpu->id_isar6 = 0;
cpu->id_aa64pfr0 = 0x00002222;
cpu->id_aa64dfr0 = 0x10305106;
cpu->id_aa64isar0 = 0x00011120;
@@ -235,23 +237,16 @@ static void aarch64_max_initfn(Object *obj)
* whereas the architecture requires them to be present in both if
* present in either.
*/
- set_feature(&cpu->env, ARM_FEATURE_V8);
- set_feature(&cpu->env, ARM_FEATURE_VFP4);
- set_feature(&cpu->env, ARM_FEATURE_NEON);
- set_feature(&cpu->env, ARM_FEATURE_AARCH64);
- set_feature(&cpu->env, ARM_FEATURE_V8_AES);
- set_feature(&cpu->env, ARM_FEATURE_V8_SHA1);
- set_feature(&cpu->env, ARM_FEATURE_V8_SHA256);
set_feature(&cpu->env, ARM_FEATURE_V8_SHA512);
set_feature(&cpu->env, ARM_FEATURE_V8_SHA3);
set_feature(&cpu->env, ARM_FEATURE_V8_SM3);
set_feature(&cpu->env, ARM_FEATURE_V8_SM4);
- set_feature(&cpu->env, ARM_FEATURE_V8_PMULL);
- set_feature(&cpu->env, ARM_FEATURE_CRC);
set_feature(&cpu->env, ARM_FEATURE_V8_ATOMICS);
set_feature(&cpu->env, ARM_FEATURE_V8_RDM);
+ set_feature(&cpu->env, ARM_FEATURE_V8_DOTPROD);
set_feature(&cpu->env, ARM_FEATURE_V8_FP16);
set_feature(&cpu->env, ARM_FEATURE_V8_FCMA);
+ set_feature(&cpu->env, ARM_FEATURE_SVE);
/* For usermode -cpu max we can use a larger and more efficient DCZ
* blocksize since we don't have to follow what the hardware does.
*/
diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 2e76084992..023952a9a4 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -274,6 +274,11 @@ DEF_HELPER_FLAGS_3(sve_clr_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(sve_clr_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(sve_clr_d, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_movz_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_movz_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_movz_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_movz_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
DEF_HELPER_FLAGS_4(sve_asr_zpzi_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(sve_asr_zpzi_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(sve_asr_zpzi_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
@@ -719,3 +724,680 @@ DEF_HELPER_FLAGS_5(gvec_rsqrts_s, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(gvec_rsqrts_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_faddv_h, TCG_CALL_NO_RWG,
+ i64, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_faddv_s, TCG_CALL_NO_RWG,
+ i64, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_faddv_d, TCG_CALL_NO_RWG,
+ i64, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_fmaxnmv_h, TCG_CALL_NO_RWG,
+ i64, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_fmaxnmv_s, TCG_CALL_NO_RWG,
+ i64, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_fmaxnmv_d, TCG_CALL_NO_RWG,
+ i64, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_fminnmv_h, TCG_CALL_NO_RWG,
+ i64, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_fminnmv_s, TCG_CALL_NO_RWG,
+ i64, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_fminnmv_d, TCG_CALL_NO_RWG,
+ i64, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_fmaxv_h, TCG_CALL_NO_RWG,
+ i64, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_fmaxv_s, TCG_CALL_NO_RWG,
+ i64, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_fmaxv_d, TCG_CALL_NO_RWG,
+ i64, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_fminv_h, TCG_CALL_NO_RWG,
+ i64, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_fminv_s, TCG_CALL_NO_RWG,
+ i64, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_fminv_d, TCG_CALL_NO_RWG,
+ i64, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_fadda_h, TCG_CALL_NO_RWG,
+ i64, i64, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fadda_s, TCG_CALL_NO_RWG,
+ i64, i64, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fadda_d, TCG_CALL_NO_RWG,
+ i64, i64, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_fcmge0_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcmge0_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcmge0_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_fcmgt0_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcmgt0_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcmgt0_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_fcmlt0_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcmlt0_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcmlt0_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_fcmle0_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcmle0_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcmle0_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_fcmeq0_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcmeq0_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcmeq0_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_fcmne0_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcmne0_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcmne0_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fadd_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fadd_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fadd_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fsub_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fsub_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fsub_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fmul_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmul_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmul_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fdiv_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fdiv_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fdiv_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fmin_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmin_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmin_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fmax_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmax_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmax_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fminnum_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fminnum_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fminnum_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fmaxnum_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmaxnum_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmaxnum_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fabd_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fabd_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fabd_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fscalbn_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fscalbn_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fscalbn_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fmulx_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmulx_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmulx_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fadds_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fadds_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fadds_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, i64, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fsubs_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fsubs_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fsubs_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, i64, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fmuls_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmuls_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmuls_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, i64, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fsubrs_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fsubrs_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fsubrs_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, i64, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fmaxnms_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmaxnms_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmaxnms_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, i64, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fminnms_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fminnms_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fminnms_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, i64, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fmaxs_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmaxs_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmaxs_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, i64, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fmins_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmins_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, i64, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fmins_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, i64, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_fcvt_sh, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvt_dh, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvt_hs, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvt_ds, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvt_hd, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvt_sd, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_fcvtzs_hh, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvtzs_hs, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvtzs_ss, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvtzs_ds, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvtzs_hd, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvtzs_sd, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvtzs_dd, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_fcvtzu_hh, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvtzu_hs, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvtzu_ss, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvtzu_ds, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvtzu_hd, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvtzu_sd, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fcvtzu_dd, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_frint_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_frint_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_frint_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_frintx_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_frintx_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_frintx_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_frecpx_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_frecpx_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_frecpx_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_fsqrt_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fsqrt_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_fsqrt_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_scvt_hh, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_scvt_sh, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_scvt_dh, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_scvt_ss, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_scvt_sd, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_scvt_ds, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_scvt_dd, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_ucvt_hh, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_ucvt_sh, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_ucvt_dh, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_ucvt_ss, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_ucvt_sd, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_ucvt_ds, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_ucvt_dd, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fcmge_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fcmge_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fcmge_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fcmgt_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fcmgt_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fcmgt_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fcmeq_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fcmeq_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fcmeq_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fcmne_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fcmne_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fcmne_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fcmuo_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fcmuo_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fcmuo_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_facge_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_facge_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_facge_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_facgt_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_facgt_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_facgt_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve_fcadd_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fcadd_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve_fcadd_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve_fmla_zpzzz_h, TCG_CALL_NO_RWG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_fmla_zpzzz_s, TCG_CALL_NO_RWG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_fmla_zpzzz_d, TCG_CALL_NO_RWG, void, env, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve_fmls_zpzzz_h, TCG_CALL_NO_RWG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_fmls_zpzzz_s, TCG_CALL_NO_RWG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_fmls_zpzzz_d, TCG_CALL_NO_RWG, void, env, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve_fnmla_zpzzz_h, TCG_CALL_NO_RWG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_fnmla_zpzzz_s, TCG_CALL_NO_RWG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_fnmla_zpzzz_d, TCG_CALL_NO_RWG, void, env, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve_fnmls_zpzzz_h, TCG_CALL_NO_RWG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_fnmls_zpzzz_s, TCG_CALL_NO_RWG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_fnmls_zpzzz_d, TCG_CALL_NO_RWG, void, env, ptr, i32)
+
+DEF_HELPER_FLAGS_3(sve_fcmla_zpzzz_h, TCG_CALL_NO_RWG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_fcmla_zpzzz_s, TCG_CALL_NO_RWG, void, env, ptr, i32)
+DEF_HELPER_FLAGS_3(sve_fcmla_zpzzz_d, TCG_CALL_NO_RWG, void, env, ptr, i32)
+
+DEF_HELPER_FLAGS_5(sve_ftmad_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_ftmad_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_ftmad_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sve_ld1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld2bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld3bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld4bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ld1hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld2hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld3hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld4hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ld1ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld2ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld3ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld4ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ld1dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld2dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld3dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld4dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ld1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1bdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1bhs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1bss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1bds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ld1hsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1hdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1hss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1hds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ld1sdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ld1sds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ldff1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldff1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldff1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldff1bdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldff1bhs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldff1bss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldff1bds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ldff1hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldff1hsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldff1hdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldff1hss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldff1hds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ldff1ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldff1sdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldff1sds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ldff1dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ldnf1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldnf1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldnf1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldnf1bdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldnf1bhs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldnf1bss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldnf1bds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ldnf1hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldnf1hsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldnf1hdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldnf1hss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldnf1hds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ldnf1ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldnf1sdu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_ldnf1sds_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_ldnf1dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_st1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st2bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st3bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st4bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_st1hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st2hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st3hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st4hh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_st1ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st2ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st3ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st4ss_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_st1dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st2dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st3dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st4dd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_st1bh_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st1bs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st1bd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_st1hs_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+DEF_HELPER_FLAGS_4(sve_st1hd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_4(sve_st1sd_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32)
+
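+/* (Editorial note on the gather helper names, assuming the same scheme:
+ * sve_ld<msz><esz><u|s> loads <msz>-sized data, zero/sign extended into
+ * <esz>-sized elements; the suffix gives the offset form: _zsu/_zss are
+ * 32-bit vector offsets, zero/sign extended, _zd are 64-bit offsets.
+ * E.g. sve_ldbdu_zss loads bytes, zero-extends them to 64 bits, and uses
+ * sign-extended 32-bit offsets.) */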
+DEF_HELPER_FLAGS_6(sve_ldbsu_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldhsu_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldssu_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldbss_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldhss_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_6(sve_ldbsu_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldhsu_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldssu_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldbss_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldhss_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_6(sve_ldbdu_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldhdu_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldsdu_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldddu_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldbds_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldhds_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldsds_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_6(sve_ldbdu_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldhdu_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldsdu_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldddu_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldbds_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldhds_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldsds_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_6(sve_ldbdu_zd, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldhdu_zd, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldsdu_zd, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldddu_zd, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldbds_zd, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldhds_zd, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldsds_zd, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_6(sve_ldffbsu_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffhsu_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffssu_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffbss_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffhss_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_6(sve_ldffbsu_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffhsu_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffssu_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffbss_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffhss_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_6(sve_ldffbdu_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffhdu_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffsdu_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffddu_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffbds_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffhds_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffsds_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_6(sve_ldffbdu_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffhdu_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffsdu_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffddu_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffbds_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffhds_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffsds_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_6(sve_ldffbdu_zd, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffhdu_zd, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffsdu_zd, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffddu_zd, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffbds_zd, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffhds_zd, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_ldffsds_zd, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_6(sve_stbs_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_sths_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_stss_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_6(sve_stbs_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_sths_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_stss_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_6(sve_stbd_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_sthd_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_stsd_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_stdd_zsu, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_6(sve_stbd_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_sthd_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_stsd_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_stdd_zss, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+
+DEF_HELPER_FLAGS_6(sve_stbd_zd, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_sthd_zd, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_stsd_zd, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
+DEF_HELPER_FLAGS_6(sve_stdd_zd, TCG_CALL_NO_WG,
+ void, env, ptr, ptr, ptr, tl, i32)
diff --git a/target/arm/helper.c b/target/arm/helper.c
index 3c6a4c565b..a2ac96084e 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -1404,7 +1404,7 @@ static const ARMCPRegInfo v7_cp_reginfo[] = {
.writefn = pmuserenr_write, .raw_writefn = raw_write },
{ .name = "PMINTENSET", .cp = 15, .crn = 9, .crm = 14, .opc1 = 0, .opc2 = 1,
.access = PL1_RW, .accessfn = access_tpm,
- .type = ARM_CP_ALIAS,
+ .type = ARM_CP_ALIAS | ARM_CP_IO,
.fieldoffset = offsetoflow32(CPUARMState, cp15.c9_pminten),
.resetvalue = 0,
.writefn = pmintenset_write, .raw_writefn = raw_write },
@@ -2167,11 +2167,32 @@ static const ARMCPRegInfo generic_timer_cp_reginfo[] = {
};
#else
-/* In user-mode none of the generic timer registers are accessible,
- * and their implementation depends on QEMU_CLOCK_VIRTUAL and qdev gpio outputs,
- * so instead just don't register any of them.
+
+/* In user-mode most of the generic timer registers are inaccessible;
+ * however, modern kernels (4.12+) allow access to cntvct_el0.
+ */
+
+static uint64_t gt_virt_cnt_read(CPUARMState *env, const ARMCPRegInfo *ri)
+{
+    /* Currently we have no support for QEMUTimer in linux-user, so we
+     * can't call gt_get_countervalue(env); instead we call the
+     * lower-level functions directly.
+     */
+ return cpu_get_clock() / GTIMER_SCALE;
+}
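+
+/* Illustrative (editorial): under linux-user a guest instruction such as
+ *     mrs x0, cntvct_el0
+ * is routed to the readfn above via the CNTVCT_EL0 reginfo below, rather
+ * than raising SIGILL.
+ */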
+
static const ARMCPRegInfo generic_timer_cp_reginfo[] = {
+ { .name = "CNTFRQ_EL0", .state = ARM_CP_STATE_AA64,
+ .opc0 = 3, .opc1 = 3, .crn = 14, .crm = 0, .opc2 = 0,
+ .type = ARM_CP_CONST, .access = PL0_R /* no PL1_RW in linux-user */,
+ .fieldoffset = offsetof(CPUARMState, cp15.c14_cntfrq),
+ .resetvalue = NANOSECONDS_PER_SECOND / GTIMER_SCALE,
+ },
+ { .name = "CNTVCT_EL0", .state = ARM_CP_STATE_AA64,
+ .opc0 = 3, .opc1 = 3, .crn = 14, .crm = 0, .opc2 = 2,
+ .access = PL0_R, .type = ARM_CP_NO_RAW | ARM_CP_IO,
+ .readfn = gt_virt_cnt_read,
+ },
REGINFO_SENTINEL
};
@@ -4393,7 +4414,7 @@ static void zcr_write(CPUARMState *env, const ARMCPRegInfo *ri,
static const ARMCPRegInfo zcr_el1_reginfo = {
.name = "ZCR_EL1", .state = ARM_CP_STATE_AA64,
.opc0 = 3, .opc1 = 0, .crn = 1, .crm = 2, .opc2 = 0,
- .access = PL1_RW, .type = ARM_CP_SVE | ARM_CP_FPU,
+ .access = PL1_RW, .type = ARM_CP_SVE,
.fieldoffset = offsetof(CPUARMState, vfp.zcr_el[1]),
.writefn = zcr_write, .raw_writefn = raw_write
};
@@ -4401,7 +4422,7 @@ static const ARMCPRegInfo zcr_el1_reginfo = {
static const ARMCPRegInfo zcr_el2_reginfo = {
.name = "ZCR_EL2", .state = ARM_CP_STATE_AA64,
.opc0 = 3, .opc1 = 4, .crn = 1, .crm = 2, .opc2 = 0,
- .access = PL2_RW, .type = ARM_CP_SVE | ARM_CP_FPU,
+ .access = PL2_RW, .type = ARM_CP_SVE,
.fieldoffset = offsetof(CPUARMState, vfp.zcr_el[2]),
.writefn = zcr_write, .raw_writefn = raw_write
};
@@ -4409,14 +4430,14 @@ static const ARMCPRegInfo zcr_el2_reginfo = {
static const ARMCPRegInfo zcr_no_el2_reginfo = {
.name = "ZCR_EL2", .state = ARM_CP_STATE_AA64,
.opc0 = 3, .opc1 = 4, .crn = 1, .crm = 2, .opc2 = 0,
- .access = PL2_RW, .type = ARM_CP_SVE | ARM_CP_FPU,
+ .access = PL2_RW, .type = ARM_CP_SVE,
.readfn = arm_cp_read_zero, .writefn = arm_cp_write_ignore
};
static const ARMCPRegInfo zcr_el3_reginfo = {
.name = "ZCR_EL3", .state = ARM_CP_STATE_AA64,
.opc0 = 3, .opc1 = 6, .crn = 1, .crm = 2, .opc2 = 0,
- .access = PL3_RW, .type = ARM_CP_SVE | ARM_CP_FPU,
+ .access = PL3_RW, .type = ARM_CP_SVE,
.fieldoffset = offsetof(CPUARMState, vfp.zcr_el[3]),
.writefn = zcr_write, .raw_writefn = raw_write
};
@@ -4851,11 +4872,10 @@ void register_cp_regs_for_features(ARMCPU *cpu)
.opc0 = 3, .opc1 = 0, .crn = 0, .crm = 2, .opc2 = 6,
.access = PL1_R, .type = ARM_CP_CONST,
.resetvalue = cpu->id_mmfr4 },
- /* 7 is as yet unallocated and must RAZ */
- { .name = "ID_ISAR7_RESERVED", .state = ARM_CP_STATE_BOTH,
+ { .name = "ID_ISAR6", .state = ARM_CP_STATE_BOTH,
.opc0 = 3, .opc1 = 0, .crn = 0, .crm = 2, .opc2 = 7,
.access = PL1_R, .type = ARM_CP_CONST,
- .resetvalue = 0 },
+ .resetvalue = cpu->id_isar6 },
REGINFO_SENTINEL
};
define_arm_cp_regs(cpu, v6_idregs);
@@ -11407,7 +11427,7 @@ ftype HELPER(name)(uint32_t x, void *fpstp) \
}
#define CONV_FTOI(name, ftype, fsz, sign, round) \
-uint32_t HELPER(name)(ftype x, void *fpstp) \
+sign##int32_t HELPER(name)(ftype x, void *fpstp) \
{ \
float_status *fpst = fpstp; \
if (float##fsz##_is_any_nan(x)) { \
diff --git a/target/arm/helper.h b/target/arm/helper.h
index 879a7229e9..59e8c3bd1b 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -134,12 +134,12 @@ DEF_HELPER_2(vfp_touid, i32, f64, ptr)
DEF_HELPER_2(vfp_touizh, i32, f16, ptr)
DEF_HELPER_2(vfp_touizs, i32, f32, ptr)
DEF_HELPER_2(vfp_touizd, i32, f64, ptr)
-DEF_HELPER_2(vfp_tosih, i32, f16, ptr)
-DEF_HELPER_2(vfp_tosis, i32, f32, ptr)
-DEF_HELPER_2(vfp_tosid, i32, f64, ptr)
-DEF_HELPER_2(vfp_tosizh, i32, f16, ptr)
-DEF_HELPER_2(vfp_tosizs, i32, f32, ptr)
-DEF_HELPER_2(vfp_tosizd, i32, f64, ptr)
+DEF_HELPER_2(vfp_tosih, s32, f16, ptr)
+DEF_HELPER_2(vfp_tosis, s32, f32, ptr)
+DEF_HELPER_2(vfp_tosid, s32, f64, ptr)
+DEF_HELPER_2(vfp_tosizh, s32, f16, ptr)
+DEF_HELPER_2(vfp_tosizs, s32, f32, ptr)
+DEF_HELPER_2(vfp_tosizd, s32, f64, ptr)
DEF_HELPER_3(vfp_toshs_round_to_zero, i32, f32, i32, ptr)
DEF_HELPER_3(vfp_tosls_round_to_zero, i32, f32, i32, ptr)
@@ -583,6 +583,16 @@ DEF_HELPER_FLAGS_5(gvec_qrdmlah_s32, TCG_CALL_NO_RWG,
DEF_HELPER_FLAGS_5(gvec_qrdmlsh_s32, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sdot_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_udot_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sdot_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_udot_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_sdot_idx_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_udot_idx_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sdot_idx_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_udot_idx_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
DEF_HELPER_FLAGS_5(gvec_fcaddh, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(gvec_fcadds, TCG_CALL_NO_RWG,
@@ -601,6 +611,14 @@ DEF_HELPER_FLAGS_5(gvec_fcmlas_idx, TCG_CALL_NO_RWG,
DEF_HELPER_FLAGS_5(gvec_fcmlad, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_frecpe_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_frecpe_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_frecpe_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_frsqrte_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_frsqrte_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_frsqrte_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
DEF_HELPER_FLAGS_5(gvec_fadd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(gvec_fadd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(gvec_fadd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
@@ -620,6 +638,20 @@ DEF_HELPER_FLAGS_5(gvec_ftsmul_s, TCG_CALL_NO_RWG,
DEF_HELPER_FLAGS_5(gvec_ftsmul_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmul_idx_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmul_idx_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmul_idx_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(gvec_fmla_idx_h, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(gvec_fmla_idx_s, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(gvec_fmla_idx_d, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+
#ifdef TARGET_AARCH64
#include "helper-a64.h"
#include "helper-sve.h"
diff --git a/target/arm/kvm32.c b/target/arm/kvm32.c
index 1740cda47d..4e91c11796 100644
--- a/target/arm/kvm32.c
+++ b/target/arm/kvm32.c
@@ -36,7 +36,7 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf)
* and then query that CPU for the relevant ID registers.
*/
int i, ret, fdarray[3];
- uint32_t midr, id_pfr0, id_isar0, mvfr1;
+ uint32_t midr, id_pfr0, mvfr1;
uint64_t features = 0;
/* Old kernels may not know about the PREFERRED_TARGET ioctl: however
* we know these will only support creating one kind of guest CPU,
@@ -60,11 +60,6 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf)
},
{
.id = KVM_REG_ARM | KVM_REG_SIZE_U32
- | ENCODE_CP_REG(15, 0, 0, 0, 2, 0, 0),
- .addr = (uintptr_t)&id_isar0,
- },
- {
- .id = KVM_REG_ARM | KVM_REG_SIZE_U32
| KVM_REG_ARM_VFP | KVM_REG_ARM_VFP_MVFR1,
.addr = (uintptr_t)&mvfr1,
},
@@ -98,26 +93,14 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf)
     /* Now we've retrieved all the register information, we can
* set the feature bits based on the ID register fields.
* We can assume any KVM supporting CPU is at least a v7
- * with VFPv3, LPAE and the generic timers; this in turn implies
- * most of the other feature bits, but a few must be tested.
+ * with VFPv3, virtualization extensions, and the generic
+ * timers; this in turn implies most of the other feature
+ * bits, but a few must be tested.
*/
- set_feature(&features, ARM_FEATURE_V7);
+ set_feature(&features, ARM_FEATURE_V7VE);
set_feature(&features, ARM_FEATURE_VFP3);
- set_feature(&features, ARM_FEATURE_LPAE);
set_feature(&features, ARM_FEATURE_GENERIC_TIMER);
- switch (extract32(id_isar0, 24, 4)) {
- case 1:
- set_feature(&features, ARM_FEATURE_THUMB_DIV);
- break;
- case 2:
- set_feature(&features, ARM_FEATURE_ARM_DIV);
- set_feature(&features, ARM_FEATURE_THUMB_DIV);
- break;
- default:
- break;
- }
-
if (extract32(id_pfr0, 12, 4) == 1) {
set_feature(&features, ARM_FEATURE_THUMB2EE);
}
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 6f436f9096..e10b689454 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -27,6 +27,9 @@
%imm7_22_16 22:2 16:5
%imm8_16_10 16:5 10:3
%imm9_16_10 16:s6 10:3
+%size_23 23:2
+%dtype_23_13 23:2 13:2
+%index3_22_19 22:1 19:2
# A combination of tsz:imm3 -- extract esize.
%tszimm_esz 22:2 5:5 !function=tszimm_esz
@@ -45,6 +48,9 @@
# Unsigned 8-bit immediate, optionally shifted left by 8.
%sh8_i8u 5:9 !function=expand_imm_sh8u
+# Unsigned load of msz into esz=2, represented as a dtype.
+%msz_dtype 23:2 !function=msz_dtype
+
# Either a copy of rd (at bit 0), or a different source
# as propagated via the MOVPRFX instruction.
%reg_movprfx 0:5
@@ -71,6 +77,14 @@
&incdec2_cnt rd rn pat esz imm d u
&incdec_pred rd pg esz d u
&incdec2_pred rd rn pg esz d u
+&rprr_load rd pg rn rm dtype nreg
+&rpri_load rd pg rn imm dtype nreg
+&rprr_store rd pg rn rm msz esz nreg
+&rpri_store rd pg rn imm msz esz nreg
+&rprr_gather_load rd pg rn rm esz msz u ff xs scale
+&rpri_gather_load rd pg rn imm esz msz u ff
+&rprr_scatter_store rd pg rn rm esz msz xs scale
+&rpri_scatter_store rd pg rn imm esz msz
###########################################################################
# Named instruction formats. These are generally used to
@@ -120,10 +134,16 @@
&rprrr_esz ra=%reg_movprfx
@rdn_pg_ra_rm ........ esz:2 . rm:5 ... pg:3 ra:5 rd:5 \
&rprrr_esz rn=%reg_movprfx
+@rdn_pg_rm_ra ........ esz:2 . ra:5 ... pg:3 rm:5 rd:5 \
+ &rprrr_esz rn=%reg_movprfx
# One register operand, with governing predicate, vector element size
@rd_pg_rn ........ esz:2 ... ... ... pg:3 rn:5 rd:5 &rpr_esz
@rd_pg4_pn ........ esz:2 ... ... .. pg:4 . rn:4 rd:5 &rpr_esz
+@pd_pg_rn ........ esz:2 ... ... ... pg:3 rn:5 . rd:4 &rpr_esz
+
+# One register operand, with governing predicate, no vector element size
+@rd_pg_rn_e0 ........ .. ... ... ... pg:3 rn:5 rd:5 &rpr_esz esz=0
# Two register operands with a 6-bit signed immediate.
@rd_rn_i6 ........ ... rn:5 ..... imm:s6 rd:5 &rri
@@ -142,6 +162,10 @@
@rdn_pg4 ........ esz:2 .. pg:4 ... ........ rd:5 \
&rpri_esz rn=%reg_movprfx
+# Two register operand, one one-bit floating-point operand.
+@rdn_i1 ........ esz:2 ......... pg:3 .... imm:1 rd:5 \
+ &rpri_esz rn=%reg_movprfx
+
# Two register operand, one encoded bitmask.
@rdn_dbm ........ .. .... dbm:13 rd:5 \
&rr_dbm rn=%reg_movprfx
@@ -170,6 +194,41 @@
@incdec2_pred ........ esz:2 .... .. ..... .. pg:4 rd:5 \
&incdec2_pred rn=%reg_movprfx
+# Loads; user must fill in NREG.
+@rprr_load_dt ....... dtype:4 rm:5 ... pg:3 rn:5 rd:5 &rprr_load
+@rpri_load_dt ....... dtype:4 . imm:s4 ... pg:3 rn:5 rd:5 &rpri_load
+
+@rprr_load_msz ....... .... rm:5 ... pg:3 rn:5 rd:5 \
+ &rprr_load dtype=%msz_dtype
+@rpri_load_msz ....... .... . imm:s4 ... pg:3 rn:5 rd:5 \
+ &rpri_load dtype=%msz_dtype
+
+# Gather Loads.
+@rprr_g_load_u ....... .. . . rm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \
+ &rprr_gather_load xs=2
+@rprr_g_load_xs_u ....... .. xs:1 . rm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \
+ &rprr_gather_load
+@rprr_g_load_xs_u_sc ....... .. xs:1 scale:1 rm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \
+ &rprr_gather_load
+@rprr_g_load_xs_sc ....... .. xs:1 scale:1 rm:5 . . ff:1 pg:3 rn:5 rd:5 \
+ &rprr_gather_load
+@rprr_g_load_u_sc ....... .. . scale:1 rm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \
+ &rprr_gather_load xs=2
+@rprr_g_load_sc ....... .. . scale:1 rm:5 . . ff:1 pg:3 rn:5 rd:5 \
+ &rprr_gather_load xs=2
+@rpri_g_load ....... msz:2 .. imm:5 . u:1 ff:1 pg:3 rn:5 rd:5 \
+ &rpri_gather_load
+
+# Stores; user must fill in ESZ, MSZ, NREG as needed.
+@rprr_store ....... .. .. rm:5 ... pg:3 rn:5 rd:5 &rprr_store
+@rpri_store_msz ....... msz:2 .. . imm:s4 ... pg:3 rn:5 rd:5 &rpri_store
+@rprr_store_esz_n0 ....... .. esz:2 rm:5 ... pg:3 rn:5 rd:5 \
+ &rprr_store nreg=0
+@rprr_scatter_store ....... msz:2 .. rm:5 ... pg:3 rn:5 rd:5 \
+ &rprr_scatter_store
+@rpri_scatter_store ....... msz:2 .. imm:5 ... pg:3 rn:5 rd:5 \
+ &rpri_scatter_store
+
###########################################################################
# Instruction patterns. Grouped according to the SVE encodingindex.xhtml.
@@ -211,6 +270,10 @@ ORV 00000100 .. 011 000 001 ... ..... ..... @rd_pg_rn
EORV 00000100 .. 011 001 001 ... ..... ..... @rd_pg_rn
ANDV 00000100 .. 011 010 001 ... ..... ..... @rd_pg_rn
+# SVE constructive prefix (predicated)
+MOVPRFX_z 00000100 .. 010 000 001 ... ..... ..... @rd_pg_rn
+MOVPRFX_m 00000100 .. 010 001 001 ... ..... ..... @rd_pg_rn
+
# SVE integer add reduction (predicated)
# Note that saddv requires size != 3.
UADDV 00000100 .. 000 001 001 ... ..... ..... @rd_pg_rn
@@ -271,6 +334,17 @@ UXTH 00000100 .. 010 011 101 ... ..... ..... @rd_pg_rn
SXTW 00000100 .. 010 100 101 ... ..... ..... @rd_pg_rn
UXTW 00000100 .. 010 101 101 ... ..... ..... @rd_pg_rn
+### SVE Floating Point Compare - Vectors Group
+
+# SVE floating-point compare vectors
+FCMGE_ppzz 01100101 .. 0 ..... 010 ... ..... 0 .... @pd_pg_rn_rm
+FCMGT_ppzz 01100101 .. 0 ..... 010 ... ..... 1 .... @pd_pg_rn_rm
+FCMEQ_ppzz 01100101 .. 0 ..... 011 ... ..... 0 .... @pd_pg_rn_rm
+FCMNE_ppzz 01100101 .. 0 ..... 011 ... ..... 1 .... @pd_pg_rn_rm
+FCMUO_ppzz 01100101 .. 0 ..... 110 ... ..... 0 .... @pd_pg_rn_rm
+FACGE_ppzz 01100101 .. 0 ..... 110 ... ..... 1 .... @pd_pg_rn_rm
+FACGT_ppzz 01100101 .. 0 ..... 111 ... ..... 1 .... @pd_pg_rn_rm
+
### SVE Integer Multiply-Add Group
# SVE integer multiply-add writing addend (predicated)
@@ -348,6 +422,9 @@ ADR_p64 00000100 11 1 ..... 1010 .. ..... ..... @rd_rn_msz_rm
### SVE Integer Misc - Unpredicated Group
+# SVE constructive prefix (unpredicated)
+MOVPRFX 00000100 00 1 00000 101111 rn:5 rd:5
+
# SVE floating-point exponential accelerator
# Note esz != 0
FEXPA 00000100 .. 1 00000 101110 ..... ..... @rd_rn
@@ -648,6 +725,74 @@ UMIN_zzi 00100101 .. 101 011 110 ........ ..... @rdn_i8u
# SVE integer multiply immediate (unpredicated)
MUL_zzi 00100101 .. 110 000 110 ........ ..... @rdn_i8s
+# SVE integer dot product (unpredicated)
+DOT_zzz 01000100 1 sz:1 0 rm:5 00000 u:1 rn:5 rd:5 ra=%reg_movprfx
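+# (u selects UDOT vs SDOT; sz=0 accumulates 4 x .B products into .S,
+#  sz=1 accumulates 4 x .H products into .D.)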
+
+# SVE integer dot product (indexed)
+DOT_zzx 01000100 101 index:2 rm:3 00000 u:1 rn:5 rd:5 \
+ sz=0 ra=%reg_movprfx
+DOT_zzx 01000100 111 index:1 rm:4 00000 u:1 rn:5 rd:5 \
+ sz=1 ra=%reg_movprfx
+
+# SVE floating-point complex add (predicated)
+FCADD 01100100 esz:2 00000 rot:1 100 pg:3 rm:5 rd:5 \
+ rn=%reg_movprfx
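+# (rot selects the #90 (rot=0) or #270 (rot=1) rotation.)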
+
+# SVE floating-point complex multiply-add (predicated)
+FCMLA_zpzzz 01100100 esz:2 0 rm:5 0 rot:2 pg:3 rn:5 rd:5 \
+ ra=%reg_movprfx
+
+# SVE floating-point complex multiply-add (indexed)
+FCMLA_zzxz 01100100 10 1 index:2 rm:3 0001 rot:2 rn:5 rd:5 \
+ ra=%reg_movprfx esz=1
+FCMLA_zzxz 01100100 11 1 index:1 rm:4 0001 rot:2 rn:5 rd:5 \
+ ra=%reg_movprfx esz=2
+
+### SVE FP Multiply-Add Indexed Group
+
+# SVE floating-point multiply-add (indexed)
+FMLA_zzxz 01100100 0.1 .. rm:3 00000 sub:1 rn:5 rd:5 \
+ ra=%reg_movprfx index=%index3_22_19 esz=1
+FMLA_zzxz 01100100 101 index:2 rm:3 00000 sub:1 rn:5 rd:5 \
+ ra=%reg_movprfx esz=2
+FMLA_zzxz 01100100 111 index:1 rm:4 00000 sub:1 rn:5 rd:5 \
+ ra=%reg_movprfx esz=3
+
+### SVE FP Multiply Indexed Group
+
+# SVE floating-point multiply (indexed)
+FMUL_zzx 01100100 0.1 .. rm:3 001000 rn:5 rd:5 \
+ index=%index3_22_19 esz=1
+FMUL_zzx 01100100 101 index:2 rm:3 001000 rn:5 rd:5 esz=2
+FMUL_zzx 01100100 111 index:1 rm:4 001000 rn:5 rd:5 esz=3
+
+### SVE FP Fast Reduction Group
+
+FADDV 01100101 .. 000 000 001 ... ..... ..... @rd_pg_rn
+FMAXNMV 01100101 .. 000 100 001 ... ..... ..... @rd_pg_rn
+FMINNMV 01100101 .. 000 101 001 ... ..... ..... @rd_pg_rn
+FMAXV 01100101 .. 000 110 001 ... ..... ..... @rd_pg_rn
+FMINV 01100101 .. 000 111 001 ... ..... ..... @rd_pg_rn
+
+## SVE Floating Point Unary Operations - Unpredicated Group
+
+FRECPE 01100101 .. 001 110 001100 ..... ..... @rd_rn
+FRSQRTE 01100101 .. 001 111 001100 ..... ..... @rd_rn
+
+### SVE FP Compare with Zero Group
+
+FCMGE_ppz0 01100101 .. 0100 00 001 ... ..... 0 .... @pd_pg_rn
+FCMGT_ppz0 01100101 .. 0100 00 001 ... ..... 1 .... @pd_pg_rn
+FCMLT_ppz0 01100101 .. 0100 01 001 ... ..... 0 .... @pd_pg_rn
+FCMLE_ppz0 01100101 .. 0100 01 001 ... ..... 1 .... @pd_pg_rn
+FCMEQ_ppz0 01100101 .. 0100 10 001 ... ..... 0 .... @pd_pg_rn
+FCMNE_ppz0 01100101 .. 0100 11 001 ... ..... 0 .... @pd_pg_rn
+
+### SVE FP Accumulating Reduction Group
+
+# SVE floating-point serial reduction (predicated)
+FADDA 01100101 .. 011 000 001 ... ..... ..... @rdn_pg_rm
+
### SVE Floating Point Arithmetic - Unpredicated Group
# SVE floating-point arithmetic (unpredicated)
@@ -658,6 +803,108 @@ FTSMUL 01100101 .. 0 ..... 000 011 ..... ..... @rd_rn_rm
FRECPS 01100101 .. 0 ..... 000 110 ..... ..... @rd_rn_rm
FRSQRTS 01100101 .. 0 ..... 000 111 ..... ..... @rd_rn_rm
+### SVE FP Arithmetic Predicated Group
+
+# SVE floating-point arithmetic (predicated)
+FADD_zpzz 01100101 .. 00 0000 100 ... ..... ..... @rdn_pg_rm
+FSUB_zpzz 01100101 .. 00 0001 100 ... ..... ..... @rdn_pg_rm
+FMUL_zpzz 01100101 .. 00 0010 100 ... ..... ..... @rdn_pg_rm
+FSUB_zpzz 01100101 .. 00 0011 100 ... ..... ..... @rdm_pg_rn # FSUBR
+FMAXNM_zpzz 01100101 .. 00 0100 100 ... ..... ..... @rdn_pg_rm
+FMINNM_zpzz 01100101 .. 00 0101 100 ... ..... ..... @rdn_pg_rm
+FMAX_zpzz 01100101 .. 00 0110 100 ... ..... ..... @rdn_pg_rm
+FMIN_zpzz 01100101 .. 00 0111 100 ... ..... ..... @rdn_pg_rm
+FABD 01100101 .. 00 1000 100 ... ..... ..... @rdn_pg_rm
+FSCALE 01100101 .. 00 1001 100 ... ..... ..... @rdn_pg_rm
+FMULX 01100101 .. 00 1010 100 ... ..... ..... @rdn_pg_rm
+FDIV 01100101 .. 00 1100 100 ... ..... ..... @rdm_pg_rn # FDIVR
+FDIV 01100101 .. 00 1101 100 ... ..... ..... @rdn_pg_rm
+
+# SVE floating-point arithmetic with immediate (predicated)
+FADD_zpzi 01100101 .. 011 000 100 ... 0000 . ..... @rdn_i1
+FSUB_zpzi 01100101 .. 011 001 100 ... 0000 . ..... @rdn_i1
+FMUL_zpzi 01100101 .. 011 010 100 ... 0000 . ..... @rdn_i1
+FSUBR_zpzi 01100101 .. 011 011 100 ... 0000 . ..... @rdn_i1
+FMAXNM_zpzi 01100101 .. 011 100 100 ... 0000 . ..... @rdn_i1
+FMINNM_zpzi 01100101 .. 011 101 100 ... 0000 . ..... @rdn_i1
+FMAX_zpzi 01100101 .. 011 110 100 ... 0000 . ..... @rdn_i1
+FMIN_zpzi 01100101 .. 011 111 100 ... 0000 . ..... @rdn_i1
+
+# SVE floating-point trig multiply-add coefficient
+FTMAD 01100101 esz:2 010 imm:3 100000 rm:5 rd:5 rn=%reg_movprfx
+
+### SVE FP Multiply-Add Group
+
+# SVE floating-point multiply-accumulate writing addend
+FMLA_zpzzz 01100101 .. 1 ..... 000 ... ..... ..... @rda_pg_rn_rm
+FMLS_zpzzz 01100101 .. 1 ..... 001 ... ..... ..... @rda_pg_rn_rm
+FNMLA_zpzzz 01100101 .. 1 ..... 010 ... ..... ..... @rda_pg_rn_rm
+FNMLS_zpzzz 01100101 .. 1 ..... 011 ... ..... ..... @rda_pg_rn_rm
+
+# SVE floating-point multiply-accumulate writing multiplicand
+# Alter the operand extraction order and reuse the helpers from above.
+# FMAD, FMSB, FNMAD, FNMSB
+FMLA_zpzzz 01100101 .. 1 ..... 100 ... ..... ..... @rdn_pg_rm_ra
+FMLS_zpzzz 01100101 .. 1 ..... 101 ... ..... ..... @rdn_pg_rm_ra
+FNMLA_zpzzz 01100101 .. 1 ..... 110 ... ..... ..... @rdn_pg_rm_ra
+FNMLS_zpzzz 01100101 .. 1 ..... 111 ... ..... ..... @rdn_pg_rm_ra
+
+### SVE FP Unary Operations Predicated Group
+
+# SVE floating-point convert precision
+FCVT_sh 01100101 10 0010 00 101 ... ..... ..... @rd_pg_rn_e0
+FCVT_hs 01100101 10 0010 01 101 ... ..... ..... @rd_pg_rn_e0
+FCVT_dh 01100101 11 0010 00 101 ... ..... ..... @rd_pg_rn_e0
+FCVT_hd 01100101 11 0010 01 101 ... ..... ..... @rd_pg_rn_e0
+FCVT_ds 01100101 11 0010 10 101 ... ..... ..... @rd_pg_rn_e0
+FCVT_sd 01100101 11 0010 11 101 ... ..... ..... @rd_pg_rn_e0
+
+# SVE floating-point convert to integer
+FCVTZS_hh 01100101 01 011 01 0 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZU_hh 01100101 01 011 01 1 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZS_hs 01100101 01 011 10 0 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZU_hs 01100101 01 011 10 1 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZS_hd 01100101 01 011 11 0 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZU_hd 01100101 01 011 11 1 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZS_ss 01100101 10 011 10 0 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZU_ss 01100101 10 011 10 1 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZS_ds 01100101 11 011 00 0 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZU_ds 01100101 11 011 00 1 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZS_sd 01100101 11 011 10 0 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZU_sd 01100101 11 011 10 1 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZS_dd 01100101 11 011 11 0 101 ... ..... ..... @rd_pg_rn_e0
+FCVTZU_dd 01100101 11 011 11 1 101 ... ..... ..... @rd_pg_rn_e0
+
+# SVE floating-point round to integral value
+FRINTN 01100101 .. 000 000 101 ... ..... ..... @rd_pg_rn
+FRINTP 01100101 .. 000 001 101 ... ..... ..... @rd_pg_rn
+FRINTM 01100101 .. 000 010 101 ... ..... ..... @rd_pg_rn
+FRINTZ 01100101 .. 000 011 101 ... ..... ..... @rd_pg_rn
+FRINTA 01100101 .. 000 100 101 ... ..... ..... @rd_pg_rn
+FRINTX 01100101 .. 000 110 101 ... ..... ..... @rd_pg_rn
+FRINTI 01100101 .. 000 111 101 ... ..... ..... @rd_pg_rn
+
+# SVE floating-point unary operations
+FRECPX 01100101 .. 001 100 101 ... ..... ..... @rd_pg_rn
+FSQRT 01100101 .. 001 101 101 ... ..... ..... @rd_pg_rn
+
+# SVE integer convert to floating-point
+SCVTF_hh 01100101 01 010 01 0 101 ... ..... ..... @rd_pg_rn_e0
+SCVTF_sh 01100101 01 010 10 0 101 ... ..... ..... @rd_pg_rn_e0
+SCVTF_dh 01100101 01 010 11 0 101 ... ..... ..... @rd_pg_rn_e0
+SCVTF_ss 01100101 10 010 10 0 101 ... ..... ..... @rd_pg_rn_e0
+SCVTF_sd 01100101 11 010 00 0 101 ... ..... ..... @rd_pg_rn_e0
+SCVTF_ds 01100101 11 010 10 0 101 ... ..... ..... @rd_pg_rn_e0
+SCVTF_dd 01100101 11 010 11 0 101 ... ..... ..... @rd_pg_rn_e0
+
+UCVTF_hh 01100101 01 010 01 1 101 ... ..... ..... @rd_pg_rn_e0
+UCVTF_sh 01100101 01 010 10 1 101 ... ..... ..... @rd_pg_rn_e0
+UCVTF_dh 01100101 01 010 11 1 101 ... ..... ..... @rd_pg_rn_e0
+UCVTF_ss 01100101 10 010 10 1 101 ... ..... ..... @rd_pg_rn_e0
+UCVTF_sd 01100101 11 010 00 1 101 ... ..... ..... @rd_pg_rn_e0
+UCVTF_ds 01100101 11 010 10 1 101 ... ..... ..... @rd_pg_rn_e0
+UCVTF_dd 01100101 11 010 11 1 101 ... ..... ..... @rd_pg_rn_e0
+
### SVE Memory - 32-bit Gather and Unsized Contiguous Group
# SVE load predicate register
@@ -665,3 +912,183 @@ LDR_pri 10000101 10 ...... 000 ... ..... 0 .... @pd_rn_i9
# SVE load vector register
LDR_zri 10000101 10 ...... 010 ... ..... ..... @rd_rn_i9
+
+# SVE load and broadcast element
+LD1R_zpri 1000010 .. 1 imm:6 1.. pg:3 rn:5 rd:5 \
+ &rpri_load dtype=%dtype_23_13 nreg=0
+
+# SVE 32-bit gather load (scalar plus 32-bit unscaled offsets)
+# SVE 32-bit gather load (scalar plus 32-bit scaled offsets)
+LD1_zprz 1000010 00 .0 ..... 0.. ... ..... ..... \
+ @rprr_g_load_xs_u esz=2 msz=0 scale=0
+LD1_zprz 1000010 01 .. ..... 0.. ... ..... ..... \
+ @rprr_g_load_xs_u_sc esz=2 msz=1
+LD1_zprz 1000010 10 .. ..... 01. ... ..... ..... \
+ @rprr_g_load_xs_sc esz=2 msz=2 u=1
+
+# SVE 32-bit gather load (vector plus immediate)
+LD1_zpiz 1000010 .. 01 ..... 1.. ... ..... ..... \
+ @rpri_g_load esz=2
+
+### SVE Memory Contiguous Load Group
+
+# SVE contiguous load (scalar plus scalar)
+LD_zprr 1010010 .... ..... 010 ... ..... ..... @rprr_load_dt nreg=0
+
+# SVE contiguous first-fault load (scalar plus scalar)
+LDFF1_zprr 1010010 .... ..... 011 ... ..... ..... @rprr_load_dt nreg=0
+
+# SVE contiguous load (scalar plus immediate)
+LD_zpri 1010010 .... 0.... 101 ... ..... ..... @rpri_load_dt nreg=0
+
+# SVE contiguous non-fault load (scalar plus immediate)
+LDNF1_zpri 1010010 .... 1.... 101 ... ..... ..... @rpri_load_dt nreg=0
+
+# SVE contiguous non-temporal load (scalar plus scalar)
+# LDNT1B, LDNT1H, LDNT1W, LDNT1D
+# SVE load multiple structures (scalar plus scalar)
+# LD2B, LD2H, LD2W, LD2D; etc.
+LD_zprr 1010010 .. nreg:2 ..... 110 ... ..... ..... @rprr_load_msz
+
+# SVE contiguous non-temporal load (scalar plus immediate)
+# LDNT1B, LDNT1H, LDNT1W, LDNT1D
+# SVE load multiple structures (scalar plus immediate)
+# LD2B, LD2H, LD2W, LD2D; etc.
+LD_zpri 1010010 .. nreg:2 0.... 111 ... ..... ..... @rpri_load_msz
+
+# SVE load and broadcast quadword (scalar plus scalar)
+LD1RQ_zprr 1010010 .. 00 ..... 000 ... ..... ..... \
+ @rprr_load_msz nreg=0
+
+# SVE load and broadcast quadword (scalar plus immediate)
+# LD1RQB, LD1RQH, LD1RQS, LD1RQD
+LD1RQ_zpri 1010010 .. 00 0.... 001 ... ..... ..... \
+ @rpri_load_msz nreg=0
+
+# SVE 32-bit gather prefetch (scalar plus 32-bit scaled offsets)
+PRF 1000010 00 -1 ----- 0-- --- ----- 0 ----
+
+# SVE 32-bit gather prefetch (vector plus immediate)
+PRF 1000010 -- 00 ----- 111 --- ----- 0 ----
+
+# SVE contiguous prefetch (scalar plus immediate)
+PRF 1000010 11 1- ----- 0-- --- ----- 0 ----
+
+# SVE contiguous prefetch (scalar plus scalar)
+PRF_rr 1000010 -- 00 rm:5 110 --- ----- 0 ----
+
+### SVE Memory 64-bit Gather Group
+
+# SVE 64-bit gather load (scalar plus 32-bit unpacked unscaled offsets)
+# SVE 64-bit gather load (scalar plus 32-bit unpacked scaled offsets)
+LD1_zprz 1100010 00 .0 ..... 0.. ... ..... ..... \
+ @rprr_g_load_xs_u esz=3 msz=0 scale=0
+LD1_zprz 1100010 01 .. ..... 0.. ... ..... ..... \
+ @rprr_g_load_xs_u_sc esz=3 msz=1
+LD1_zprz 1100010 10 .. ..... 0.. ... ..... ..... \
+ @rprr_g_load_xs_u_sc esz=3 msz=2
+LD1_zprz 1100010 11 .. ..... 01. ... ..... ..... \
+ @rprr_g_load_xs_sc esz=3 msz=3 u=1
+
+# SVE 64-bit gather load (scalar plus 64-bit unscaled offsets)
+# SVE 64-bit gather load (scalar plus 64-bit scaled offsets)
+LD1_zprz 1100010 00 10 ..... 1.. ... ..... ..... \
+ @rprr_g_load_u esz=3 msz=0 scale=0
+LD1_zprz 1100010 01 1. ..... 1.. ... ..... ..... \
+ @rprr_g_load_u_sc esz=3 msz=1
+LD1_zprz 1100010 10 1. ..... 1.. ... ..... ..... \
+ @rprr_g_load_u_sc esz=3 msz=2
+LD1_zprz 1100010 11 1. ..... 11. ... ..... ..... \
+ @rprr_g_load_sc esz=3 msz=3 u=1
+
+# SVE 64-bit gather load (vector plus immediate)
+LD1_zpiz 1100010 .. 01 ..... 1.. ... ..... ..... \
+ @rpri_g_load esz=3
+
+# SVE 64-bit gather prefetch (scalar plus 64-bit scaled offsets)
+PRF 1100010 00 11 ----- 1-- --- ----- 0 ----
+
+# SVE 64-bit gather prefetch (scalar plus unpacked 32-bit scaled offsets)
+PRF 1100010 00 -1 ----- 0-- --- ----- 0 ----
+
+# SVE 64-bit gather prefetch (vector plus immediate)
+PRF 1100010 -- 00 ----- 111 --- ----- 0 ----
+
+### SVE Memory Store Group
+
+# SVE store predicate register
+STR_pri 1110010 11 0. ..... 000 ... ..... 0 .... @pd_rn_i9
+
+# SVE store vector register
+STR_zri 1110010 11 0. ..... 010 ... ..... ..... @rd_rn_i9
+
+# SVE contiguous store (scalar plus immediate)
+# ST1B, ST1H, ST1W, ST1D; require msz <= esz
+ST_zpri 1110010 .. esz:2 0.... 111 ... ..... ..... \
+ @rpri_store_msz nreg=0
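+# (Stores truncate, hence msz <= esz: e.g. ST1B from .S elements writes
+#  only the low byte of each active 32-bit element.)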
+
+# SVE contiguous store (scalar plus scalar)
+# ST1B, ST1H, ST1W, ST1D; require msz <= esz
+# Enumerate msz lest we conflict with STR_zri.
+ST_zprr 1110010 00 .. ..... 010 ... ..... ..... \
+ @rprr_store_esz_n0 msz=0
+ST_zprr 1110010 01 .. ..... 010 ... ..... ..... \
+ @rprr_store_esz_n0 msz=1
+ST_zprr 1110010 10 .. ..... 010 ... ..... ..... \
+ @rprr_store_esz_n0 msz=2
+ST_zprr 1110010 11 11 ..... 010 ... ..... ..... \
+ @rprr_store msz=3 esz=3 nreg=0
+
+# SVE contiguous non-temporal store (scalar plus immediate) (nreg == 0)
+# SVE store multiple structures (scalar plus immediate) (nreg != 0)
+ST_zpri 1110010 .. nreg:2 1.... 111 ... ..... ..... \
+ @rpri_store_msz esz=%size_23
+
+# SVE contiguous non-temporal store (scalar plus scalar) (nreg == 0)
+# SVE store multiple structures (scalar plus scalar) (nreg != 0)
+ST_zprr 1110010 msz:2 nreg:2 ..... 011 ... ..... ..... \
+ @rprr_store esz=%size_23
+
+# SVE 32-bit scatter store (scalar plus 32-bit scaled offsets)
+# Require msz > 0 && msz <= esz.
+ST1_zprz 1110010 .. 11 ..... 100 ... ..... ..... \
+ @rprr_scatter_store xs=0 esz=2 scale=1
+ST1_zprz 1110010 .. 11 ..... 110 ... ..... ..... \
+ @rprr_scatter_store xs=1 esz=2 scale=1
+
+# SVE 32-bit scatter store (scalar plus 32-bit unscaled offsets)
+# Require msz <= esz.
+ST1_zprz 1110010 .. 10 ..... 100 ... ..... ..... \
+ @rprr_scatter_store xs=0 esz=2 scale=0
+ST1_zprz 1110010 .. 10 ..... 110 ... ..... ..... \
+ @rprr_scatter_store xs=1 esz=2 scale=0
+
+# SVE 64-bit scatter store (scalar plus 64-bit scaled offset)
+# Require msz > 0
+ST1_zprz 1110010 .. 01 ..... 101 ... ..... ..... \
+ @rprr_scatter_store xs=2 esz=3 scale=1
+
+# SVE 64-bit scatter store (scalar plus 64-bit unscaled offset)
+ST1_zprz 1110010 .. 00 ..... 101 ... ..... ..... \
+ @rprr_scatter_store xs=2 esz=3 scale=0
+
+# SVE 64-bit scatter store (vector plus immediate)
+ST1_zpiz 1110010 .. 10 ..... 101 ... ..... ..... \
+ @rpri_scatter_store esz=3
+
+# SVE 32-bit scatter store (vector plus immediate)
+ST1_zpiz 1110010 .. 11 ..... 101 ... ..... ..... \
+ @rpri_scatter_store esz=2
+
+# SVE 64-bit scatter store (scalar plus unpacked 32-bit scaled offset)
+# Require msz > 0
+ST1_zprz 1110010 .. 01 ..... 100 ... ..... ..... \
+ @rprr_scatter_store xs=0 esz=3 scale=1
+ST1_zprz 1110010 .. 01 ..... 110 ... ..... ..... \
+ @rprr_scatter_store xs=1 esz=3 scale=1
+
+# SVE 64-bit scatter store (scalar plus unpacked 32-bit unscaled offset)
+ST1_zprz 1110010 .. 00 ..... 100 ... ..... ..... \
+ @rprr_scatter_store xs=0 esz=3 scale=0
+ST1_zprz 1110010 .. 00 ..... 110 ... ..... ..... \
+ @rprr_scatter_store xs=1 esz=3 scale=0
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index 128bbf9b04..a03ca77354 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -369,7 +369,17 @@ void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M) (N * M)
-#define DO_DIV(N, M) (M ? N / M : 0)
+
+/*
+ * We must avoid the C undefined behaviour cases: division by
+ * zero and signed division of INT_MIN by -1. Both of these
+ * have architecturally defined required results for Arm.
+ * We special case all signed divisions by -1 to avoid having
+ * to deduce the minimum integer for the type involved.
+ */
+#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
+#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
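+
+/*
+ * Example (editorial): on x86 the expression INT32_MIN / -1 raises
+ * SIGFPE at run time, while the Arm SDIV instruction is defined to
+ * return INT32_MIN.  DO_SDIV(INT32_MIN, -1) instead takes the -N
+ * branch, and since QEMU is built with -fwrapv the negation wraps
+ * back to INT32_MIN, the architecturally required result.
+ */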
DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
@@ -477,11 +487,11 @@ DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
-DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_DIV)
-DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_DIV)
+DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
+DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
-DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_DIV)
-DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_DIV)
+DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
+DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
/* Note that all bits of the shift are significant
and not modulo the element size. */
@@ -995,6 +1005,47 @@ void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
}
}
+/* Copy Zn into Zd, and store zero into inactive elements. */
+void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn;
+ uint8_t *pg = vg;
+ for (i = 0; i < opr_sz; i += 1) {
+ d[i] = n[i] & expand_pred_b(pg[H1(i)]);
+ }
+}
+
+void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn;
+ uint8_t *pg = vg;
+ for (i = 0; i < opr_sz; i += 1) {
+ d[i] = n[i] & expand_pred_h(pg[H1(i)]);
+ }
+}
+
+void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn;
+ uint8_t *pg = vg;
+ for (i = 0; i < opr_sz; i += 1) {
+ d[i] = n[i] & expand_pred_s(pg[H1(i)]);
+ }
+}
+
+void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn;
+ uint8_t *pg = vg;
+ for (i = 0; i < opr_sz; i += 1) {
+        d[i] = n[i] & -(uint64_t)(pg[H1(i)] & 1);
+ }
+}
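+
+/* (Editorial note: expand_pred_{b,h,s} above widen each predicate bit
+ * into a full element mask, so active lanes copy n[i] and inactive
+ * lanes are zeroed; the _d case builds the mask from the low bit.) */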
+
/* Three-operand expander, immediate operand, controlled by a predicate.
*/
#define DO_ZPZI(NAME, TYPE, H, OP) \
@@ -2810,3 +2861,1817 @@ uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
return predtest_ones(d, oprsz, esz_mask);
}
+
+/* Recursive reduction on a function;
+ * Cf. the ARM ARM function ReducePredicated.
+ *
+ * While it would be possible to write this without the DATA temporary,
+ * it is much simpler to process the predicate register this way.
+ * The recursion is bounded to depth 7 (128 fp16 elements), so there's
+ * little to gain with a more complex non-recursive form.
+ */
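+/* Worked example (editorial): for a 256-bit vector of float32
+ * (8 elements) and FUNC=add, the tree computed below is
+ *     ((e0+e1) + (e2+e3)) + ((e4+e5) + (e6+e7)),
+ * after inactive and trailing lanes have been replaced by IDENT.
+ */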
+#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
+static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
+{ \
+ if (n == 1) { \
+ return *data; \
+ } else { \
+ uintptr_t half = n / 2; \
+ TYPE lo = NAME##_reduce(data, status, half); \
+ TYPE hi = NAME##_reduce(data + half, status, half); \
+ return TYPE##_##FUNC(lo, hi, status); \
+ } \
+} \
+uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
+{ \
+ uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc); \
+ TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
+ for (i = 0; i < oprsz; ) { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ TYPE nn = *(TYPE *)(vn + H(i)); \
+ *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
+ i += sizeof(TYPE), pg >>= sizeof(TYPE); \
+ } while (i & 15); \
+ } \
+ for (; i < maxsz; i += sizeof(TYPE)) { \
+ *(TYPE *)((void *)data + i) = IDENT; \
+ } \
+ return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
+}
+
+DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
+DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
+DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
+
+/* Identity is floatN_default_nan, without the function call. */
+DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
+DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
+DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
+
+DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
+DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
+DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
+
+DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
+DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
+DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
+
+DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
+DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
+DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
+
+#undef DO_REDUCE
+
+uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
+ void *status, uint32_t desc)
+{
+ intptr_t i = 0, opr_sz = simd_oprsz(desc);
+ float16 result = nn;
+
+ do {
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
+ do {
+ if (pg & 1) {
+ float16 mm = *(float16 *)(vm + H1_2(i));
+ result = float16_add(result, mm, status);
+ }
+ i += sizeof(float16), pg >>= sizeof(float16);
+ } while (i & 15);
+ } while (i < opr_sz);
+
+ return result;
+}
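+
+/* (Editorial note: unlike the FADDV tree reduction above, FADDA is a
+ * strictly ordered left-to-right accumulation, so it is written as a
+ * simple element-by-element loop rather than via DO_REDUCE.) */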
+
+uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
+ void *status, uint32_t desc)
+{
+ intptr_t i = 0, opr_sz = simd_oprsz(desc);
+ float32 result = nn;
+
+ do {
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
+ do {
+ if (pg & 1) {
+ float32 mm = *(float32 *)(vm + H1_2(i));
+ result = float32_add(result, mm, status);
+ }
+ i += sizeof(float32), pg >>= sizeof(float32);
+ } while (i & 15);
+ } while (i < opr_sz);
+
+ return result;
+}
+
+uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
+ void *status, uint32_t desc)
+{
+ intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *m = vm;
+ uint8_t *pg = vg;
+
+ for (i = 0; i < opr_sz; i++) {
+ if (pg[H1(i)] & 1) {
+ nn = float64_add(nn, m[i], status);
+ }
+ }
+
+ return nn;
+}
+
+/* Fully general three-operand expander, controlled by a predicate,
+ * with the extra float_status parameter.
+ */
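+/* (Editorial note: the expander below walks the vector from the top
+ * down, consuming one predicate bit per vector byte and one 64-bit
+ * predicate word per 64 bytes of vector.) */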
+#define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
+ void *status, uint32_t desc) \
+{ \
+ intptr_t i = simd_oprsz(desc); \
+ uint64_t *g = vg; \
+ do { \
+ uint64_t pg = g[(i - 1) >> 6]; \
+ do { \
+ i -= sizeof(TYPE); \
+ if (likely((pg >> (i & 63)) & 1)) { \
+ TYPE nn = *(TYPE *)(vn + H(i)); \
+ TYPE mm = *(TYPE *)(vm + H(i)); \
+ *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
+ } \
+ } while (i & 63); \
+ } while (i != 0); \
+}
+
+DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
+DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
+DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
+
+DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
+DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
+DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
+
+DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
+DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
+DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
+
+DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
+DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
+DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
+
+DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
+DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
+DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
+
+DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
+DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
+DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
+
+DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
+DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
+DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
+
+DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
+DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
+DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
+
+static inline float16 abd_h(float16 a, float16 b, float_status *s)
+{
+ return float16_abs(float16_sub(a, b, s));
+}
+
+static inline float32 abd_s(float32 a, float32 b, float_status *s)
+{
+ return float32_abs(float32_sub(a, b, s));
+}
+
+static inline float64 abd_d(float64 a, float64 b, float_status *s)
+{
+ return float64_abs(float64_sub(a, b, s));
+}
+
+DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
+DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
+DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
+
+static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
+{
+ int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
+ return float64_scalbn(a, b_int, s);
+}
+
+DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
+DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
+DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
+
+DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
+DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
+DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
+
+#undef DO_ZPZZ_FP
+
+/* Three-operand expander, with one scalar operand, controlled by
+ * a predicate, with the extra float_status parameter.
+ */
+#define DO_ZPZS_FP(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
+ void *status, uint32_t desc) \
+{ \
+ intptr_t i = simd_oprsz(desc); \
+ uint64_t *g = vg; \
+ TYPE mm = scalar; \
+ do { \
+ uint64_t pg = g[(i - 1) >> 6]; \
+ do { \
+ i -= sizeof(TYPE); \
+ if (likely((pg >> (i & 63)) & 1)) { \
+ TYPE nn = *(TYPE *)(vn + H(i)); \
+ *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
+ } \
+ } while (i & 63); \
+ } while (i != 0); \
+}
+
+DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
+DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
+DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)
+
+DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
+DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
+DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)
+
+DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
+DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
+DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
+
+static inline float16 subr_h(float16 a, float16 b, float_status *s)
+{
+ return float16_sub(b, a, s);
+}
+
+static inline float32 subr_s(float32 a, float32 b, float_status *s)
+{
+ return float32_sub(b, a, s);
+}
+
+static inline float64 subr_d(float64 a, float64 b, float_status *s)
+{
+ return float64_sub(b, a, s);
+}
+
+DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
+DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
+DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)
+
+DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
+DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
+DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)
+
+DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
+DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
+DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)
+
+DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
+DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
+DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)
+
+DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
+DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
+DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
+
+/* Fully general two-operand expander, controlled by a predicate,
+ * with the extra float_status parameter.
+ */
+#define DO_ZPZ_FP(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
+{ \
+ intptr_t i = simd_oprsz(desc); \
+ uint64_t *g = vg; \
+ do { \
+ uint64_t pg = g[(i - 1) >> 6]; \
+ do { \
+ i -= sizeof(TYPE); \
+ if (likely((pg >> (i & 63)) & 1)) { \
+ TYPE nn = *(TYPE *)(vn + H(i)); \
+ *(TYPE *)(vd + H(i)) = OP(nn, status); \
+ } \
+ } while (i & 63); \
+ } while (i != 0); \
+}
+
+/* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
+ * FZ16. When converting from fp16, this affects flushing input denormals;
+ * when converting to fp16, this affects flushing output denormals.
+ */
+static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
+{
+ flag save = get_flush_inputs_to_zero(fpst);
+ float32 ret;
+
+ set_flush_inputs_to_zero(false, fpst);
+ ret = float16_to_float32(f, true, fpst);
+ set_flush_inputs_to_zero(save, fpst);
+ return ret;
+}
+
+static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
+{
+ flag save = get_flush_inputs_to_zero(fpst);
+ float64 ret;
+
+ set_flush_inputs_to_zero(false, fpst);
+ ret = float16_to_float64(f, true, fpst);
+ set_flush_inputs_to_zero(save, fpst);
+ return ret;
+}
+
+static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
+{
+ flag save = get_flush_to_zero(fpst);
+ float16 ret;
+
+ set_flush_to_zero(false, fpst);
+ ret = float32_to_float16(f, true, fpst);
+ set_flush_to_zero(save, fpst);
+ return ret;
+}
+
+static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
+{
+ flag save = get_flush_to_zero(fpst);
+ float16 ret;
+
+ set_flush_to_zero(false, fpst);
+ ret = float64_to_float16(f, true, fpst);
+ set_flush_to_zero(save, fpst);
+ return ret;
+}
+
+static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
+{
+ if (float16_is_any_nan(f)) {
+ float_raise(float_flag_invalid, s);
+ return 0;
+ }
+ return float16_to_int16_round_to_zero(f, s);
+}
+
+static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
+{
+ if (float16_is_any_nan(f)) {
+ float_raise(float_flag_invalid, s);
+ return 0;
+ }
+ return float16_to_int64_round_to_zero(f, s);
+}
+
+static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
+{
+ if (float32_is_any_nan(f)) {
+ float_raise(float_flag_invalid, s);
+ return 0;
+ }
+ return float32_to_int64_round_to_zero(f, s);
+}
+
+static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
+{
+ if (float64_is_any_nan(f)) {
+ float_raise(float_flag_invalid, s);
+ return 0;
+ }
+ return float64_to_int64_round_to_zero(f, s);
+}
+
+static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
+{
+ if (float16_is_any_nan(f)) {
+ float_raise(float_flag_invalid, s);
+ return 0;
+ }
+ return float16_to_uint16_round_to_zero(f, s);
+}
+
+static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
+{
+ if (float16_is_any_nan(f)) {
+ float_raise(float_flag_invalid, s);
+ return 0;
+ }
+ return float16_to_uint64_round_to_zero(f, s);
+}
+
+static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
+{
+ if (float32_is_any_nan(f)) {
+ float_raise(float_flag_invalid, s);
+ return 0;
+ }
+ return float32_to_uint64_round_to_zero(f, s);
+}
+
+static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
+{
+ if (float64_is_any_nan(f)) {
+ float_raise(float_flag_invalid, s);
+ return 0;
+ }
+ return float64_to_uint64_round_to_zero(f, s);
+}
+
+DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
+DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
+DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
+DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
+DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
+DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)
+
+DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
+DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
+DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
+DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
+DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
+DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
+DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)
+
+DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
+DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
+DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
+DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
+DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
+DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
+DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)
+
+DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
+DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
+DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)
+
+DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
+DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
+DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)
+
+DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
+DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
+DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)
+
+DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
+DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
+DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)
+
+DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
+DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
+DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
+DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
+DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
+DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
+DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
+
+DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
+DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
+DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
+DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
+DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
+DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
+DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
+
+#undef DO_ZPZ_FP
+
+/* 4-operand predicated multiply-add.  Passing this "properly" would
+ * require 7 helper arguments, so we encode some of the register
+ * numbers into DESC instead.
+ */
+QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 20 > 32);
+
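+/* As an illustrative sketch (not the actual call site), the expected
+ * packing on the translator side would be:
+ *
+ *   desc = simd_desc(vsz, vsz,
+ *                    rd | (rn << 5) | (rm << 10) | (ra << 15));
+ *
+ * Five bits for each of the four register numbers gives the 20 bits
+ * checked above.
+ */
+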
+static void do_fmla_zpzzz_h(CPUARMState *env, void *vg, uint32_t desc,
+ uint16_t neg1, uint16_t neg3)
+{
+ intptr_t i = simd_oprsz(desc);
+ unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
+ unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
+ unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
+ unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
+ void *vd = &env->vfp.zregs[rd];
+ void *vn = &env->vfp.zregs[rn];
+ void *vm = &env->vfp.zregs[rm];
+ void *va = &env->vfp.zregs[ra];
+ uint64_t *g = vg;
+
+ do {
+ uint64_t pg = g[(i - 1) >> 6];
+ do {
+ i -= 2;
+ if (likely((pg >> (i & 63)) & 1)) {
+ float16 e1, e2, e3, r;
+
+ e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
+ e2 = *(uint16_t *)(vm + H1_2(i));
+ e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
+ r = float16_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
+ *(uint16_t *)(vd + H1_2(i)) = r;
+ }
+ } while (i & 63);
+ } while (i != 0);
+}
+
+void HELPER(sve_fmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
+{
+ do_fmla_zpzzz_h(env, vg, desc, 0, 0);
+}
+
+void HELPER(sve_fmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
+{
+ do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0);
+}
+
+void HELPER(sve_fnmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
+{
+ do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0x8000);
+}
+
+void HELPER(sve_fnmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
+{
+ do_fmla_zpzzz_h(env, vg, desc, 0, 0x8000);
+}
+
+static void do_fmla_zpzzz_s(CPUARMState *env, void *vg, uint32_t desc,
+ uint32_t neg1, uint32_t neg3)
+{
+ intptr_t i = simd_oprsz(desc);
+ unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
+ unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
+ unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
+ unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
+ void *vd = &env->vfp.zregs[rd];
+ void *vn = &env->vfp.zregs[rn];
+ void *vm = &env->vfp.zregs[rm];
+ void *va = &env->vfp.zregs[ra];
+ uint64_t *g = vg;
+
+ do {
+ uint64_t pg = g[(i - 1) >> 6];
+ do {
+ i -= 4;
+ if (likely((pg >> (i & 63)) & 1)) {
+ float32 e1, e2, e3, r;
+
+ e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
+ e2 = *(uint32_t *)(vm + H1_4(i));
+ e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
+ r = float32_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
+ *(uint32_t *)(vd + H1_4(i)) = r;
+ }
+ } while (i & 63);
+ } while (i != 0);
+}
+
+void HELPER(sve_fmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
+{
+ do_fmla_zpzzz_s(env, vg, desc, 0, 0);
+}
+
+void HELPER(sve_fmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
+{
+ do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0);
+}
+
+void HELPER(sve_fnmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
+{
+ do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0x80000000);
+}
+
+void HELPER(sve_fnmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
+{
+ do_fmla_zpzzz_s(env, vg, desc, 0, 0x80000000);
+}
+
+static void do_fmla_zpzzz_d(CPUARMState *env, void *vg, uint32_t desc,
+ uint64_t neg1, uint64_t neg3)
+{
+ intptr_t i = simd_oprsz(desc);
+ unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
+ unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
+ unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
+ unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
+ void *vd = &env->vfp.zregs[rd];
+ void *vn = &env->vfp.zregs[rn];
+ void *vm = &env->vfp.zregs[rm];
+ void *va = &env->vfp.zregs[ra];
+ uint64_t *g = vg;
+
+ do {
+ uint64_t pg = g[(i - 1) >> 6];
+ do {
+ i -= 8;
+ if (likely((pg >> (i & 63)) & 1)) {
+ float64 e1, e2, e3, r;
+
+ e1 = *(uint64_t *)(vn + i) ^ neg1;
+ e2 = *(uint64_t *)(vm + i);
+ e3 = *(uint64_t *)(va + i) ^ neg3;
+ r = float64_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
+ *(uint64_t *)(vd + i) = r;
+ }
+ } while (i & 63);
+ } while (i != 0);
+}
+
+void HELPER(sve_fmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
+{
+ do_fmla_zpzzz_d(env, vg, desc, 0, 0);
+}
+
+void HELPER(sve_fmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
+{
+ do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, 0);
+}
+
+void HELPER(sve_fnmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
+{
+ do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, INT64_MIN);
+}
+
+void HELPER(sve_fnmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
+{
+ do_fmla_zpzzz_d(env, vg, desc, 0, INT64_MIN);
+}
+
+/* Two-operand floating-point comparison controlled by a predicate.
+ * Unlike the integer version, we are not allowed to compare operands
+ * optimistically, since the comparison may have side effects on the
+ * FPSR.
+ */
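+/* A worked example of the output packing: with float32 elements each
+ * result occupies sizeof(float32) == 4 bits of the destination
+ * predicate, so decrementing I by 4 while shifting OUT left by 4
+ * leaves each result in the low bit of its 4-bit group, matching the
+ * SVE predicate layout (descriptive only; the code is below).
+ */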
+#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
+ void *status, uint32_t desc) \
+{ \
+ intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
+ uint64_t *d = vd, *g = vg; \
+ do { \
+ uint64_t out = 0, pg = g[j]; \
+ do { \
+ i -= sizeof(TYPE), out <<= sizeof(TYPE); \
+ if (likely((pg >> (i & 63)) & 1)) { \
+ TYPE nn = *(TYPE *)(vn + H(i)); \
+ TYPE mm = *(TYPE *)(vm + H(i)); \
+ out |= OP(TYPE, nn, mm, status); \
+ } \
+ } while (i & 63); \
+ d[j--] = out; \
+ } while (i > 0); \
+}
+
+#define DO_FPCMP_PPZZ_H(NAME, OP) \
+ DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
+#define DO_FPCMP_PPZZ_S(NAME, OP) \
+ DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
+#define DO_FPCMP_PPZZ_D(NAME, OP) \
+ DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
+
+#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
+ DO_FPCMP_PPZZ_H(NAME, OP) \
+ DO_FPCMP_PPZZ_S(NAME, OP) \
+ DO_FPCMP_PPZZ_D(NAME, OP)
+
+#define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
+#define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
+#define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
+#define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
+#define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
+#define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
+#define DO_FCMUO(TYPE, X, Y, ST) \
+ TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
+#define DO_FACGE(TYPE, X, Y, ST) \
+ TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
+#define DO_FACGT(TYPE, X, Y, ST) \
+ TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
+
+DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
+DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
+DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
+DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
+DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
+DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
+DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
+
+#undef DO_FPCMP_PPZZ_ALL
+#undef DO_FPCMP_PPZZ_D
+#undef DO_FPCMP_PPZZ_S
+#undef DO_FPCMP_PPZZ_H
+#undef DO_FPCMP_PPZZ
+
+/* One-operand floating-point comparison against zero, controlled
+ * by a predicate.
+ */
+#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vg, \
+ void *status, uint32_t desc) \
+{ \
+ intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
+ uint64_t *d = vd, *g = vg; \
+ do { \
+ uint64_t out = 0, pg = g[j]; \
+ do { \
+ i -= sizeof(TYPE), out <<= sizeof(TYPE); \
+ if ((pg >> (i & 63)) & 1) { \
+ TYPE nn = *(TYPE *)(vn + H(i)); \
+ out |= OP(TYPE, nn, 0, status); \
+ } \
+ } while (i & 63); \
+ d[j--] = out; \
+ } while (i > 0); \
+}
+
+#define DO_FPCMP_PPZ0_H(NAME, OP) \
+ DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
+#define DO_FPCMP_PPZ0_S(NAME, OP) \
+ DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
+#define DO_FPCMP_PPZ0_D(NAME, OP) \
+ DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
+
+#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
+ DO_FPCMP_PPZ0_H(NAME, OP) \
+ DO_FPCMP_PPZ0_S(NAME, OP) \
+ DO_FPCMP_PPZ0_D(NAME, OP)
+
+DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
+DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
+DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
+DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
+DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
+DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
+
+/* FP Trig Multiply-Add. */
+
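+/* The coefficient tables below hold the FTMAD constants from the
+ * architecture pseudocode: entries 0-7 are the sine-series terms
+ * (1, -1/3!, +1/5!, ...) and entries 8-15 the cosine-series terms
+ * (1, -1/2!, +1/4!, ...).  A negative multiplicand selects the
+ * cosine half via the xx += 8 adjustment.
+ */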
+void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
+{
+ static const float16 coeff[16] = {
+ 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ };
+ intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
+ intptr_t x = simd_data(desc);
+ float16 *d = vd, *n = vn, *m = vm;
+ for (i = 0; i < opr_sz; i++) {
+ float16 mm = m[i];
+ intptr_t xx = x;
+ if (float16_is_neg(mm)) {
+ mm = float16_abs(mm);
+ xx += 8;
+ }
+ d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
+ }
+}
+
+void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
+{
+ static const float32 coeff[16] = {
+ 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
+ 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
+ 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
+ 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
+ };
+ intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
+ intptr_t x = simd_data(desc);
+ float32 *d = vd, *n = vn, *m = vm;
+ for (i = 0; i < opr_sz; i++) {
+ float32 mm = m[i];
+ intptr_t xx = x;
+ if (float32_is_neg(mm)) {
+ mm = float32_abs(mm);
+ xx += 8;
+ }
+ d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
+ }
+}
+
+void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
+{
+ static const float64 coeff[16] = {
+ 0x3ff0000000000000ull, 0xbfc5555555555543ull,
+ 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
+ 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
+ 0x3de5d8408868552full, 0x0000000000000000ull,
+ 0x3ff0000000000000ull, 0xbfe0000000000000ull,
+ 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
+ 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
+ 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
+ };
+ intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
+ intptr_t x = simd_data(desc);
+ float64 *d = vd, *n = vn, *m = vm;
+ for (i = 0; i < opr_sz; i++) {
+ float64 mm = m[i];
+ intptr_t xx = x;
+ if (float64_is_neg(mm)) {
+ mm = float64_abs(mm);
+ xx += 8;
+ }
+ d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
+ }
+}
+
+/*
+ * FP Complex Add
+ */
+
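+/* Working through the sign selection below (rot names the single
+ * simd_data bit):
+ *
+ *   rot == 0 (FCADD #90):  d_real = n_real - m_imag
+ *                          d_imag = n_imag + m_real
+ *   rot == 1 (FCADD #270): d_real = n_real + m_imag
+ *                          d_imag = n_imag - m_real
+ */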
+void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
+ void *vs, uint32_t desc)
+{
+ intptr_t j, i = simd_oprsz(desc);
+ uint64_t *g = vg;
+ float16 neg_imag = float16_set_sign(0, simd_data(desc));
+ float16 neg_real = float16_chs(neg_imag);
+
+ do {
+ uint64_t pg = g[(i - 1) >> 6];
+ do {
+ float16 e0, e1, e2, e3;
+
+ /* I holds the real index; J holds the imag index. */
+ j = i - sizeof(float16);
+ i -= 2 * sizeof(float16);
+
+ e0 = *(float16 *)(vn + H1_2(i));
+ e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
+ e2 = *(float16 *)(vn + H1_2(j));
+ e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
+
+ if (likely((pg >> (i & 63)) & 1)) {
+ *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
+ }
+ if (likely((pg >> (j & 63)) & 1)) {
+ *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
+ }
+ } while (i & 63);
+ } while (i != 0);
+}
+
+void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
+ void *vs, uint32_t desc)
+{
+ intptr_t j, i = simd_oprsz(desc);
+ uint64_t *g = vg;
+ float32 neg_imag = float32_set_sign(0, simd_data(desc));
+ float32 neg_real = float32_chs(neg_imag);
+
+ do {
+ uint64_t pg = g[(i - 1) >> 6];
+ do {
+ float32 e0, e1, e2, e3;
+
+ /* I holds the real index; J holds the imag index. */
+ j = i - sizeof(float32);
+ i -= 2 * sizeof(float32);
+
+            e0 = *(float32 *)(vn + H1_4(i));
+            e1 = *(float32 *)(vm + H1_4(j)) ^ neg_real;
+            e2 = *(float32 *)(vn + H1_4(j));
+            e3 = *(float32 *)(vm + H1_4(i)) ^ neg_imag;
+
+ if (likely((pg >> (i & 63)) & 1)) {
+                *(float32 *)(vd + H1_4(i)) = float32_add(e0, e1, vs);
+            }
+            if (likely((pg >> (j & 63)) & 1)) {
+                *(float32 *)(vd + H1_4(j)) = float32_add(e2, e3, vs);
+ }
+ } while (i & 63);
+ } while (i != 0);
+}
+
+void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
+ void *vs, uint32_t desc)
+{
+ intptr_t j, i = simd_oprsz(desc);
+ uint64_t *g = vg;
+ float64 neg_imag = float64_set_sign(0, simd_data(desc));
+ float64 neg_real = float64_chs(neg_imag);
+
+ do {
+ uint64_t pg = g[(i - 1) >> 6];
+ do {
+ float64 e0, e1, e2, e3;
+
+ /* I holds the real index; J holds the imag index. */
+ j = i - sizeof(float64);
+ i -= 2 * sizeof(float64);
+
+            e0 = *(float64 *)(vn + i);
+            e1 = *(float64 *)(vm + j) ^ neg_real;
+            e2 = *(float64 *)(vn + j);
+            e3 = *(float64 *)(vm + i) ^ neg_imag;
+
+ if (likely((pg >> (i & 63)) & 1)) {
+                *(float64 *)(vd + i) = float64_add(e0, e1, vs);
+            }
+            if (likely((pg >> (j & 63)) & 1)) {
+                *(float64 *)(vd + j) = float64_add(e2, e3, vs);
+ }
+ } while (i & 63);
+ } while (i != 0);
+}
+
+/*
+ * FP Complex Multiply
+ */
+
+QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 22 > 32);
+
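+/* A worked decode of the two rot bits, following the flip, neg_real
+ * and neg_imag selection below:
+ *
+ *   rot 0 (#0):   d_r += n_r * m_r;  d_i += n_r * m_i
+ *   rot 1 (#90):  d_r -= n_i * m_i;  d_i += n_i * m_r
+ *   rot 2 (#180): d_r -= n_r * m_r;  d_i -= n_r * m_i
+ *   rot 3 (#270): d_r += n_i * m_i;  d_i -= n_i * m_r
+ */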
+void HELPER(sve_fcmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
+{
+ intptr_t j, i = simd_oprsz(desc);
+ unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
+ unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
+ unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
+ unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
+ unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
+ bool flip = rot & 1;
+ float16 neg_imag, neg_real;
+ void *vd = &env->vfp.zregs[rd];
+ void *vn = &env->vfp.zregs[rn];
+ void *vm = &env->vfp.zregs[rm];
+ void *va = &env->vfp.zregs[ra];
+ uint64_t *g = vg;
+
+ neg_imag = float16_set_sign(0, (rot & 2) != 0);
+ neg_real = float16_set_sign(0, rot == 1 || rot == 2);
+
+ do {
+ uint64_t pg = g[(i - 1) >> 6];
+ do {
+ float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
+
+ /* I holds the real index; J holds the imag index. */
+ j = i - sizeof(float16);
+ i -= 2 * sizeof(float16);
+
+ nr = *(float16 *)(vn + H1_2(i));
+ ni = *(float16 *)(vn + H1_2(j));
+ mr = *(float16 *)(vm + H1_2(i));
+ mi = *(float16 *)(vm + H1_2(j));
+
+ e2 = (flip ? ni : nr);
+ e1 = (flip ? mi : mr) ^ neg_real;
+ e4 = e2;
+ e3 = (flip ? mr : mi) ^ neg_imag;
+
+ if (likely((pg >> (i & 63)) & 1)) {
+ d = *(float16 *)(va + H1_2(i));
+ d = float16_muladd(e2, e1, d, 0, &env->vfp.fp_status_f16);
+ *(float16 *)(vd + H1_2(i)) = d;
+ }
+ if (likely((pg >> (j & 63)) & 1)) {
+ d = *(float16 *)(va + H1_2(j));
+ d = float16_muladd(e4, e3, d, 0, &env->vfp.fp_status_f16);
+ *(float16 *)(vd + H1_2(j)) = d;
+ }
+ } while (i & 63);
+ } while (i != 0);
+}
+
+void HELPER(sve_fcmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
+{
+ intptr_t j, i = simd_oprsz(desc);
+ unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
+ unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
+ unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
+ unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
+ unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
+ bool flip = rot & 1;
+ float32 neg_imag, neg_real;
+ void *vd = &env->vfp.zregs[rd];
+ void *vn = &env->vfp.zregs[rn];
+ void *vm = &env->vfp.zregs[rm];
+ void *va = &env->vfp.zregs[ra];
+ uint64_t *g = vg;
+
+ neg_imag = float32_set_sign(0, (rot & 2) != 0);
+ neg_real = float32_set_sign(0, rot == 1 || rot == 2);
+
+ do {
+ uint64_t pg = g[(i - 1) >> 6];
+ do {
+ float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
+
+ /* I holds the real index; J holds the imag index. */
+ j = i - sizeof(float32);
+ i -= 2 * sizeof(float32);
+
+            nr = *(float32 *)(vn + H1_4(i));
+            ni = *(float32 *)(vn + H1_4(j));
+            mr = *(float32 *)(vm + H1_4(i));
+            mi = *(float32 *)(vm + H1_4(j));
+
+ e2 = (flip ? ni : nr);
+ e1 = (flip ? mi : mr) ^ neg_real;
+ e4 = e2;
+ e3 = (flip ? mr : mi) ^ neg_imag;
+
+ if (likely((pg >> (i & 63)) & 1)) {
+                d = *(float32 *)(va + H1_4(i));
+                d = float32_muladd(e2, e1, d, 0, &env->vfp.fp_status);
+                *(float32 *)(vd + H1_4(i)) = d;
+            }
+            if (likely((pg >> (j & 63)) & 1)) {
+                d = *(float32 *)(va + H1_4(j));
+                d = float32_muladd(e4, e3, d, 0, &env->vfp.fp_status);
+                *(float32 *)(vd + H1_4(j)) = d;
+ }
+ } while (i & 63);
+ } while (i != 0);
+}
+
+void HELPER(sve_fcmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
+{
+ intptr_t j, i = simd_oprsz(desc);
+ unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
+ unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
+ unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
+ unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
+ unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
+ bool flip = rot & 1;
+ float64 neg_imag, neg_real;
+ void *vd = &env->vfp.zregs[rd];
+ void *vn = &env->vfp.zregs[rn];
+ void *vm = &env->vfp.zregs[rm];
+ void *va = &env->vfp.zregs[ra];
+ uint64_t *g = vg;
+
+ neg_imag = float64_set_sign(0, (rot & 2) != 0);
+ neg_real = float64_set_sign(0, rot == 1 || rot == 2);
+
+ do {
+ uint64_t pg = g[(i - 1) >> 6];
+ do {
+ float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
+
+ /* I holds the real index; J holds the imag index. */
+ j = i - sizeof(float64);
+ i -= 2 * sizeof(float64);
+
+            nr = *(float64 *)(vn + i);
+            ni = *(float64 *)(vn + j);
+            mr = *(float64 *)(vm + i);
+            mi = *(float64 *)(vm + j);
+
+ e2 = (flip ? ni : nr);
+ e1 = (flip ? mi : mr) ^ neg_real;
+ e4 = e2;
+ e3 = (flip ? mr : mi) ^ neg_imag;
+
+ if (likely((pg >> (i & 63)) & 1)) {
+                d = *(float64 *)(va + i);
+                d = float64_muladd(e2, e1, d, 0, &env->vfp.fp_status);
+                *(float64 *)(vd + i) = d;
+            }
+            if (likely((pg >> (j & 63)) & 1)) {
+                d = *(float64 *)(va + j);
+                d = float64_muladd(e4, e3, d, 0, &env->vfp.fp_status);
+                *(float64 *)(vd + j) = d;
+ }
+ } while (i & 63);
+ } while (i != 0);
+}
+
+/*
+ * Load contiguous data, protected by a governing predicate.
+ */
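+/* The predicate is consumed 16 bits at a time, one bit per byte of
+ * vector data: each uint16_t of VG covers the 16 bytes of one
+ * inner-loop pass.  Stepping I by sizeof(TYPEE) while shifting PG
+ * right by the same amount keeps the controlling bit for the current
+ * element in PG bit 0 (a description of the loops below, not extra
+ * code).
+ */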
+#define DO_LD1(NAME, FN, TYPEE, TYPEM, H) \
+static void do_##NAME(CPUARMState *env, void *vd, void *vg, \
+ target_ulong addr, intptr_t oprsz, \
+ uintptr_t ra) \
+{ \
+ intptr_t i = 0; \
+ do { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ TYPEM m = 0; \
+ if (pg & 1) { \
+ m = FN(env, addr, ra); \
+ } \
+ *(TYPEE *)(vd + H(i)) = m; \
+ i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
+ addr += sizeof(TYPEM); \
+ } while (i & 15); \
+ } while (i < oprsz); \
+} \
+void HELPER(NAME)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ do_##NAME(env, &env->vfp.zregs[simd_data(desc)], vg, \
+ addr, simd_oprsz(desc), GETPC()); \
+}
+
+#define DO_LD2(NAME, FN, TYPEE, TYPEM, H) \
+void HELPER(NAME)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ intptr_t ra = GETPC(); \
+ unsigned rd = simd_data(desc); \
+ void *d1 = &env->vfp.zregs[rd]; \
+ void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
+ for (i = 0; i < oprsz; ) { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ TYPEM m1 = 0, m2 = 0; \
+ if (pg & 1) { \
+ m1 = FN(env, addr, ra); \
+ m2 = FN(env, addr + sizeof(TYPEM), ra); \
+ } \
+ *(TYPEE *)(d1 + H(i)) = m1; \
+ *(TYPEE *)(d2 + H(i)) = m2; \
+ i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
+ addr += 2 * sizeof(TYPEM); \
+ } while (i & 15); \
+ } \
+}
+
+#define DO_LD3(NAME, FN, TYPEE, TYPEM, H) \
+void HELPER(NAME)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ intptr_t ra = GETPC(); \
+ unsigned rd = simd_data(desc); \
+ void *d1 = &env->vfp.zregs[rd]; \
+ void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
+ void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
+ for (i = 0; i < oprsz; ) { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ TYPEM m1 = 0, m2 = 0, m3 = 0; \
+ if (pg & 1) { \
+ m1 = FN(env, addr, ra); \
+ m2 = FN(env, addr + sizeof(TYPEM), ra); \
+ m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \
+ } \
+ *(TYPEE *)(d1 + H(i)) = m1; \
+ *(TYPEE *)(d2 + H(i)) = m2; \
+ *(TYPEE *)(d3 + H(i)) = m3; \
+ i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
+ addr += 3 * sizeof(TYPEM); \
+ } while (i & 15); \
+ } \
+}
+
+#define DO_LD4(NAME, FN, TYPEE, TYPEM, H) \
+void HELPER(NAME)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ intptr_t ra = GETPC(); \
+ unsigned rd = simd_data(desc); \
+ void *d1 = &env->vfp.zregs[rd]; \
+ void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
+ void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
+ void *d4 = &env->vfp.zregs[(rd + 3) & 31]; \
+ for (i = 0; i < oprsz; ) { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ TYPEM m1 = 0, m2 = 0, m3 = 0, m4 = 0; \
+ if (pg & 1) { \
+ m1 = FN(env, addr, ra); \
+ m2 = FN(env, addr + sizeof(TYPEM), ra); \
+ m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \
+ m4 = FN(env, addr + 3 * sizeof(TYPEM), ra); \
+ } \
+ *(TYPEE *)(d1 + H(i)) = m1; \
+ *(TYPEE *)(d2 + H(i)) = m2; \
+ *(TYPEE *)(d3 + H(i)) = m3; \
+ *(TYPEE *)(d4 + H(i)) = m4; \
+ i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
+ addr += 4 * sizeof(TYPEM); \
+ } while (i & 15); \
+ } \
+}
+
+DO_LD1(sve_ld1bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2)
+DO_LD1(sve_ld1bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2)
+DO_LD1(sve_ld1bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4)
+DO_LD1(sve_ld1bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4)
+DO_LD1(sve_ld1bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, )
+DO_LD1(sve_ld1bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, )
+
+DO_LD1(sve_ld1hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4)
+DO_LD1(sve_ld1hss_r, cpu_ldsw_data_ra, uint32_t, int16_t, H1_4)
+DO_LD1(sve_ld1hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, )
+DO_LD1(sve_ld1hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, )
+
+DO_LD1(sve_ld1sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, )
+DO_LD1(sve_ld1sds_r, cpu_ldl_data_ra, uint64_t, int32_t, )
+
+DO_LD1(sve_ld1bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
+DO_LD2(sve_ld2bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
+DO_LD3(sve_ld3bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
+DO_LD4(sve_ld4bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
+
+DO_LD1(sve_ld1hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
+DO_LD2(sve_ld2hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
+DO_LD3(sve_ld3hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
+DO_LD4(sve_ld4hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
+
+DO_LD1(sve_ld1ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
+DO_LD2(sve_ld2ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
+DO_LD3(sve_ld3ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
+DO_LD4(sve_ld4ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
+
+DO_LD1(sve_ld1dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
+DO_LD2(sve_ld2dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
+DO_LD3(sve_ld3dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
+DO_LD4(sve_ld4dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
+
+#undef DO_LD1
+#undef DO_LD2
+#undef DO_LD3
+#undef DO_LD4
+
+/*
+ * Load contiguous data, first-fault and no-fault.
+ */
+
+#ifdef CONFIG_USER_ONLY
+
+/* Fault on byte I. All bits in FFR from I are cleared. The vector
+ * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
+ * option, which leaves subsequent data unchanged.
+ */
+static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
+{
+ uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
+
+ if (i & 63) {
+ ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
+ i = ROUND_UP(i, 64);
+ }
+ for (; i < oprsz; i += 64) {
+ ffr[i / 64] = 0;
+ }
+}
+
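+/* E.g. (illustrative): record_fault(env, 70, 256) keeps FFR bits
+ * [0, 70), clears the rest of that 64-bit word via the mask, and
+ * zeroes the remaining words covering bits [128, 256).
+ */
+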
+/* Hold the mmap lock during the operation so that there is no race
+ * between page_check_range and the load operation. We expect the
+ * usual case to have no faults at all, so we check the whole range
+ * first and if successful defer to the normal load operation.
+ *
+ * TODO: Change mmap_lock to a rwlock so that multiple readers
+ * can run simultaneously. This will probably help other uses
+ * within QEMU as well.
+ */
+#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
+static void do_sve_ldff1##PART(CPUARMState *env, void *vd, void *vg, \
+ target_ulong addr, intptr_t oprsz, \
+ bool first, uintptr_t ra) \
+{ \
+ intptr_t i = 0; \
+ do { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ TYPEM m = 0; \
+ if (pg & 1) { \
+ if (!first && \
+ unlikely(page_check_range(addr, sizeof(TYPEM), \
+ PAGE_READ))) { \
+ record_fault(env, i, oprsz); \
+ return; \
+ } \
+ m = FN(env, addr, ra); \
+ first = false; \
+ } \
+ *(TYPEE *)(vd + H(i)) = m; \
+ i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
+ addr += sizeof(TYPEM); \
+ } while (i & 15); \
+ } while (i < oprsz); \
+} \
+void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ intptr_t oprsz = simd_oprsz(desc); \
+ unsigned rd = simd_data(desc); \
+ void *vd = &env->vfp.zregs[rd]; \
+ mmap_lock(); \
+ if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) { \
+ do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC()); \
+ } else { \
+ do_sve_ldff1##PART(env, vd, vg, addr, oprsz, true, GETPC()); \
+ } \
+ mmap_unlock(); \
+}
+
+/* No-fault loads are like first-fault loads, minus the special
+ * handling of the first active element.
+ */
+#define DO_LDNF1(PART) \
+void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ intptr_t oprsz = simd_oprsz(desc); \
+ unsigned rd = simd_data(desc); \
+ void *vd = &env->vfp.zregs[rd]; \
+ mmap_lock(); \
+ if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) { \
+ do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC()); \
+ } else { \
+ do_sve_ldff1##PART(env, vd, vg, addr, oprsz, false, GETPC()); \
+ } \
+ mmap_unlock(); \
+}
+
+#else
+
+/* TODO: System mode is not yet supported.
+ * This would probably use tlb_vaddr_to_host.
+ */
+#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
+void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ g_assert_not_reached(); \
+}
+
+#define DO_LDNF1(PART) \
+void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ g_assert_not_reached(); \
+}
+
+#endif
+
+DO_LDFF1(bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
+DO_LDFF1(bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2)
+DO_LDFF1(bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2)
+DO_LDFF1(bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4)
+DO_LDFF1(bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4)
+DO_LDFF1(bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, )
+DO_LDFF1(bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, )
+
+DO_LDFF1(hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
+DO_LDFF1(hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4)
+DO_LDFF1(hss_r, cpu_ldsw_data_ra, uint32_t, int16_t, H1_4)
+DO_LDFF1(hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, )
+DO_LDFF1(hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, )
+
+DO_LDFF1(ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
+DO_LDFF1(sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, )
+DO_LDFF1(sds_r, cpu_ldl_data_ra, uint64_t, int32_t, )
+
+DO_LDFF1(dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
+
+#undef DO_LDFF1
+
+DO_LDNF1(bb_r)
+DO_LDNF1(bhu_r)
+DO_LDNF1(bhs_r)
+DO_LDNF1(bsu_r)
+DO_LDNF1(bss_r)
+DO_LDNF1(bdu_r)
+DO_LDNF1(bds_r)
+
+DO_LDNF1(hh_r)
+DO_LDNF1(hsu_r)
+DO_LDNF1(hss_r)
+DO_LDNF1(hdu_r)
+DO_LDNF1(hds_r)
+
+DO_LDNF1(ss_r)
+DO_LDNF1(sdu_r)
+DO_LDNF1(sds_r)
+
+DO_LDNF1(dd_r)
+
+#undef DO_LDNF1
+
+/*
+ * Store contiguous data, protected by a governing predicate.
+ */
+#define DO_ST1(NAME, FN, TYPEE, TYPEM, H) \
+void HELPER(NAME)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ intptr_t ra = GETPC(); \
+ unsigned rd = simd_data(desc); \
+ void *vd = &env->vfp.zregs[rd]; \
+ for (i = 0; i < oprsz; ) { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ if (pg & 1) { \
+ TYPEM m = *(TYPEE *)(vd + H(i)); \
+ FN(env, addr, m, ra); \
+ } \
+ i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
+ addr += sizeof(TYPEM); \
+ } while (i & 15); \
+ } \
+}
+
+#define DO_ST1_D(NAME, FN, TYPEM) \
+void HELPER(NAME)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc) / 8; \
+ intptr_t ra = GETPC(); \
+ unsigned rd = simd_data(desc); \
+ uint64_t *d = &env->vfp.zregs[rd].d[0]; \
+ uint8_t *pg = vg; \
+ for (i = 0; i < oprsz; i += 1) { \
+ if (pg[H1(i)] & 1) { \
+ FN(env, addr, d[i], ra); \
+ } \
+ addr += sizeof(TYPEM); \
+ } \
+}
+
+#define DO_ST2(NAME, FN, TYPEE, TYPEM, H) \
+void HELPER(NAME)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ intptr_t ra = GETPC(); \
+ unsigned rd = simd_data(desc); \
+ void *d1 = &env->vfp.zregs[rd]; \
+ void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
+ for (i = 0; i < oprsz; ) { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ if (pg & 1) { \
+ TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
+ TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
+ FN(env, addr, m1, ra); \
+ FN(env, addr + sizeof(TYPEM), m2, ra); \
+ } \
+ i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
+ addr += 2 * sizeof(TYPEM); \
+ } while (i & 15); \
+ } \
+}
+
+#define DO_ST3(NAME, FN, TYPEE, TYPEM, H) \
+void HELPER(NAME)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ intptr_t ra = GETPC(); \
+ unsigned rd = simd_data(desc); \
+ void *d1 = &env->vfp.zregs[rd]; \
+ void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
+ void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
+ for (i = 0; i < oprsz; ) { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ if (pg & 1) { \
+ TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
+ TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
+ TYPEM m3 = *(TYPEE *)(d3 + H(i)); \
+ FN(env, addr, m1, ra); \
+ FN(env, addr + sizeof(TYPEM), m2, ra); \
+ FN(env, addr + 2 * sizeof(TYPEM), m3, ra); \
+ } \
+ i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
+ addr += 3 * sizeof(TYPEM); \
+ } while (i & 15); \
+ } \
+}
+
+#define DO_ST4(NAME, FN, TYPEE, TYPEM, H) \
+void HELPER(NAME)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ intptr_t ra = GETPC(); \
+ unsigned rd = simd_data(desc); \
+ void *d1 = &env->vfp.zregs[rd]; \
+ void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
+ void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
+ void *d4 = &env->vfp.zregs[(rd + 3) & 31]; \
+ for (i = 0; i < oprsz; ) { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ if (pg & 1) { \
+ TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
+ TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
+ TYPEM m3 = *(TYPEE *)(d3 + H(i)); \
+ TYPEM m4 = *(TYPEE *)(d4 + H(i)); \
+ FN(env, addr, m1, ra); \
+ FN(env, addr + sizeof(TYPEM), m2, ra); \
+ FN(env, addr + 2 * sizeof(TYPEM), m3, ra); \
+ FN(env, addr + 3 * sizeof(TYPEM), m4, ra); \
+ } \
+ i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
+ addr += 4 * sizeof(TYPEM); \
+ } while (i & 15); \
+ } \
+}
+
+DO_ST1(sve_st1bh_r, cpu_stb_data_ra, uint16_t, uint8_t, H1_2)
+DO_ST1(sve_st1bs_r, cpu_stb_data_ra, uint32_t, uint8_t, H1_4)
+DO_ST1_D(sve_st1bd_r, cpu_stb_data_ra, uint8_t)
+
+DO_ST1(sve_st1hs_r, cpu_stw_data_ra, uint32_t, uint16_t, H1_4)
+DO_ST1_D(sve_st1hd_r, cpu_stw_data_ra, uint16_t)
+
+DO_ST1_D(sve_st1sd_r, cpu_stl_data_ra, uint32_t)
+
+DO_ST1(sve_st1bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
+DO_ST2(sve_st2bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
+DO_ST3(sve_st3bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
+DO_ST4(sve_st4bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
+
+DO_ST1(sve_st1hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
+DO_ST2(sve_st2hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
+DO_ST3(sve_st3hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
+DO_ST4(sve_st4hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
+
+DO_ST1(sve_st1ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
+DO_ST2(sve_st2ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
+DO_ST3(sve_st3ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
+DO_ST4(sve_st4ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
+
+DO_ST1_D(sve_st1dd_r, cpu_stq_data_ra, uint64_t)
+
+void HELPER(sve_st2dd_r)(CPUARMState *env, void *vg,
+ target_ulong addr, uint32_t desc)
+{
+ intptr_t i, oprsz = simd_oprsz(desc) / 8;
+ intptr_t ra = GETPC();
+ unsigned rd = simd_data(desc);
+ uint64_t *d1 = &env->vfp.zregs[rd].d[0];
+ uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
+ uint8_t *pg = vg;
+
+ for (i = 0; i < oprsz; i += 1) {
+ if (pg[H1(i)] & 1) {
+ cpu_stq_data_ra(env, addr, d1[i], ra);
+ cpu_stq_data_ra(env, addr + 8, d2[i], ra);
+ }
+ addr += 2 * 8;
+ }
+}
+
+void HELPER(sve_st3dd_r)(CPUARMState *env, void *vg,
+ target_ulong addr, uint32_t desc)
+{
+ intptr_t i, oprsz = simd_oprsz(desc) / 8;
+ intptr_t ra = GETPC();
+ unsigned rd = simd_data(desc);
+ uint64_t *d1 = &env->vfp.zregs[rd].d[0];
+ uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
+ uint64_t *d3 = &env->vfp.zregs[(rd + 2) & 31].d[0];
+ uint8_t *pg = vg;
+
+ for (i = 0; i < oprsz; i += 1) {
+ if (pg[H1(i)] & 1) {
+ cpu_stq_data_ra(env, addr, d1[i], ra);
+ cpu_stq_data_ra(env, addr + 8, d2[i], ra);
+ cpu_stq_data_ra(env, addr + 16, d3[i], ra);
+ }
+ addr += 3 * 8;
+ }
+}
+
+void HELPER(sve_st4dd_r)(CPUARMState *env, void *vg,
+ target_ulong addr, uint32_t desc)
+{
+ intptr_t i, oprsz = simd_oprsz(desc) / 8;
+ intptr_t ra = GETPC();
+ unsigned rd = simd_data(desc);
+ uint64_t *d1 = &env->vfp.zregs[rd].d[0];
+ uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
+ uint64_t *d3 = &env->vfp.zregs[(rd + 2) & 31].d[0];
+ uint64_t *d4 = &env->vfp.zregs[(rd + 3) & 31].d[0];
+ uint8_t *pg = vg;
+
+ for (i = 0; i < oprsz; i += 1) {
+ if (pg[H1(i)] & 1) {
+ cpu_stq_data_ra(env, addr, d1[i], ra);
+ cpu_stq_data_ra(env, addr + 8, d2[i], ra);
+ cpu_stq_data_ra(env, addr + 16, d3[i], ra);
+ cpu_stq_data_ra(env, addr + 24, d4[i], ra);
+ }
+ addr += 4 * 8;
+ }
+}
+
+/* Loads with a vector index. */
+
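+/* Each active element loads from base + (index << scale), where the
+ * index is vector element I of ZM interpreted per TYPEI: zero- or
+ * sign-extended from 32 bits, or used as a full 64-bit offset.  A
+ * sketch of one element of the 64-bit form:
+ *
+ *   d[i] = FN(env, base + ((target_ulong)(TYPEI)m[i] << scale), ra);
+ */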
+#define DO_LD1_ZPZ_S(NAME, TYPEI, TYPEM, FN) \
+void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
+ target_ulong base, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ unsigned scale = simd_data(desc); \
+ uintptr_t ra = GETPC(); \
+    for (i = 0; i < oprsz; ) {                                        \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ TYPEM m = 0; \
+ if (pg & 1) { \
+ target_ulong off = *(TYPEI *)(vm + H1_4(i)); \
+ m = FN(env, base + (off << scale), ra); \
+ } \
+ *(uint32_t *)(vd + H1_4(i)) = m; \
+ i += 4, pg >>= 4; \
+ } while (i & 15); \
+ } \
+}
+
+#define DO_LD1_ZPZ_D(NAME, TYPEI, TYPEM, FN) \
+void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
+ target_ulong base, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc) / 8; \
+ unsigned scale = simd_data(desc); \
+ uintptr_t ra = GETPC(); \
+ uint64_t *d = vd, *m = vm; uint8_t *pg = vg; \
+ for (i = 0; i < oprsz; i++) { \
+ TYPEM mm = 0; \
+ if (pg[H1(i)] & 1) { \
+ target_ulong off = (TYPEI)m[i]; \
+ mm = FN(env, base + (off << scale), ra); \
+ } \
+ d[i] = mm; \
+ } \
+}
+
+DO_LD1_ZPZ_S(sve_ldbsu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
+DO_LD1_ZPZ_S(sve_ldhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
+DO_LD1_ZPZ_S(sve_ldssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
+DO_LD1_ZPZ_S(sve_ldbss_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
+DO_LD1_ZPZ_S(sve_ldhss_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
+
+DO_LD1_ZPZ_S(sve_ldbsu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
+DO_LD1_ZPZ_S(sve_ldhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
+DO_LD1_ZPZ_S(sve_ldssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
+DO_LD1_ZPZ_S(sve_ldbss_zss, int32_t, int8_t, cpu_ldub_data_ra)
+DO_LD1_ZPZ_S(sve_ldhss_zss, int32_t, int16_t, cpu_lduw_data_ra)
+
+DO_LD1_ZPZ_D(sve_ldbdu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
+DO_LD1_ZPZ_D(sve_ldhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
+DO_LD1_ZPZ_D(sve_ldsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
+DO_LD1_ZPZ_D(sve_ldddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
+DO_LD1_ZPZ_D(sve_ldbds_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
+DO_LD1_ZPZ_D(sve_ldhds_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
+DO_LD1_ZPZ_D(sve_ldsds_zsu, uint32_t, int32_t, cpu_ldl_data_ra)
+
+DO_LD1_ZPZ_D(sve_ldbdu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
+DO_LD1_ZPZ_D(sve_ldhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
+DO_LD1_ZPZ_D(sve_ldsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
+DO_LD1_ZPZ_D(sve_ldddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
+DO_LD1_ZPZ_D(sve_ldbds_zss, int32_t, int8_t, cpu_ldub_data_ra)
+DO_LD1_ZPZ_D(sve_ldhds_zss, int32_t, int16_t, cpu_lduw_data_ra)
+DO_LD1_ZPZ_D(sve_ldsds_zss, int32_t, int32_t, cpu_ldl_data_ra)
+
+DO_LD1_ZPZ_D(sve_ldbdu_zd, uint64_t, uint8_t, cpu_ldub_data_ra)
+DO_LD1_ZPZ_D(sve_ldhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
+DO_LD1_ZPZ_D(sve_ldsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
+DO_LD1_ZPZ_D(sve_ldddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
+DO_LD1_ZPZ_D(sve_ldbds_zd, uint64_t, int8_t, cpu_ldub_data_ra)
+DO_LD1_ZPZ_D(sve_ldhds_zd, uint64_t, int16_t, cpu_lduw_data_ra)
+DO_LD1_ZPZ_D(sve_ldsds_zd, uint64_t, int32_t, cpu_ldl_data_ra)
+
+/* First fault loads with a vector index. */
+
+#ifdef CONFIG_USER_ONLY
+
+#define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H) \
+void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
+ target_ulong base, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ unsigned scale = simd_data(desc); \
+ uintptr_t ra = GETPC(); \
+ bool first = true; \
+ mmap_lock(); \
+    for (i = 0; i < oprsz; ) {                                          \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ TYPEM m = 0; \
+ if (pg & 1) { \
+ target_ulong off = *(TYPEI *)(vm + H(i)); \
+ target_ulong addr = base + (off << scale); \
+ if (!first && \
+ page_check_range(addr, sizeof(TYPEM), PAGE_READ)) { \
+ record_fault(env, i, oprsz); \
+ goto exit; \
+ } \
+ m = FN(env, addr, ra); \
+ first = false; \
+ } \
+ *(TYPEE *)(vd + H(i)) = m; \
+ i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
+ } while (i & 15); \
+ } \
+ exit: \
+ mmap_unlock(); \
+}
+
+#else
+
+#define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H) \
+void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
+ target_ulong base, uint32_t desc) \
+{ \
+ g_assert_not_reached(); \
+}
+
+#endif
+
+#define DO_LDFF1_ZPZ_S(NAME, TYPEI, TYPEM, FN) \
+ DO_LDFF1_ZPZ(NAME, uint32_t, TYPEI, TYPEM, FN, H1_4)
+#define DO_LDFF1_ZPZ_D(NAME, TYPEI, TYPEM, FN) \
+ DO_LDFF1_ZPZ(NAME, uint64_t, TYPEI, TYPEM, FN, )
+
+DO_LDFF1_ZPZ_S(sve_ldffbsu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
+DO_LDFF1_ZPZ_S(sve_ldffhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
+DO_LDFF1_ZPZ_S(sve_ldffssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
+DO_LDFF1_ZPZ_S(sve_ldffbss_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
+DO_LDFF1_ZPZ_S(sve_ldffhss_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
+
+DO_LDFF1_ZPZ_S(sve_ldffbsu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
+DO_LDFF1_ZPZ_S(sve_ldffhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
+DO_LDFF1_ZPZ_S(sve_ldffssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
+DO_LDFF1_ZPZ_S(sve_ldffbss_zss, int32_t, int8_t, cpu_ldub_data_ra)
+DO_LDFF1_ZPZ_S(sve_ldffhss_zss, int32_t, int16_t, cpu_lduw_data_ra)
+
+DO_LDFF1_ZPZ_D(sve_ldffbdu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffbds_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffhds_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffsds_zsu, uint32_t, int32_t, cpu_ldl_data_ra)
+
+DO_LDFF1_ZPZ_D(sve_ldffbdu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffbds_zss, int32_t, int8_t, cpu_ldub_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffhds_zss, int32_t, int16_t, cpu_lduw_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffsds_zss, int32_t, int32_t, cpu_ldl_data_ra)
+
+DO_LDFF1_ZPZ_D(sve_ldffbdu_zd, uint64_t, uint8_t, cpu_ldub_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffbds_zd, uint64_t, int8_t, cpu_ldub_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffhds_zd, uint64_t, int16_t, cpu_lduw_data_ra)
+DO_LDFF1_ZPZ_D(sve_ldffsds_zd, uint64_t, int32_t, cpu_ldl_data_ra)
+
+/* Stores with a vector index. */
+
+#define DO_ST1_ZPZ_S(NAME, TYPEI, FN) \
+void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
+ target_ulong base, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ unsigned scale = simd_data(desc); \
+ uintptr_t ra = GETPC(); \
+ for (i = 0; i < oprsz; ) { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ if (likely(pg & 1)) { \
+ target_ulong off = *(TYPEI *)(vm + H1_4(i)); \
+ uint32_t d = *(uint32_t *)(vd + H1_4(i)); \
+ FN(env, base + (off << scale), d, ra); \
+ } \
+ i += sizeof(uint32_t), pg >>= sizeof(uint32_t); \
+ } while (i & 15); \
+ } \
+}
+
+#define DO_ST1_ZPZ_D(NAME, TYPEI, FN) \
+void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
+ target_ulong base, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc) / 8; \
+ unsigned scale = simd_data(desc); \
+ uintptr_t ra = GETPC(); \
+ uint64_t *d = vd, *m = vm; uint8_t *pg = vg; \
+ for (i = 0; i < oprsz; i++) { \
+ if (likely(pg[H1(i)] & 1)) { \
+ target_ulong off = (target_ulong)(TYPEI)m[i] << scale; \
+ FN(env, base + off, d[i], ra); \
+ } \
+ } \
+}
+
+DO_ST1_ZPZ_S(sve_stbs_zsu, uint32_t, cpu_stb_data_ra)
+DO_ST1_ZPZ_S(sve_sths_zsu, uint32_t, cpu_stw_data_ra)
+DO_ST1_ZPZ_S(sve_stss_zsu, uint32_t, cpu_stl_data_ra)
+
+DO_ST1_ZPZ_S(sve_stbs_zss, int32_t, cpu_stb_data_ra)
+DO_ST1_ZPZ_S(sve_sths_zss, int32_t, cpu_stw_data_ra)
+DO_ST1_ZPZ_S(sve_stss_zss, int32_t, cpu_stl_data_ra)
+
+DO_ST1_ZPZ_D(sve_stbd_zsu, uint32_t, cpu_stb_data_ra)
+DO_ST1_ZPZ_D(sve_sthd_zsu, uint32_t, cpu_stw_data_ra)
+DO_ST1_ZPZ_D(sve_stsd_zsu, uint32_t, cpu_stl_data_ra)
+DO_ST1_ZPZ_D(sve_stdd_zsu, uint32_t, cpu_stq_data_ra)
+
+DO_ST1_ZPZ_D(sve_stbd_zss, int32_t, cpu_stb_data_ra)
+DO_ST1_ZPZ_D(sve_sthd_zss, int32_t, cpu_stw_data_ra)
+DO_ST1_ZPZ_D(sve_stsd_zss, int32_t, cpu_stl_data_ra)
+DO_ST1_ZPZ_D(sve_stdd_zss, int32_t, cpu_stq_data_ra)
+
+DO_ST1_ZPZ_D(sve_stbd_zd, uint64_t, cpu_stb_data_ra)
+DO_ST1_ZPZ_D(sve_sthd_zd, uint64_t, cpu_stw_data_ra)
+DO_ST1_ZPZ_D(sve_stsd_zd, uint64_t, cpu_stl_data_ra)
+DO_ST1_ZPZ_D(sve_stdd_zd, uint64_t, cpu_stq_data_ra)
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 8d8a4cecb0..45a6c2a3aa 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -640,6 +640,16 @@ static void gen_gvec_op3(DisasContext *s, bool is_q, int rd,
vec_full_reg_size(s), gvec_op);
}
+/* Expand a 3-operand operation using an out-of-line helper. */
+static void gen_gvec_op3_ool(DisasContext *s, bool is_q, int rd,
+ int rn, int rm, int data, gen_helper_gvec_3 *fn)
+{
+ tcg_gen_gvec_3_ool(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm),
+ is_q ? 16 : 8, vec_full_reg_size(s), data, fn);
+}
+
/* Expand a 3-operand + env pointer operation using
* an out-of-line helper.
*/
@@ -1623,11 +1633,10 @@ static void handle_sys(DisasContext *s, uint32_t insn, bool isread,
default:
break;
}
- if ((ri->type & ARM_CP_SVE) && !sve_access_check(s)) {
- return;
- }
if ((ri->type & ARM_CP_FPU) && !fp_access_check(s)) {
return;
+ } else if ((ri->type & ARM_CP_SVE) && !sve_access_check(s)) {
+ return;
}
if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) {
@@ -11336,6 +11345,14 @@ static void disas_simd_three_reg_same_extra(DisasContext *s, uint32_t insn)
}
feature = ARM_FEATURE_V8_RDM;
break;
+ case 0x02: /* SDOT (vector) */
+ case 0x12: /* UDOT (vector) */
+ if (size != MO_32) {
+ unallocated_encoding(s);
+ return;
+ }
+ feature = ARM_FEATURE_V8_DOTPROD;
+ break;
case 0x8: /* FCMLA, #0 */
case 0x9: /* FCMLA, #90 */
case 0xa: /* FCMLA, #180 */
@@ -11389,6 +11406,11 @@ static void disas_simd_three_reg_same_extra(DisasContext *s, uint32_t insn)
}
return;
+ case 0x2: /* SDOT / UDOT */
+ gen_gvec_op3_ool(s, is_q, rd, rn, rm, 0,
+ u ? gen_helper_gvec_udot_b : gen_helper_gvec_sdot_b);
+ return;
+
case 0x8: /* FCMLA, #0 */
case 0x9: /* FCMLA, #90 */
case 0xa: /* FCMLA, #180 */
@@ -12568,6 +12590,13 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn)
return;
}
break;
+ case 0x0e: /* SDOT */
+ case 0x1e: /* UDOT */
+ if (size != MO_32 || !arm_dc_feature(s, ARM_FEATURE_V8_DOTPROD)) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
case 0x11: /* FCMLA #0 */
case 0x13: /* FCMLA #90 */
case 0x15: /* FCMLA #180 */
@@ -12665,19 +12694,28 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn)
}
switch (16 * u + opcode) {
+ case 0x0e: /* SDOT */
+ case 0x1e: /* UDOT */
+ gen_gvec_op3_ool(s, is_q, rd, rn, rm, index,
+ u ? gen_helper_gvec_udot_idx_b
+ : gen_helper_gvec_sdot_idx_b);
+ return;
case 0x11: /* FCMLA #0 */
case 0x13: /* FCMLA #90 */
case 0x15: /* FCMLA #180 */
case 0x17: /* FCMLA #270 */
- tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd),
- vec_full_reg_offset(s, rn),
- vec_reg_offset(s, rm, index, size), fpst,
- is_q ? 16 : 8, vec_full_reg_size(s),
- extract32(insn, 13, 2), /* rot */
- size == MO_64
- ? gen_helper_gvec_fcmlas_idx
- : gen_helper_gvec_fcmlah_idx);
- tcg_temp_free_ptr(fpst);
+ {
+ int rot = extract32(insn, 13, 2);
+ int data = (index << 2) | rot;
+ tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm), fpst,
+ is_q ? 16 : 8, vec_full_reg_size(s), data,
+ size == MO_64
+ ? gen_helper_gvec_fcmlas_idx
+ : gen_helper_gvec_fcmlah_idx);
+ tcg_temp_free_ptr(fpst);
+ }
return;
}
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 226c97579c..c080345b9c 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -32,6 +32,7 @@
#include "exec/log.h"
#include "trace-tcg.h"
#include "translate-a64.h"
+#include "fpu/softfloat.h"
typedef void GVecGen2sFn(unsigned, uint32_t, uint32_t,
@@ -42,6 +43,10 @@ typedef void gen_helper_gvec_flags_3(TCGv_i32, TCGv_ptr, TCGv_ptr,
typedef void gen_helper_gvec_flags_4(TCGv_i32, TCGv_ptr, TCGv_ptr,
TCGv_ptr, TCGv_ptr, TCGv_i32);
+typedef void gen_helper_gvec_mem(TCGv_env, TCGv_ptr, TCGv_i64, TCGv_i32);
+typedef void gen_helper_gvec_mem_scatter(TCGv_env, TCGv_ptr, TCGv_ptr,
+ TCGv_ptr, TCGv_i64, TCGv_i32);
+
/*
* Helpers for extracting complex instruction fields.
*/
@@ -82,6 +87,15 @@ static inline int expand_imm_sh8u(int x)
return (uint8_t)x << (x & 0x100 ? 8 : 0);
}
+/* Convert a 2-bit memory size (msz) to a 4-bit data type (dtype)
+ * with unsigned data.  Cf. the SVE Memory Contiguous Load Group.
+ */
+static inline int msz_dtype(int msz)
+{
+ static const uint8_t dtype[4] = { 0, 5, 10, 15 };
+ return dtype[msz];
+}
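+
+/* E.g. msz == MO_16 yields dtype 5, the unsigned LD1H encoding; these
+ * are the diagonal entries of the dtype table, where the memory size
+ * equals the element size (our reading of the encoding).
+ */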
+
/*
* Include the generated decoder.
*/
@@ -337,6 +351,23 @@ static bool do_zpzz_ool(DisasContext *s, arg_rprr_esz *a, gen_helper_gvec_4 *fn)
return true;
}
+/* Select active elements from Zn and inactive elements from Zm,
+ * storing the result in Zd.
+ */
+static void do_sel_z(DisasContext *s, int rd, int rn, int rm, int pg, int esz)
+{
+ static gen_helper_gvec_4 * const fns[4] = {
+ gen_helper_sve_sel_zpzz_b, gen_helper_sve_sel_zpzz_h,
+ gen_helper_sve_sel_zpzz_s, gen_helper_sve_sel_zpzz_d
+ };
+ unsigned vsz = vec_full_reg_size(s);
+ tcg_gen_gvec_4_ool(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm),
+ pred_full_reg_offset(s, pg),
+ vsz, vsz, 0, fns[esz]);
+}
+
#define DO_ZPZZ(NAME, name) \
static bool trans_##NAME##_zpzz(DisasContext *s, arg_rprr_esz *a, \
uint32_t insn) \
@@ -387,7 +418,13 @@ static bool trans_UDIV_zpzz(DisasContext *s, arg_rprr_esz *a, uint32_t insn)
return do_zpzz_ool(s, a, fns[a->esz]);
}
-DO_ZPZZ(SEL, sel)
+static bool trans_SEL_zpzz(DisasContext *s, arg_rprr_esz *a, uint32_t insn)
+{
+ if (sve_access_check(s)) {
+ do_sel_z(s, a->rd, a->rn, a->rm, a->pg, a->esz);
+ }
+ return true;
+}
#undef DO_ZPZZ
@@ -595,6 +632,20 @@ static bool do_clr_zp(DisasContext *s, int rd, int pg, int esz)
return true;
}
+/* Copy Zn into Zd, storing zeros into inactive elements. */
+static void do_movz_zpz(DisasContext *s, int rd, int rn, int pg, int esz)
+{
+ static gen_helper_gvec_3 * const fns[4] = {
+ gen_helper_sve_movz_b, gen_helper_sve_movz_h,
+ gen_helper_sve_movz_s, gen_helper_sve_movz_d,
+ };
+ unsigned vsz = vec_full_reg_size(s);
+ tcg_gen_gvec_3_ool(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ pred_full_reg_offset(s, pg),
+ vsz, vsz, 0, fns[esz]);
+}
+
static bool do_zpzi_ool(DisasContext *s, arg_rpri_esz *a,
gen_helper_gvec_3 *fn)
{
@@ -3372,6 +3423,310 @@ DO_ZZI(UMIN, umin)
#undef DO_ZZI
+static bool trans_DOT_zzz(DisasContext *s, arg_DOT_zzz *a, uint32_t insn)
+{
+ static gen_helper_gvec_3 * const fns[2][2] = {
+ { gen_helper_gvec_sdot_b, gen_helper_gvec_sdot_h },
+ { gen_helper_gvec_udot_b, gen_helper_gvec_udot_h }
+ };
+
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ vec_full_reg_offset(s, a->rm),
+ vsz, vsz, 0, fns[a->u][a->sz]);
+ }
+ return true;
+}
+
+static bool trans_DOT_zzx(DisasContext *s, arg_DOT_zzx *a, uint32_t insn)
+{
+ static gen_helper_gvec_3 * const fns[2][2] = {
+ { gen_helper_gvec_sdot_idx_b, gen_helper_gvec_sdot_idx_h },
+ { gen_helper_gvec_udot_idx_b, gen_helper_gvec_udot_idx_h }
+ };
+
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ vec_full_reg_offset(s, a->rm),
+ vsz, vsz, a->index, fns[a->u][a->sz]);
+ }
+ return true;
+}
+
+
+/*
+ *** SVE Floating Point Multiply-Add Indexed Group
+ */
+
+static bool trans_FMLA_zzxz(DisasContext *s, arg_FMLA_zzxz *a, uint32_t insn)
+{
+ static gen_helper_gvec_4_ptr * const fns[3] = {
+ gen_helper_gvec_fmla_idx_h,
+ gen_helper_gvec_fmla_idx_s,
+ gen_helper_gvec_fmla_idx_d,
+ };
+
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16);
+ tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ vec_full_reg_offset(s, a->rm),
+ vec_full_reg_offset(s, a->ra),
+ status, vsz, vsz, (a->index << 1) | a->sub,
+ fns[a->esz - 1]);
+ tcg_temp_free_ptr(status);
+ }
+ return true;
+}
+
+/*
+ *** SVE Floating Point Multiply Indexed Group
+ */
+
+static bool trans_FMUL_zzx(DisasContext *s, arg_FMUL_zzx *a, uint32_t insn)
+{
+ static gen_helper_gvec_3_ptr * const fns[3] = {
+ gen_helper_gvec_fmul_idx_h,
+ gen_helper_gvec_fmul_idx_s,
+ gen_helper_gvec_fmul_idx_d,
+ };
+
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16);
+ tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ vec_full_reg_offset(s, a->rm),
+ status, vsz, vsz, a->index, fns[a->esz - 1]);
+ tcg_temp_free_ptr(status);
+ }
+ return true;
+}
+
+/*
+ *** SVE Floating Point Fast Reduction Group
+ */
+
+typedef void gen_helper_fp_reduce(TCGv_i64, TCGv_ptr, TCGv_ptr,
+ TCGv_ptr, TCGv_i32);
+
+static void do_reduce(DisasContext *s, arg_rpr_esz *a,
+ gen_helper_fp_reduce *fn)
+{
+ unsigned vsz = vec_full_reg_size(s);
+ unsigned p2vsz = pow2ceil(vsz);
+ TCGv_i32 t_desc = tcg_const_i32(simd_desc(vsz, p2vsz, 0));
+ TCGv_ptr t_zn, t_pg, status;
+ TCGv_i64 temp;
+
+ temp = tcg_temp_new_i64();
+ t_zn = tcg_temp_new_ptr();
+ t_pg = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, a->rn));
+ tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, a->pg));
+ status = get_fpstatus_ptr(a->esz == MO_16);
+
+ fn(temp, t_zn, t_pg, status, t_desc);
+ tcg_temp_free_ptr(t_zn);
+ tcg_temp_free_ptr(t_pg);
+ tcg_temp_free_ptr(status);
+ tcg_temp_free_i32(t_desc);
+
+ write_fp_dreg(s, a->rd, temp);
+ tcg_temp_free_i64(temp);
+}
+
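+/* The second simd_desc field is pow2ceil(vsz) because the helper
+ * performs a recursive pairwise reduction over that many elements,
+ * padding inactive lanes and the tail with the operation's identity
+ * value (our reading of the DO_REDUCE helpers in sve_helper.c).
+ */
+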
+#define DO_VPZ(NAME, name) \
+static bool trans_##NAME(DisasContext *s, arg_rpr_esz *a, uint32_t insn) \
+{ \
+ static gen_helper_fp_reduce * const fns[3] = { \
+ gen_helper_sve_##name##_h, \
+ gen_helper_sve_##name##_s, \
+ gen_helper_sve_##name##_d, \
+ }; \
+ if (a->esz == 0) { \
+ return false; \
+ } \
+ if (sve_access_check(s)) { \
+ do_reduce(s, a, fns[a->esz - 1]); \
+ } \
+ return true; \
+}
+
+DO_VPZ(FADDV, faddv)
+DO_VPZ(FMINNMV, fminnmv)
+DO_VPZ(FMAXNMV, fmaxnmv)
+DO_VPZ(FMINV, fminv)
+DO_VPZ(FMAXV, fmaxv)
+
+/*
+ *** SVE Floating Point Unary Operations - Unpredicated Group
+ */
+
+static void do_zz_fp(DisasContext *s, arg_rr_esz *a, gen_helper_gvec_2_ptr *fn)
+{
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16);
+
+ tcg_gen_gvec_2_ptr(vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ status, vsz, vsz, 0, fn);
+ tcg_temp_free_ptr(status);
+}
+
+static bool trans_FRECPE(DisasContext *s, arg_rr_esz *a, uint32_t insn)
+{
+ static gen_helper_gvec_2_ptr * const fns[3] = {
+ gen_helper_gvec_frecpe_h,
+ gen_helper_gvec_frecpe_s,
+ gen_helper_gvec_frecpe_d,
+ };
+ if (a->esz == 0) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ do_zz_fp(s, a, fns[a->esz - 1]);
+ }
+ return true;
+}
+
+static bool trans_FRSQRTE(DisasContext *s, arg_rr_esz *a, uint32_t insn)
+{
+ static gen_helper_gvec_2_ptr * const fns[3] = {
+ gen_helper_gvec_frsqrte_h,
+ gen_helper_gvec_frsqrte_s,
+ gen_helper_gvec_frsqrte_d,
+ };
+ if (a->esz == 0) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ do_zz_fp(s, a, fns[a->esz - 1]);
+ }
+ return true;
+}
+
+/*
+ *** SVE Floating Point Compare with Zero Group
+ */
+
+static void do_ppz_fp(DisasContext *s, arg_rpr_esz *a,
+ gen_helper_gvec_3_ptr *fn)
+{
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16);
+
+ tcg_gen_gvec_3_ptr(pred_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ pred_full_reg_offset(s, a->pg),
+ status, vsz, vsz, 0, fn);
+ tcg_temp_free_ptr(status);
+}
+
+#define DO_PPZ(NAME, name) \
+static bool trans_##NAME(DisasContext *s, arg_rpr_esz *a, uint32_t insn) \
+{ \
+ static gen_helper_gvec_3_ptr * const fns[3] = { \
+ gen_helper_sve_##name##_h, \
+ gen_helper_sve_##name##_s, \
+ gen_helper_sve_##name##_d, \
+ }; \
+ if (a->esz == 0) { \
+ return false; \
+ } \
+ if (sve_access_check(s)) { \
+ do_ppz_fp(s, a, fns[a->esz - 1]); \
+ } \
+ return true; \
+}
+
+DO_PPZ(FCMGE_ppz0, fcmge0)
+DO_PPZ(FCMGT_ppz0, fcmgt0)
+DO_PPZ(FCMLE_ppz0, fcmle0)
+DO_PPZ(FCMLT_ppz0, fcmlt0)
+DO_PPZ(FCMEQ_ppz0, fcmeq0)
+DO_PPZ(FCMNE_ppz0, fcmne0)
+
+#undef DO_PPZ
+
+/*
+ *** SVE floating-point trig multiply-add coefficient
+ */
+
+static bool trans_FTMAD(DisasContext *s, arg_FTMAD *a, uint32_t insn)
+{
+ static gen_helper_gvec_3_ptr * const fns[3] = {
+ gen_helper_sve_ftmad_h,
+ gen_helper_sve_ftmad_s,
+ gen_helper_sve_ftmad_d,
+ };
+
+ if (a->esz == 0) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16);
+ tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ vec_full_reg_offset(s, a->rm),
+ status, vsz, vsz, a->imm, fns[a->esz - 1]);
+ tcg_temp_free_ptr(status);
+ }
+ return true;
+}
+
+/*
+ *** SVE Floating Point Accumulating Reduction Group
+ */
+
+static bool trans_FADDA(DisasContext *s, arg_rprr_esz *a, uint32_t insn)
+{
+ typedef void fadda_fn(TCGv_i64, TCGv_i64, TCGv_ptr,
+ TCGv_ptr, TCGv_ptr, TCGv_i32);
+ static fadda_fn * const fns[3] = {
+ gen_helper_sve_fadda_h,
+ gen_helper_sve_fadda_s,
+ gen_helper_sve_fadda_d,
+ };
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr t_rm, t_pg, t_fpst;
+ TCGv_i64 t_val;
+ TCGv_i32 t_desc;
+
+ if (a->esz == 0) {
+ return false;
+ }
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ t_val = load_esz(cpu_env, vec_reg_offset(s, a->rn, 0, a->esz), a->esz);
+ t_rm = tcg_temp_new_ptr();
+ t_pg = tcg_temp_new_ptr();
+ tcg_gen_addi_ptr(t_rm, cpu_env, vec_full_reg_offset(s, a->rm));
+ tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, a->pg));
+ t_fpst = get_fpstatus_ptr(a->esz == MO_16);
+ t_desc = tcg_const_i32(simd_desc(vsz, vsz, 0));
+
+ fns[a->esz - 1](t_val, t_val, t_rm, t_pg, t_fpst, t_desc);
+
+ tcg_temp_free_i32(t_desc);
+ tcg_temp_free_ptr(t_fpst);
+ tcg_temp_free_ptr(t_pg);
+ tcg_temp_free_ptr(t_rm);
+
+ write_fp_dreg(s, a->rd, t_val);
+ tcg_temp_free_i64(t_val);
+ return true;
+}
+
/*
*** SVE Floating Point Arithmetic - Unpredicated Group
*/
@@ -3415,6 +3770,592 @@ DO_FP3(FRSQRTS, rsqrts)
#undef DO_FP3
+/*
+ *** SVE Floating Point Arithmetic - Predicated Group
+ */
+
+static bool do_zpzz_fp(DisasContext *s, arg_rprr_esz *a,
+ gen_helper_gvec_4_ptr *fn)
+{
+ if (fn == NULL) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16);
+ tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ vec_full_reg_offset(s, a->rm),
+ pred_full_reg_offset(s, a->pg),
+ status, vsz, vsz, 0, fn);
+ tcg_temp_free_ptr(status);
+ }
+ return true;
+}
+
+#define DO_FP3(NAME, name) \
+static bool trans_##NAME(DisasContext *s, arg_rprr_esz *a, uint32_t insn) \
+{ \
+ static gen_helper_gvec_4_ptr * const fns[4] = { \
+ NULL, gen_helper_sve_##name##_h, \
+ gen_helper_sve_##name##_s, gen_helper_sve_##name##_d \
+ }; \
+ return do_zpzz_fp(s, a, fns[a->esz]); \
+}
+
+DO_FP3(FADD_zpzz, fadd)
+DO_FP3(FSUB_zpzz, fsub)
+DO_FP3(FMUL_zpzz, fmul)
+DO_FP3(FMIN_zpzz, fmin)
+DO_FP3(FMAX_zpzz, fmax)
+DO_FP3(FMINNM_zpzz, fminnum)
+DO_FP3(FMAXNM_zpzz, fmaxnum)
+DO_FP3(FABD, fabd)
+DO_FP3(FSCALE, fscalbn)
+DO_FP3(FDIV, fdiv)
+DO_FP3(FMULX, fmulx)
+
+#undef DO_FP3
+
+typedef void gen_helper_sve_fp2scalar(TCGv_ptr, TCGv_ptr, TCGv_ptr,
+ TCGv_i64, TCGv_ptr, TCGv_i32);
+
+static void do_fp_scalar(DisasContext *s, int zd, int zn, int pg, bool is_fp16,
+ TCGv_i64 scalar, gen_helper_sve_fp2scalar *fn)
+{
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr t_zd, t_zn, t_pg, status;
+ TCGv_i32 desc;
+
+ t_zd = tcg_temp_new_ptr();
+ t_zn = tcg_temp_new_ptr();
+ t_pg = tcg_temp_new_ptr();
+ tcg_gen_addi_ptr(t_zd, cpu_env, vec_full_reg_offset(s, zd));
+ tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, zn));
+ tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg));
+
+ status = get_fpstatus_ptr(is_fp16);
+ desc = tcg_const_i32(simd_desc(vsz, vsz, 0));
+ fn(t_zd, t_zn, t_pg, scalar, status, desc);
+
+ tcg_temp_free_i32(desc);
+ tcg_temp_free_ptr(status);
+ tcg_temp_free_ptr(t_pg);
+ tcg_temp_free_ptr(t_zn);
+ tcg_temp_free_ptr(t_zd);
+}
+
+static void do_fp_imm(DisasContext *s, arg_rpri_esz *a, uint64_t imm,
+ gen_helper_sve_fp2scalar *fn)
+{
+ TCGv_i64 temp = tcg_const_i64(imm);
+ do_fp_scalar(s, a->rd, a->rn, a->pg, a->esz == MO_16, temp, fn);
+ tcg_temp_free_i64(temp);
+}
+
+#define DO_FP_IMM(NAME, name, const0, const1) \
+static bool trans_##NAME##_zpzi(DisasContext *s, arg_rpri_esz *a, \
+ uint32_t insn) \
+{ \
+ static gen_helper_sve_fp2scalar * const fns[3] = { \
+ gen_helper_sve_##name##_h, \
+ gen_helper_sve_##name##_s, \
+ gen_helper_sve_##name##_d \
+ }; \
+ static uint64_t const val[3][2] = { \
+ { float16_##const0, float16_##const1 }, \
+ { float32_##const0, float32_##const1 }, \
+ { float64_##const0, float64_##const1 }, \
+ }; \
+ if (a->esz == 0) { \
+ return false; \
+ } \
+ if (sve_access_check(s)) { \
+ do_fp_imm(s, a, val[a->esz - 1][a->imm], fns[a->esz - 1]); \
+ } \
+ return true; \
+}
+
+#define float16_two make_float16(0x4000)
+#define float32_two make_float32(0x40000000)
+#define float64_two make_float64(0x4000000000000000ULL)
+
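+/* The one-bit immediate selects between two fixed constants per
+ * insn, e.g. 0.5 vs 1.0 for FADD and 0.5 vs 2.0 for FMUL; the
+ * val[][] table in DO_FP_IMM supplies each pair.
+ */
+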
+DO_FP_IMM(FADD, fadds, half, one)
+DO_FP_IMM(FSUB, fsubs, half, one)
+DO_FP_IMM(FMUL, fmuls, half, two)
+DO_FP_IMM(FSUBR, fsubrs, half, one)
+DO_FP_IMM(FMAXNM, fmaxnms, zero, one)
+DO_FP_IMM(FMINNM, fminnms, zero, one)
+DO_FP_IMM(FMAX, fmaxs, zero, one)
+DO_FP_IMM(FMIN, fmins, zero, one)
+
+#undef DO_FP_IMM
+
+static bool do_fp_cmp(DisasContext *s, arg_rprr_esz *a,
+ gen_helper_gvec_4_ptr *fn)
+{
+ if (fn == NULL) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16);
+ tcg_gen_gvec_4_ptr(pred_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ vec_full_reg_offset(s, a->rm),
+ pred_full_reg_offset(s, a->pg),
+ status, vsz, vsz, 0, fn);
+ tcg_temp_free_ptr(status);
+ }
+ return true;
+}
+
+#define DO_FPCMP(NAME, name) \
+static bool trans_##NAME##_ppzz(DisasContext *s, arg_rprr_esz *a, \
+ uint32_t insn) \
+{ \
+ static gen_helper_gvec_4_ptr * const fns[4] = { \
+ NULL, gen_helper_sve_##name##_h, \
+ gen_helper_sve_##name##_s, gen_helper_sve_##name##_d \
+ }; \
+ return do_fp_cmp(s, a, fns[a->esz]); \
+}
+
+DO_FPCMP(FCMGE, fcmge)
+DO_FPCMP(FCMGT, fcmgt)
+DO_FPCMP(FCMEQ, fcmeq)
+DO_FPCMP(FCMNE, fcmne)
+DO_FPCMP(FCMUO, fcmuo)
+DO_FPCMP(FACGE, facge)
+DO_FPCMP(FACGT, facgt)
+
+#undef DO_FPCMP
+
+static bool trans_FCADD(DisasContext *s, arg_FCADD *a, uint32_t insn)
+{
+ static gen_helper_gvec_4_ptr * const fns[3] = {
+ gen_helper_sve_fcadd_h,
+ gen_helper_sve_fcadd_s,
+ gen_helper_sve_fcadd_d
+ };
+
+ if (a->esz == 0) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16);
+ tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ vec_full_reg_offset(s, a->rm),
+ pred_full_reg_offset(s, a->pg),
+ status, vsz, vsz, a->rot, fns[a->esz - 1]);
+ tcg_temp_free_ptr(status);
+ }
+ return true;
+}
+
+typedef void gen_helper_sve_fmla(TCGv_env, TCGv_ptr, TCGv_i32);
+
+static bool do_fmla(DisasContext *s, arg_rprrr_esz *a, gen_helper_sve_fmla *fn)
+{
+ if (fn == NULL) {
+ return false;
+ }
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ unsigned vsz = vec_full_reg_size(s);
+ unsigned desc;
+ TCGv_i32 t_desc;
+ TCGv_ptr pg = tcg_temp_new_ptr();
+
+ /* We would need 7 operands to pass these arguments "properly".
+ * So we encode all the register numbers into the descriptor.
+ */
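+ /* The resulting data value is laid out as:
+ * bits [4:0] zd, [9:5] zn, [14:10] zm, [19:15] za.
+ */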
+ desc = deposit32(a->rd, 5, 5, a->rn);
+ desc = deposit32(desc, 10, 5, a->rm);
+ desc = deposit32(desc, 15, 5, a->ra);
+ desc = simd_desc(vsz, vsz, desc);
+
+ t_desc = tcg_const_i32(desc);
+ tcg_gen_addi_ptr(pg, cpu_env, pred_full_reg_offset(s, a->pg));
+ fn(cpu_env, pg, t_desc);
+ tcg_temp_free_i32(t_desc);
+ tcg_temp_free_ptr(pg);
+ return true;
+}
+
+#define DO_FMLA(NAME, name) \
+static bool trans_##NAME(DisasContext *s, arg_rprrr_esz *a, uint32_t insn) \
+{ \
+ static gen_helper_sve_fmla * const fns[4] = { \
+ NULL, gen_helper_sve_##name##_h, \
+ gen_helper_sve_##name##_s, gen_helper_sve_##name##_d \
+ }; \
+ return do_fmla(s, a, fns[a->esz]); \
+}
+
+DO_FMLA(FMLA_zpzzz, fmla_zpzzz)
+DO_FMLA(FMLS_zpzzz, fmls_zpzzz)
+DO_FMLA(FNMLA_zpzzz, fnmla_zpzzz)
+DO_FMLA(FNMLS_zpzzz, fnmls_zpzzz)
+
+#undef DO_FMLA
+
+static bool trans_FCMLA_zpzzz(DisasContext *s,
+ arg_FCMLA_zpzzz *a, uint32_t insn)
+{
+ static gen_helper_sve_fmla * const fns[3] = {
+ gen_helper_sve_fcmla_zpzzz_h,
+ gen_helper_sve_fcmla_zpzzz_s,
+ gen_helper_sve_fcmla_zpzzz_d,
+ };
+
+ if (a->esz == 0) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ unsigned desc;
+ TCGv_i32 t_desc;
+ TCGv_ptr pg = tcg_temp_new_ptr();
+
+ /* We would need 7 operands to pass these arguments "properly".
+ * So we encode all the register numbers into the descriptor.
+ */
+ desc = deposit32(a->rd, 5, 5, a->rn);
+ desc = deposit32(desc, 10, 5, a->rm);
+ desc = deposit32(desc, 15, 5, a->ra);
+ desc = deposit32(desc, 20, 2, a->rot);
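+ /* With rot in bits [21:20], the value can reach the sign bit of
+ * the 22-bit data field packed by simd_desc(); sign-extend it so
+ * the host value matches what the field will read back as.
+ */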
+ desc = sextract32(desc, 0, 22);
+ desc = simd_desc(vsz, vsz, desc);
+
+ t_desc = tcg_const_i32(desc);
+ tcg_gen_addi_ptr(pg, cpu_env, pred_full_reg_offset(s, a->pg));
+ fns[a->esz - 1](cpu_env, pg, t_desc);
+ tcg_temp_free_i32(t_desc);
+ tcg_temp_free_ptr(pg);
+ }
+ return true;
+}
+
+static bool trans_FCMLA_zzxz(DisasContext *s, arg_FCMLA_zzxz *a, uint32_t insn)
+{
+ static gen_helper_gvec_3_ptr * const fns[2] = {
+ gen_helper_gvec_fcmlah_idx,
+ gen_helper_gvec_fcmlas_idx,
+ };
+
+ tcg_debug_assert(a->esz == 1 || a->esz == 2);
+ tcg_debug_assert(a->rd == a->ra);
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16);
+ tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ vec_full_reg_offset(s, a->rm),
+ status, vsz, vsz,
+ a->index * 4 + a->rot,
+ fns[a->esz - 1]);
+ tcg_temp_free_ptr(status);
+ }
+ return true;
+}
+
+/*
+ *** SVE Floating Point Unary Operations Predicated Group
+ */
+
+static bool do_zpz_ptr(DisasContext *s, int rd, int rn, int pg,
+ bool is_fp16, gen_helper_gvec_3_ptr *fn)
+{
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr status = get_fpstatus_ptr(is_fp16);
+ tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ pred_full_reg_offset(s, pg),
+ status, vsz, vsz, 0, fn);
+ tcg_temp_free_ptr(status);
+ }
+ return true;
+}
+
+static bool trans_FCVT_sh(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvt_sh);
+}
+
+static bool trans_FCVT_hs(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvt_hs);
+}
+
+static bool trans_FCVT_dh(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvt_dh);
+}
+
+static bool trans_FCVT_hd(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvt_hd);
+}
+
+static bool trans_FCVT_ds(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvt_ds);
+}
+
+static bool trans_FCVT_sd(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvt_sd);
+}
+
+static bool trans_FCVTZS_hh(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvtzs_hh);
+}
+
+static bool trans_FCVTZU_hh(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvtzu_hh);
+}
+
+static bool trans_FCVTZS_hs(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvtzs_hs);
+}
+
+static bool trans_FCVTZU_hs(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvtzu_hs);
+}
+
+static bool trans_FCVTZS_hd(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvtzs_hd);
+}
+
+static bool trans_FCVTZU_hd(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvtzu_hd);
+}
+
+static bool trans_FCVTZS_ss(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzs_ss);
+}
+
+static bool trans_FCVTZU_ss(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzu_ss);
+}
+
+static bool trans_FCVTZS_sd(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzs_sd);
+}
+
+static bool trans_FCVTZU_sd(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzu_sd);
+}
+
+static bool trans_FCVTZS_ds(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzs_ds);
+}
+
+static bool trans_FCVTZU_ds(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzu_ds);
+}
+
+static bool trans_FCVTZS_dd(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzs_dd);
+}
+
+static bool trans_FCVTZU_dd(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzu_dd);
+}
+
+static gen_helper_gvec_3_ptr * const frint_fns[3] = {
+ gen_helper_sve_frint_h,
+ gen_helper_sve_frint_s,
+ gen_helper_sve_frint_d
+};
+
+static bool trans_FRINTI(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ if (a->esz == 0) {
+ return false;
+ }
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, a->esz == MO_16,
+ frint_fns[a->esz - 1]);
+}
+
+static bool trans_FRINTX(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ static gen_helper_gvec_3_ptr * const fns[3] = {
+ gen_helper_sve_frintx_h,
+ gen_helper_sve_frintx_s,
+ gen_helper_sve_frintx_d
+ };
+ if (a->esz == 0) {
+ return false;
+ }
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, a->esz == MO_16, fns[a->esz - 1]);
+}
+
+static bool do_frint_mode(DisasContext *s, arg_rpr_esz *a, int mode)
+{
+ if (a->esz == 0) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_i32 tmode = tcg_const_i32(mode);
+ TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16);
+
+ gen_helper_set_rmode(tmode, tmode, status);
+
+ tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ pred_full_reg_offset(s, a->pg),
+ status, vsz, vsz, 0, frint_fns[a->esz - 1]);
+
+ gen_helper_set_rmode(tmode, tmode, status);
+ tcg_temp_free_i32(tmode);
+ tcg_temp_free_ptr(status);
+ }
+ return true;
+}
+
+static bool trans_FRINTN(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_frint_mode(s, a, float_round_nearest_even);
+}
+
+static bool trans_FRINTP(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_frint_mode(s, a, float_round_up);
+}
+
+static bool trans_FRINTM(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_frint_mode(s, a, float_round_down);
+}
+
+static bool trans_FRINTZ(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_frint_mode(s, a, float_round_to_zero);
+}
+
+static bool trans_FRINTA(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_frint_mode(s, a, float_round_ties_away);
+}
+
+static bool trans_FRECPX(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ static gen_helper_gvec_3_ptr * const fns[3] = {
+ gen_helper_sve_frecpx_h,
+ gen_helper_sve_frecpx_s,
+ gen_helper_sve_frecpx_d
+ };
+ if (a->esz == 0) {
+ return false;
+ }
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, a->esz == MO_16, fns[a->esz - 1]);
+}
+
+static bool trans_FSQRT(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ static gen_helper_gvec_3_ptr * const fns[3] = {
+ gen_helper_sve_fsqrt_h,
+ gen_helper_sve_fsqrt_s,
+ gen_helper_sve_fsqrt_d
+ };
+ if (a->esz == 0) {
+ return false;
+ }
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, a->esz == MO_16, fns[a->esz - 1]);
+}
+
+static bool trans_SCVTF_hh(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_scvt_hh);
+}
+
+static bool trans_SCVTF_sh(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_scvt_sh);
+}
+
+static bool trans_SCVTF_dh(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_scvt_dh);
+}
+
+static bool trans_SCVTF_ss(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_scvt_ss);
+}
+
+static bool trans_SCVTF_ds(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_scvt_ds);
+}
+
+static bool trans_SCVTF_sd(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_scvt_sd);
+}
+
+static bool trans_SCVTF_dd(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_scvt_dd);
+}
+
+static bool trans_UCVTF_hh(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_ucvt_hh);
+}
+
+static bool trans_UCVTF_sh(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_ucvt_sh);
+}
+
+static bool trans_UCVTF_dh(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_ucvt_dh);
+}
+
+static bool trans_UCVTF_ss(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_ucvt_ss);
+}
+
+static bool trans_UCVTF_ds(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_ucvt_ds);
+}
+
+static bool trans_UCVTF_sd(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_ucvt_sd);
+}
+
+static bool trans_UCVTF_dd(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_ucvt_dd);
+}
+
+
/*
*** SVE Memory - 32-bit Gather and Unsized Contiguous Group
*/
@@ -3507,6 +4448,89 @@ static void do_ldr(DisasContext *s, uint32_t vofs, uint32_t len,
tcg_temp_free_i64(t0);
}
+/* Similarly for stores. */
+static void do_str(DisasContext *s, uint32_t vofs, uint32_t len,
+ int rn, int imm)
+{
+ uint32_t len_align = QEMU_ALIGN_DOWN(len, 8);
+ uint32_t len_remain = len % 8;
+ uint32_t nparts = len / 8 + ctpop8(len_remain);
+ int midx = get_mem_index(s);
+ TCGv_i64 addr, t0;
+
+ addr = tcg_temp_new_i64();
+ t0 = tcg_temp_new_i64();
+
+ /* Note that unpredicated load/store of vector/predicate registers
+ * are defined as a stream of bytes, which equates to little-endian
+ * operations on larger quantities. There is no nice way to force
+ * a little-endian store for aarch64_be-linux-user out of line.
+ *
+ * Attempt to keep code expansion to a minimum by limiting the
+ * amount of unrolling done.
+ */
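+ /* nparts counts the 8-byte stores plus the tail stores: ctpop8 of
+ * the remainder (0, 2, 4 or 6) equals the number of 2- or 4-byte
+ * operations the tail below will issue.
+ */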
+ if (nparts <= 4) {
+ int i;
+
+ for (i = 0; i < len_align; i += 8) {
+ tcg_gen_ld_i64(t0, cpu_env, vofs + i);
+ tcg_gen_addi_i64(addr, cpu_reg_sp(s, rn), imm + i);
+ tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEQ);
+ }
+ } else {
+ TCGLabel *loop = gen_new_label();
+ TCGv_ptr t2, i = tcg_const_local_ptr(0);
+
+ gen_set_label(loop);
+
+ t2 = tcg_temp_new_ptr();
+ tcg_gen_add_ptr(t2, cpu_env, i);
+ tcg_gen_ld_i64(t0, t2, vofs);
+
+ /* Minimize the number of local temps that must be re-read from
+ * the stack each iteration. Instead, re-compute values other
+ * than the loop counter.
+ */
+ tcg_gen_addi_ptr(t2, i, imm);
+ tcg_gen_extu_ptr_i64(addr, t2);
+ tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, rn));
+ tcg_temp_free_ptr(t2);
+
+ tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEQ);
+
+ tcg_gen_addi_ptr(i, i, 8);
+
+ tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop);
+ tcg_temp_free_ptr(i);
+ }
+
+ /* Predicate register stores can be any multiple of 2. */
+ if (len_remain) {
+ tcg_gen_ld_i64(t0, cpu_env, vofs + len_align);
+ tcg_gen_addi_i64(addr, cpu_reg_sp(s, rn), imm + len_align);
+
+ switch (len_remain) {
+ case 2:
+ case 4:
+ case 8:
+ tcg_gen_qemu_st_i64(t0, addr, midx, MO_LE | ctz32(len_remain));
+ break;
+
+ case 6:
+ tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEUL);
+ tcg_gen_addi_i64(addr, addr, 4);
+ tcg_gen_shri_i64(t0, t0, 32);
+ tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEUW);
+ break;
+
+ default:
+ g_assert_not_reached();
+ }
+ }
+ tcg_temp_free_i64(addr);
+ tcg_temp_free_i64(t0);
+}
+
static bool trans_LDR_zri(DisasContext *s, arg_rri *a, uint32_t insn)
{
if (sve_access_check(s)) {
@@ -3526,3 +4550,665 @@ static bool trans_LDR_pri(DisasContext *s, arg_rri *a, uint32_t insn)
}
return true;
}
+
+static bool trans_STR_zri(DisasContext *s, arg_rri *a, uint32_t insn)
+{
+ if (sve_access_check(s)) {
+ int size = vec_full_reg_size(s);
+ int off = vec_full_reg_offset(s, a->rd);
+ do_str(s, off, size, a->rn, a->imm * size);
+ }
+ return true;
+}
+
+static bool trans_STR_pri(DisasContext *s, arg_rri *a, uint32_t insn)
+{
+ if (sve_access_check(s)) {
+ int size = pred_full_reg_size(s);
+ int off = pred_full_reg_offset(s, a->rd);
+ do_str(s, off, size, a->rn, a->imm * size);
+ }
+ return true;
+}
+
+/*
+ *** SVE Memory - Contiguous Load Group
+ */
+
+/* The memory mode of the dtype. */
+static const TCGMemOp dtype_mop[16] = {
+ MO_UB, MO_UB, MO_UB, MO_UB,
+ MO_SL, MO_UW, MO_UW, MO_UW,
+ MO_SW, MO_SW, MO_UL, MO_UL,
+ MO_SB, MO_SB, MO_SB, MO_Q
+};
+
+#define dtype_msz(x) (dtype_mop[x] & MO_SIZE)
+
+/* The vector element size of dtype. */
+static const uint8_t dtype_esz[16] = {
+ 0, 1, 2, 3,
+ 3, 1, 2, 3,
+ 3, 2, 2, 3,
+ 3, 2, 1, 3
+};
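+/* For example, dtype 6 pairs MO_UW with element size 2: each unsigned
+ * halfword is loaded and zero-extended into a 32-bit element, which is
+ * the ld1hsu case in the helper tables below.
+ */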
+
+static void do_mem_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr,
+ gen_helper_gvec_mem *fn)
+{
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr t_pg;
+ TCGv_i32 desc;
+
+ /* For e.g. LD4, there are not enough arguments to pass all 4
+ * registers as pointers, so encode the regno into the data field.
+ * For consistency, do this even for LD1.
+ */
+ desc = tcg_const_i32(simd_desc(vsz, vsz, zt));
+ t_pg = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg));
+ fn(cpu_env, t_pg, addr, desc);
+
+ tcg_temp_free_ptr(t_pg);
+ tcg_temp_free_i32(desc);
+}
+
+static void do_ld_zpa(DisasContext *s, int zt, int pg,
+ TCGv_i64 addr, int dtype, int nreg)
+{
+ static gen_helper_gvec_mem * const fns[16][4] = {
+ { gen_helper_sve_ld1bb_r, gen_helper_sve_ld2bb_r,
+ gen_helper_sve_ld3bb_r, gen_helper_sve_ld4bb_r },
+ { gen_helper_sve_ld1bhu_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1bsu_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1bdu_r, NULL, NULL, NULL },
+
+ { gen_helper_sve_ld1sds_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1hh_r, gen_helper_sve_ld2hh_r,
+ gen_helper_sve_ld3hh_r, gen_helper_sve_ld4hh_r },
+ { gen_helper_sve_ld1hsu_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1hdu_r, NULL, NULL, NULL },
+
+ { gen_helper_sve_ld1hds_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1hss_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1ss_r, gen_helper_sve_ld2ss_r,
+ gen_helper_sve_ld3ss_r, gen_helper_sve_ld4ss_r },
+ { gen_helper_sve_ld1sdu_r, NULL, NULL, NULL },
+
+ { gen_helper_sve_ld1bds_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1bss_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1bhs_r, NULL, NULL, NULL },
+ { gen_helper_sve_ld1dd_r, gen_helper_sve_ld2dd_r,
+ gen_helper_sve_ld3dd_r, gen_helper_sve_ld4dd_r },
+ };
+ gen_helper_gvec_mem *fn = fns[dtype][nreg];
+
+ /* While there are holes in the table, they are not
+ * accessible via the instruction encoding.
+ */
+ assert(fn != NULL);
+ do_mem_zpa(s, zt, pg, addr, fn);
+}
+
+static bool trans_LD_zprr(DisasContext *s, arg_rprr_load *a, uint32_t insn)
+{
+ if (a->rm == 31) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ TCGv_i64 addr = new_tmp_a64(s);
+ tcg_gen_muli_i64(addr, cpu_reg(s, a->rm),
+ (a->nreg + 1) << dtype_msz(a->dtype));
+ tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));
+ do_ld_zpa(s, a->rd, a->pg, addr, a->dtype, a->nreg);
+ }
+ return true;
+}
+
+static bool trans_LD_zpri(DisasContext *s, arg_rpri_load *a, uint32_t insn)
+{
+ if (sve_access_check(s)) {
+ int vsz = vec_full_reg_size(s);
+ int elements = vsz >> dtype_esz[a->dtype];
+ TCGv_i64 addr = new_tmp_a64(s);
+
+ tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn),
+ (a->imm * elements * (a->nreg + 1))
+ << dtype_msz(a->dtype));
+ do_ld_zpa(s, a->rd, a->pg, addr, a->dtype, a->nreg);
+ }
+ return true;
+}
+
+static bool trans_LDFF1_zprr(DisasContext *s, arg_rprr_load *a, uint32_t insn)
+{
+ static gen_helper_gvec_mem * const fns[16] = {
+ gen_helper_sve_ldff1bb_r,
+ gen_helper_sve_ldff1bhu_r,
+ gen_helper_sve_ldff1bsu_r,
+ gen_helper_sve_ldff1bdu_r,
+
+ gen_helper_sve_ldff1sds_r,
+ gen_helper_sve_ldff1hh_r,
+ gen_helper_sve_ldff1hsu_r,
+ gen_helper_sve_ldff1hdu_r,
+
+ gen_helper_sve_ldff1hds_r,
+ gen_helper_sve_ldff1hss_r,
+ gen_helper_sve_ldff1ss_r,
+ gen_helper_sve_ldff1sdu_r,
+
+ gen_helper_sve_ldff1bds_r,
+ gen_helper_sve_ldff1bss_r,
+ gen_helper_sve_ldff1bhs_r,
+ gen_helper_sve_ldff1dd_r,
+ };
+
+ if (sve_access_check(s)) {
+ TCGv_i64 addr = new_tmp_a64(s);
+ tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), dtype_msz(a->dtype));
+ tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));
+ do_mem_zpa(s, a->rd, a->pg, addr, fns[a->dtype]);
+ }
+ return true;
+}
+
+static bool trans_LDNF1_zpri(DisasContext *s, arg_rpri_load *a, uint32_t insn)
+{
+ static gen_helper_gvec_mem * const fns[16] = {
+ gen_helper_sve_ldnf1bb_r,
+ gen_helper_sve_ldnf1bhu_r,
+ gen_helper_sve_ldnf1bsu_r,
+ gen_helper_sve_ldnf1bdu_r,
+
+ gen_helper_sve_ldnf1sds_r,
+ gen_helper_sve_ldnf1hh_r,
+ gen_helper_sve_ldnf1hsu_r,
+ gen_helper_sve_ldnf1hdu_r,
+
+ gen_helper_sve_ldnf1hds_r,
+ gen_helper_sve_ldnf1hss_r,
+ gen_helper_sve_ldnf1ss_r,
+ gen_helper_sve_ldnf1sdu_r,
+
+ gen_helper_sve_ldnf1bds_r,
+ gen_helper_sve_ldnf1bss_r,
+ gen_helper_sve_ldnf1bhs_r,
+ gen_helper_sve_ldnf1dd_r,
+ };
+
+ if (sve_access_check(s)) {
+ int vsz = vec_full_reg_size(s);
+ int elements = vsz >> dtype_esz[a->dtype];
+ int off = (a->imm * elements) << dtype_msz(a->dtype);
+ TCGv_i64 addr = new_tmp_a64(s);
+
+ tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), off);
+ do_mem_zpa(s, a->rd, a->pg, addr, fns[a->dtype]);
+ }
+ return true;
+}
+
+static void do_ldrq(DisasContext *s, int zt, int pg, TCGv_i64 addr, int msz)
+{
+ static gen_helper_gvec_mem * const fns[4] = {
+ gen_helper_sve_ld1bb_r, gen_helper_sve_ld1hh_r,
+ gen_helper_sve_ld1ss_r, gen_helper_sve_ld1dd_r,
+ };
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_ptr t_pg;
+ TCGv_i32 desc;
+
+ /* Load the first quadword using the normal predicated load helpers. */
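+ /* Passing 16 for both oprsz and maxsz clamps the helper to exactly
+ * one quadword, whatever the full vector length.
+ */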
+ desc = tcg_const_i32(simd_desc(16, 16, zt));
+ t_pg = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg));
+ fns[msz](cpu_env, t_pg, addr, desc);
+
+ tcg_temp_free_ptr(t_pg);
+ tcg_temp_free_i32(desc);
+
+ /* Replicate that first quadword. */
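+ /* vece = 4 means a 2**4 = 16 byte granule, so dup_mem copies that
+ * quadword across the remainder of the register.
+ */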
+ if (vsz > 16) {
+ unsigned dofs = vec_full_reg_offset(s, zt);
+ tcg_gen_gvec_dup_mem(4, dofs + 16, dofs, vsz - 16, vsz - 16);
+ }
+}
+
+static bool trans_LD1RQ_zprr(DisasContext *s, arg_rprr_load *a, uint32_t insn)
+{
+ if (a->rm == 31) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ int msz = dtype_msz(a->dtype);
+ TCGv_i64 addr = new_tmp_a64(s);
+ tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), msz);
+ tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));
+ do_ldrq(s, a->rd, a->pg, addr, msz);
+ }
+ return true;
+}
+
+static bool trans_LD1RQ_zpri(DisasContext *s, arg_rpri_load *a, uint32_t insn)
+{
+ if (sve_access_check(s)) {
+ TCGv_i64 addr = new_tmp_a64(s);
+ tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), a->imm * 16);
+ do_ldrq(s, a->rd, a->pg, addr, dtype_msz(a->dtype));
+ }
+ return true;
+}
+
+/* Load and broadcast element. */
+static bool trans_LD1R_zpri(DisasContext *s, arg_rpri_load *a, uint32_t insn)
+{
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ unsigned vsz = vec_full_reg_size(s);
+ unsigned psz = pred_full_reg_size(s);
+ unsigned esz = dtype_esz[a->dtype];
+ TCGLabel *over = gen_new_label();
+ TCGv_i64 temp;
+
+ /* If the guarding predicate has no bits set, no load occurs. */
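+ /* A predicate register holds one bit per vector byte, so for
+ * vectors of up to 64 bytes (psz <= 8) the whole predicate fits
+ * in a single 64-bit load and can be tested directly.
+ */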
+ if (psz <= 8) {
+ /* Reduce the pred_esz_masks value simply to reduce the
+ * size of the code generated here.
+ */
+ uint64_t psz_mask = MAKE_64BIT_MASK(0, psz * 8);
+ temp = tcg_temp_new_i64();
+ tcg_gen_ld_i64(temp, cpu_env, pred_full_reg_offset(s, a->pg));
+ tcg_gen_andi_i64(temp, temp, pred_esz_masks[esz] & psz_mask);
+ tcg_gen_brcondi_i64(TCG_COND_EQ, temp, 0, over);
+ tcg_temp_free_i64(temp);
+ } else {
+ TCGv_i32 t32 = tcg_temp_new_i32();
+ find_last_active(s, t32, esz, a->pg);
+ tcg_gen_brcondi_i32(TCG_COND_LT, t32, 0, over);
+ tcg_temp_free_i32(t32);
+ }
+
+ /* Load the data. */
+ temp = tcg_temp_new_i64();
+ tcg_gen_addi_i64(temp, cpu_reg_sp(s, a->rn), a->imm << esz);
+ tcg_gen_qemu_ld_i64(temp, temp, get_mem_index(s),
+ s->be_data | dtype_mop[a->dtype]);
+
+ /* Broadcast to *all* elements. */
+ tcg_gen_gvec_dup_i64(esz, vec_full_reg_offset(s, a->rd),
+ vsz, vsz, temp);
+ tcg_temp_free_i64(temp);
+
+ /* Zero the inactive elements. */
+ gen_set_label(over);
+ do_movz_zpz(s, a->rd, a->rd, a->pg, esz);
+ return true;
+}
+
+static void do_st_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr,
+ int msz, int esz, int nreg)
+{
+ static gen_helper_gvec_mem * const fn_single[4][4] = {
+ { gen_helper_sve_st1bb_r, gen_helper_sve_st1bh_r,
+ gen_helper_sve_st1bs_r, gen_helper_sve_st1bd_r },
+ { NULL, gen_helper_sve_st1hh_r,
+ gen_helper_sve_st1hs_r, gen_helper_sve_st1hd_r },
+ { NULL, NULL,
+ gen_helper_sve_st1ss_r, gen_helper_sve_st1sd_r },
+ { NULL, NULL, NULL, gen_helper_sve_st1dd_r },
+ };
+ static gen_helper_gvec_mem * const fn_multiple[3][4] = {
+ { gen_helper_sve_st2bb_r, gen_helper_sve_st2hh_r,
+ gen_helper_sve_st2ss_r, gen_helper_sve_st2dd_r },
+ { gen_helper_sve_st3bb_r, gen_helper_sve_st3hh_r,
+ gen_helper_sve_st3ss_r, gen_helper_sve_st3dd_r },
+ { gen_helper_sve_st4bb_r, gen_helper_sve_st4hh_r,
+ gen_helper_sve_st4ss_r, gen_helper_sve_st4dd_r },
+ };
+ gen_helper_gvec_mem *fn;
+
+ if (nreg == 0) {
+ /* ST1 */
+ fn = fn_single[msz][esz];
+ } else {
+ /* ST2, ST3, ST4 -- msz == esz, enforced by encoding */
+ assert(msz == esz);
+ fn = fn_multiple[nreg - 1][msz];
+ }
+ assert(fn != NULL);
+ do_mem_zpa(s, zt, pg, addr, fn);
+}
+
+static bool trans_ST_zprr(DisasContext *s, arg_rprr_store *a, uint32_t insn)
+{
+ if (a->rm == 31 || a->msz > a->esz) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ TCGv_i64 addr = new_tmp_a64(s);
+ tcg_gen_muli_i64(addr, cpu_reg(s, a->rm), (a->nreg + 1) << a->msz);
+ tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn));
+ do_st_zpa(s, a->rd, a->pg, addr, a->msz, a->esz, a->nreg);
+ }
+ return true;
+}
+
+static bool trans_ST_zpri(DisasContext *s, arg_rpri_store *a, uint32_t insn)
+{
+ if (a->msz > a->esz) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ int vsz = vec_full_reg_size(s);
+ int elements = vsz >> a->esz;
+ TCGv_i64 addr = new_tmp_a64(s);
+
+ tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn),
+ (a->imm * elements * (a->nreg + 1)) << a->msz);
+ do_st_zpa(s, a->rd, a->pg, addr, a->msz, a->esz, a->nreg);
+ }
+ return true;
+}
+
+/*
+ *** SVE gather loads / scatter stores
+ */
+
+static void do_mem_zpz(DisasContext *s, int zt, int pg, int zm, int scale,
+ TCGv_i64 scalar, gen_helper_gvec_mem_scatter *fn)
+{
+ unsigned vsz = vec_full_reg_size(s);
+ TCGv_i32 desc = tcg_const_i32(simd_desc(vsz, vsz, scale));
+ TCGv_ptr t_zm = tcg_temp_new_ptr();
+ TCGv_ptr t_pg = tcg_temp_new_ptr();
+ TCGv_ptr t_zt = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg));
+ tcg_gen_addi_ptr(t_zm, cpu_env, vec_full_reg_offset(s, zm));
+ tcg_gen_addi_ptr(t_zt, cpu_env, vec_full_reg_offset(s, zt));
+ fn(cpu_env, t_zt, t_pg, t_zm, scalar, desc);
+
+ tcg_temp_free_ptr(t_zt);
+ tcg_temp_free_ptr(t_zm);
+ tcg_temp_free_ptr(t_pg);
+ tcg_temp_free_i32(desc);
+}
+
+/* Indexed by [ff][xs][u][msz], where ff selects the first-fault
+ * variant, xs the offset extension, u the zero-extending (unsigned)
+ * load, and msz the memory element size.
+ */
+static gen_helper_gvec_mem_scatter * const gather_load_fn32[2][2][2][3] = {
+ { { { gen_helper_sve_ldbss_zsu,
+ gen_helper_sve_ldhss_zsu,
+ NULL, },
+ { gen_helper_sve_ldbsu_zsu,
+ gen_helper_sve_ldhsu_zsu,
+ gen_helper_sve_ldssu_zsu, } },
+ { { gen_helper_sve_ldbss_zss,
+ gen_helper_sve_ldhss_zss,
+ NULL, },
+ { gen_helper_sve_ldbsu_zss,
+ gen_helper_sve_ldhsu_zss,
+ gen_helper_sve_ldssu_zss, } } },
+
+ { { { gen_helper_sve_ldffbss_zsu,
+ gen_helper_sve_ldffhss_zsu,
+ NULL, },
+ { gen_helper_sve_ldffbsu_zsu,
+ gen_helper_sve_ldffhsu_zsu,
+ gen_helper_sve_ldffssu_zsu, } },
+ { { gen_helper_sve_ldffbss_zss,
+ gen_helper_sve_ldffhss_zss,
+ NULL, },
+ { gen_helper_sve_ldffbsu_zss,
+ gen_helper_sve_ldffhsu_zss,
+ gen_helper_sve_ldffssu_zss, } } }
+};
+
+/* Note that we overload xs=2 to indicate 64-bit offset. */
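+/* Here xs = 0 means 32-bit offsets zero-extended (_zsu), xs = 1 means
+ * 32-bit offsets sign-extended (_zss), and xs = 2 the true 64-bit
+ * offsets (_zd).
+ */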
+static gen_helper_gvec_mem_scatter * const gather_load_fn64[2][3][2][4] = {
+ { { { gen_helper_sve_ldbds_zsu,
+ gen_helper_sve_ldhds_zsu,
+ gen_helper_sve_ldsds_zsu,
+ NULL, },
+ { gen_helper_sve_ldbdu_zsu,
+ gen_helper_sve_ldhdu_zsu,
+ gen_helper_sve_ldsdu_zsu,
+ gen_helper_sve_ldddu_zsu, } },
+ { { gen_helper_sve_ldbds_zss,
+ gen_helper_sve_ldhds_zss,
+ gen_helper_sve_ldsds_zss,
+ NULL, },
+ { gen_helper_sve_ldbdu_zss,
+ gen_helper_sve_ldhdu_zss,
+ gen_helper_sve_ldsdu_zss,
+ gen_helper_sve_ldddu_zss, } },
+ { { gen_helper_sve_ldbds_zd,
+ gen_helper_sve_ldhds_zd,
+ gen_helper_sve_ldsds_zd,
+ NULL, },
+ { gen_helper_sve_ldbdu_zd,
+ gen_helper_sve_ldhdu_zd,
+ gen_helper_sve_ldsdu_zd,
+ gen_helper_sve_ldddu_zd, } } },
+
+ { { { gen_helper_sve_ldffbds_zsu,
+ gen_helper_sve_ldffhds_zsu,
+ gen_helper_sve_ldffsds_zsu,
+ NULL, },
+ { gen_helper_sve_ldffbdu_zsu,
+ gen_helper_sve_ldffhdu_zsu,
+ gen_helper_sve_ldffsdu_zsu,
+ gen_helper_sve_ldffddu_zsu, } },
+ { { gen_helper_sve_ldffbds_zss,
+ gen_helper_sve_ldffhds_zss,
+ gen_helper_sve_ldffsds_zss,
+ NULL, },
+ { gen_helper_sve_ldffbdu_zss,
+ gen_helper_sve_ldffhdu_zss,
+ gen_helper_sve_ldffsdu_zss,
+ gen_helper_sve_ldffddu_zss, } },
+ { { gen_helper_sve_ldffbds_zd,
+ gen_helper_sve_ldffhds_zd,
+ gen_helper_sve_ldffsds_zd,
+ NULL, },
+ { gen_helper_sve_ldffbdu_zd,
+ gen_helper_sve_ldffhdu_zd,
+ gen_helper_sve_ldffsdu_zd,
+ gen_helper_sve_ldffddu_zd, } } }
+};
+
+static bool trans_LD1_zprz(DisasContext *s, arg_LD1_zprz *a, uint32_t insn)
+{
+ gen_helper_gvec_mem_scatter *fn = NULL;
+
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ switch (a->esz) {
+ case MO_32:
+ fn = gather_load_fn32[a->ff][a->xs][a->u][a->msz];
+ break;
+ case MO_64:
+ fn = gather_load_fn64[a->ff][a->xs][a->u][a->msz];
+ break;
+ }
+ assert(fn != NULL);
+
+ do_mem_zpz(s, a->rd, a->pg, a->rm, a->scale * a->msz,
+ cpu_reg_sp(s, a->rn), fn);
+ return true;
+}
+
+static bool trans_LD1_zpiz(DisasContext *s, arg_LD1_zpiz *a, uint32_t insn)
+{
+ gen_helper_gvec_mem_scatter *fn = NULL;
+ TCGv_i64 imm;
+
+ if (a->esz < a->msz || (a->esz == a->msz && !a->u)) {
+ return false;
+ }
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ switch (a->esz) {
+ case MO_32:
+ fn = gather_load_fn32[a->ff][0][a->u][a->msz];
+ break;
+ case MO_64:
+ fn = gather_load_fn64[a->ff][2][a->u][a->msz];
+ break;
+ }
+ assert(fn != NULL);
+
+ /* Treat LD1_zpiz (zn[x] + imm) the same way as LD1_zprz (rn + zm[x])
+ * by loading the immediate into the scalar parameter.
+ */
+ imm = tcg_const_i64(a->imm << a->msz);
+ do_mem_zpz(s, a->rd, a->pg, a->rn, 0, imm, fn);
+ tcg_temp_free_i64(imm);
+ return true;
+}
+
+/* Indexed by [xs][msz]. */
+static gen_helper_gvec_mem_scatter * const scatter_store_fn32[2][3] = {
+ { gen_helper_sve_stbs_zsu,
+ gen_helper_sve_sths_zsu,
+ gen_helper_sve_stss_zsu, },
+ { gen_helper_sve_stbs_zss,
+ gen_helper_sve_sths_zss,
+ gen_helper_sve_stss_zss, },
+};
+
+/* Note that we overload xs=2 to indicate 64-bit offset. */
+static gen_helper_gvec_mem_scatter * const scatter_store_fn64[3][4] = {
+ { gen_helper_sve_stbd_zsu,
+ gen_helper_sve_sthd_zsu,
+ gen_helper_sve_stsd_zsu,
+ gen_helper_sve_stdd_zsu, },
+ { gen_helper_sve_stbd_zss,
+ gen_helper_sve_sthd_zss,
+ gen_helper_sve_stsd_zss,
+ gen_helper_sve_stdd_zss, },
+ { gen_helper_sve_stbd_zd,
+ gen_helper_sve_sthd_zd,
+ gen_helper_sve_stsd_zd,
+ gen_helper_sve_stdd_zd, },
+};
+
+static bool trans_ST1_zprz(DisasContext *s, arg_ST1_zprz *a, uint32_t insn)
+{
+ gen_helper_gvec_mem_scatter *fn;
+
+ if (a->esz < a->msz || (a->msz == 0 && a->scale)) {
+ return false;
+ }
+ if (!sve_access_check(s)) {
+ return true;
+ }
+ switch (a->esz) {
+ case MO_32:
+ fn = scatter_store_fn32[a->xs][a->msz];
+ break;
+ case MO_64:
+ fn = scatter_store_fn64[a->xs][a->msz];
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ do_mem_zpz(s, a->rd, a->pg, a->rm, a->scale * a->msz,
+ cpu_reg_sp(s, a->rn), fn);
+ return true;
+}
+
+static bool trans_ST1_zpiz(DisasContext *s, arg_ST1_zpiz *a, uint32_t insn)
+{
+ gen_helper_gvec_mem_scatter *fn = NULL;
+ TCGv_i64 imm;
+
+ if (a->esz < a->msz) {
+ return false;
+ }
+ if (!sve_access_check(s)) {
+ return true;
+ }
+
+ switch (a->esz) {
+ case MO_32:
+ fn = scatter_store_fn32[0][a->msz];
+ break;
+ case MO_64:
+ fn = scatter_store_fn64[2][a->msz];
+ break;
+ }
+ assert(fn != NULL);
+
+ /* Treat ST1_zpiz (zn[x] + imm) the same way as ST1_zprz (rn + zm[x])
+ * by loading the immediate into the scalar parameter.
+ */
+ imm = tcg_const_i64(a->imm << a->msz);
+ do_mem_zpz(s, a->rd, a->pg, a->rn, 0, imm, fn);
+ tcg_temp_free_i64(imm);
+ return true;
+}
+
+/*
+ * Prefetches
+ */
+
+static bool trans_PRF(DisasContext *s, arg_PRF *a, uint32_t insn)
+{
+ /* Prefetch is a nop within QEMU. */
+ sve_access_check(s);
+ return true;
+}
+
+static bool trans_PRF_rr(DisasContext *s, arg_PRF_rr *a, uint32_t insn)
+{
+ if (a->rm == 31) {
+ return false;
+ }
+ /* Prefetch is a nop within QEMU. */
+ sve_access_check(s);
+ return true;
+}
+
+/*
+ * Move Prefix
+ *
+ * TODO: The implementation so far could handle predicated merging movprfx.
+ * The helper functions as written take an extra source register to
+ * use in the operation, but the result is only written when predication
+ * succeeds. For unpredicated movprfx, we need to rearrange the helpers
+ * to allow the final write back to the destination to be unconditional.
+ * For predicated zeroing movprfx, we need to rearrange the helpers to
+ * allow the final write back to zero inactives.
+ *
+ * In the meantime, just emit the moves.
+ */
+
+static bool trans_MOVPRFX(DisasContext *s, arg_MOVPRFX *a, uint32_t insn)
+{
+ return do_mov_z(s, a->rd, a->rn);
+}
+
+static bool trans_MOVPRFX_m(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ if (sve_access_check(s)) {
+ do_sel_z(s, a->rd, a->rn, a->rd, a->pg, a->esz);
+ }
+ return true;
+}
+
+static bool trans_MOVPRFX_z(DisasContext *s, arg_rpr_esz *a, uint32_t insn)
+{
+ if (sve_access_check(s)) {
+ do_movz_zpz(s, a->rd, a->rn, a->pg, a->esz);
+ }
+ return true;
+}
diff --git a/target/arm/translate.c b/target/arm/translate.c
index 2a3e4f5d4c..f845da7c63 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -7762,9 +7762,10 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
*/
static int disas_neon_insn_3same_ext(DisasContext *s, uint32_t insn)
{
- gen_helper_gvec_3_ptr *fn_gvec_ptr;
- int rd, rn, rm, rot, size, opr_sz;
- TCGv_ptr fpst;
+ gen_helper_gvec_3 *fn_gvec = NULL;
+ gen_helper_gvec_3_ptr *fn_gvec_ptr = NULL;
+ int rd, rn, rm, opr_sz;
+ int data = 0;
bool q;
q = extract32(insn, 6, 1);
@@ -7777,8 +7778,8 @@ static int disas_neon_insn_3same_ext(DisasContext *s, uint32_t insn)
if ((insn & 0xfe200f10) == 0xfc200800) {
/* VCMLA -- 1111 110R R.1S .... .... 1000 ...0 .... */
- size = extract32(insn, 20, 1);
- rot = extract32(insn, 23, 2);
+ int size = extract32(insn, 20, 1);
+ data = extract32(insn, 23, 2); /* rot */
if (!arm_dc_feature(s, ARM_FEATURE_V8_FCMA)
|| (!size && !arm_dc_feature(s, ARM_FEATURE_V8_FP16))) {
return 1;
@@ -7786,13 +7787,20 @@ static int disas_neon_insn_3same_ext(DisasContext *s, uint32_t insn)
fn_gvec_ptr = size ? gen_helper_gvec_fcmlas : gen_helper_gvec_fcmlah;
} else if ((insn & 0xfea00f10) == 0xfc800800) {
/* VCADD -- 1111 110R 1.0S .... .... 1000 ...0 .... */
- size = extract32(insn, 20, 1);
- rot = extract32(insn, 24, 1);
+ int size = extract32(insn, 20, 1);
+ data = extract32(insn, 24, 1); /* rot */
if (!arm_dc_feature(s, ARM_FEATURE_V8_FCMA)
|| (!size && !arm_dc_feature(s, ARM_FEATURE_V8_FP16))) {
return 1;
}
fn_gvec_ptr = size ? gen_helper_gvec_fcadds : gen_helper_gvec_fcaddh;
+ } else if ((insn & 0xfeb00f00) == 0xfc200d00) {
+ /* V[US]DOT -- 1111 1100 0.10 .... .... 1101 .Q.U .... */
+ bool u = extract32(insn, 4, 1);
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_DOTPROD)) {
+ return 1;
+ }
+ fn_gvec = u ? gen_helper_gvec_udot_b : gen_helper_gvec_sdot_b;
} else {
return 1;
}
@@ -7807,12 +7815,19 @@ static int disas_neon_insn_3same_ext(DisasContext *s, uint32_t insn)
}
opr_sz = (1 + q) * 8;
- fpst = get_fpstatus_ptr(1);
- tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd),
- vfp_reg_offset(1, rn),
- vfp_reg_offset(1, rm), fpst,
- opr_sz, opr_sz, rot, fn_gvec_ptr);
- tcg_temp_free_ptr(fpst);
+ if (fn_gvec_ptr) {
+ TCGv_ptr fpst = get_fpstatus_ptr(1);
+ tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd),
+ vfp_reg_offset(1, rn),
+ vfp_reg_offset(1, rm), fpst,
+ opr_sz, opr_sz, data, fn_gvec_ptr);
+ tcg_temp_free_ptr(fpst);
+ } else {
+ tcg_gen_gvec_3_ool(vfp_reg_offset(1, rd),
+ vfp_reg_offset(1, rn),
+ vfp_reg_offset(1, rm),
+ opr_sz, opr_sz, data, fn_gvec);
+ }
return 0;
}
@@ -7826,26 +7841,52 @@ static int disas_neon_insn_3same_ext(DisasContext *s, uint32_t insn)
static int disas_neon_insn_2reg_scalar_ext(DisasContext *s, uint32_t insn)
{
- int rd, rn, rm, rot, size, opr_sz;
- TCGv_ptr fpst;
+ gen_helper_gvec_3 *fn_gvec = NULL;
+ gen_helper_gvec_3_ptr *fn_gvec_ptr = NULL;
+ int rd, rn, rm, opr_sz, data;
bool q;
q = extract32(insn, 6, 1);
VFP_DREG_D(rd, insn);
VFP_DREG_N(rn, insn);
- VFP_DREG_M(rm, insn);
if ((rd | rn) & q) {
return 1;
}
if ((insn & 0xff000f10) == 0xfe000800) {
/* VCMLA (indexed) -- 1111 1110 S.RR .... .... 1000 ...0 .... */
- rot = extract32(insn, 20, 2);
- size = extract32(insn, 23, 1);
- if (!arm_dc_feature(s, ARM_FEATURE_V8_FCMA)
- || (!size && !arm_dc_feature(s, ARM_FEATURE_V8_FP16))) {
+ int rot = extract32(insn, 20, 2);
+ int size = extract32(insn, 23, 1);
+ int index;
+
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_FCMA)) {
+ return 1;
+ }
+ if (size == 0) {
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_FP16)) {
+ return 1;
+ }
+ /* For fp16, rm is just Vm, and index is M. */
+ rm = extract32(insn, 0, 4);
+ index = extract32(insn, 5, 1);
+ } else {
+ /* For fp32, rm is the usual M:Vm, and index is 0. */
+ VFP_DREG_M(rm, insn);
+ index = 0;
+ }
+ data = (index << 2) | rot;
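+ /* The helper re-extracts rot from the low two bits and index from
+ * the bits above them; see gvec_fcmlah_idx.
+ */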
+ fn_gvec_ptr = (size ? gen_helper_gvec_fcmlas_idx
+ : gen_helper_gvec_fcmlah_idx);
+ } else if ((insn & 0xffb00f00) == 0xfe200d00) {
+ /* V[US]DOT -- 1111 1110 0.10 .... .... 1101 .Q.U .... */
+ int u = extract32(insn, 4, 1);
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_DOTPROD)) {
return 1;
}
+ fn_gvec = u ? gen_helper_gvec_udot_idx_b : gen_helper_gvec_sdot_idx_b;
+ /* rm is just Vm, and index is M. */
+ data = extract32(insn, 5, 1); /* index */
+ rm = extract32(insn, 0, 4);
} else {
return 1;
}
@@ -7860,14 +7901,19 @@ static int disas_neon_insn_2reg_scalar_ext(DisasContext *s, uint32_t insn)
}
opr_sz = (1 + q) * 8;
- fpst = get_fpstatus_ptr(1);
- tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd),
- vfp_reg_offset(1, rn),
- vfp_reg_offset(1, rm), fpst,
- opr_sz, opr_sz, rot,
- size ? gen_helper_gvec_fcmlas_idx
- : gen_helper_gvec_fcmlah_idx);
- tcg_temp_free_ptr(fpst);
+ if (fn_gvec_ptr) {
+ TCGv_ptr fpst = get_fpstatus_ptr(1);
+ tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd),
+ vfp_reg_offset(1, rn),
+ vfp_reg_offset(1, rm), fpst,
+ opr_sz, opr_sz, data, fn_gvec_ptr);
+ tcg_temp_free_ptr(fpst);
+ } else {
+ tcg_gen_gvec_3_ool(vfp_reg_offset(1, rd),
+ vfp_reg_offset(1, rn),
+ vfp_reg_offset(1, rm),
+ opr_sz, opr_sz, data, fn_gvec);
+ }
return 0;
}
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index f504dd53c8..37f338732e 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -194,6 +194,197 @@ void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
clear_tail(d, opr_sz, simd_maxsz(desc));
}
+/* Integer 8 and 16-bit dot-product.
+ *
+ * Note that for the loops herein, host endianness does not matter
+ * with respect to the ordering of data within the 64-bit lanes.
+ * All elements are treated equally, no matter where they are.
+ */
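+/* Each 32-bit (or, for the _h forms, 64-bit) result lane accumulates
+ * one four-element group, e.g. d[0] += n[0]*m[0] + n[1]*m[1]
+ * + n[2]*m[2] + n[3]*m[3].
+ */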
+
+void HELPER(gvec_sdot_b)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ uint32_t *d = vd;
+ int8_t *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 4; ++i) {
+ d[i] += n[i * 4 + 0] * m[i * 4 + 0]
+ + n[i * 4 + 1] * m[i * 4 + 1]
+ + n[i * 4 + 2] * m[i * 4 + 2]
+ + n[i * 4 + 3] * m[i * 4 + 3];
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_udot_b)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ uint32_t *d = vd;
+ uint8_t *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 4; ++i) {
+ d[i] += n[i * 4 + 0] * m[i * 4 + 0]
+ + n[i * 4 + 1] * m[i * 4 + 1]
+ + n[i * 4 + 2] * m[i * 4 + 2]
+ + n[i * 4 + 3] * m[i * 4 + 3];
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_sdot_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ uint64_t *d = vd;
+ int16_t *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 8; ++i) {
+ d[i] += (int64_t)n[i * 4 + 0] * m[i * 4 + 0]
+ + (int64_t)n[i * 4 + 1] * m[i * 4 + 1]
+ + (int64_t)n[i * 4 + 2] * m[i * 4 + 2]
+ + (int64_t)n[i * 4 + 3] * m[i * 4 + 3];
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_udot_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ uint64_t *d = vd;
+ uint16_t *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 8; ++i) {
+ d[i] += (uint64_t)n[i * 4 + 0] * m[i * 4 + 0]
+ + (uint64_t)n[i * 4 + 1] * m[i * 4 + 1]
+ + (uint64_t)n[i * 4 + 2] * m[i * 4 + 2]
+ + (uint64_t)n[i * 4 + 3] * m[i * 4 + 3];
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_sdot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4;
+ intptr_t index = simd_data(desc);
+ uint32_t *d = vd;
+ int8_t *n = vn;
+ int8_t *m_indexed = (int8_t *)vm + index * 4;
+
+ /* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd.
+ * Otherwise opr_sz is a multiple of 16.
+ */
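+ /* The four m values are fetched once per 16-byte segment (or once
+ * in all for the 8-byte case); the inner loop then walks the lanes
+ * of that segment.
+ */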
+ segend = MIN(4, opr_sz_4);
+ i = 0;
+ do {
+ int8_t m0 = m_indexed[i * 4 + 0];
+ int8_t m1 = m_indexed[i * 4 + 1];
+ int8_t m2 = m_indexed[i * 4 + 2];
+ int8_t m3 = m_indexed[i * 4 + 3];
+
+ do {
+ d[i] += n[i * 4 + 0] * m0
+ + n[i * 4 + 1] * m1
+ + n[i * 4 + 2] * m2
+ + n[i * 4 + 3] * m3;
+ } while (++i < segend);
+ segend = i + 4;
+ } while (i < opr_sz_4);
+
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_udot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4;
+ intptr_t index = simd_data(desc);
+ uint32_t *d = vd;
+ uint8_t *n = vn;
+ uint8_t *m_indexed = (uint8_t *)vm + index * 4;
+
+ /* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd.
+ * Otherwise opr_sz is a multiple of 16.
+ */
+ segend = MIN(4, opr_sz_4);
+ i = 0;
+ do {
+ uint8_t m0 = m_indexed[i * 4 + 0];
+ uint8_t m1 = m_indexed[i * 4 + 1];
+ uint8_t m2 = m_indexed[i * 4 + 2];
+ uint8_t m3 = m_indexed[i * 4 + 3];
+
+ do {
+ d[i] += n[i * 4 + 0] * m0
+ + n[i * 4 + 1] * m1
+ + n[i * 4 + 2] * m2
+ + n[i * 4 + 3] * m3;
+ } while (++i < segend);
+ segend = i + 4;
+ } while (i < opr_sz_4);
+
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_sdot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8;
+ intptr_t index = simd_data(desc);
+ uint64_t *d = vd;
+ int16_t *n = vn;
+ int16_t *m_indexed = (int16_t *)vm + index * 4;
+
+ /* This is supported by SVE only, so opr_sz is always a multiple of 16.
+ * Process the entire segment all at once, writing back the results
+ * only after we've consumed all of the inputs.
+ */
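+ /* vd may alias vn or vm, so buffer each segment's sums in d0/d1
+ * and write them back only once the inputs are consumed.
+ */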
+ for (i = 0; i < opr_sz_8; i += 2) {
+ uint64_t d0, d1;
+
+ d0 = n[i * 4 + 0] * (int64_t)m_indexed[i * 4 + 0];
+ d0 += n[i * 4 + 1] * (int64_t)m_indexed[i * 4 + 1];
+ d0 += n[i * 4 + 2] * (int64_t)m_indexed[i * 4 + 2];
+ d0 += n[i * 4 + 3] * (int64_t)m_indexed[i * 4 + 3];
+ d1 = n[i * 4 + 4] * (int64_t)m_indexed[i * 4 + 0];
+ d1 += n[i * 4 + 5] * (int64_t)m_indexed[i * 4 + 1];
+ d1 += n[i * 4 + 6] * (int64_t)m_indexed[i * 4 + 2];
+ d1 += n[i * 4 + 7] * (int64_t)m_indexed[i * 4 + 3];
+
+ d[i + 0] += d0;
+ d[i + 1] += d1;
+ }
+
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_udot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8;
+ intptr_t index = simd_data(desc);
+ uint64_t *d = vd;
+ uint16_t *n = vn;
+ uint16_t *m_indexed = (uint16_t *)vm + index * 4;
+
+ /* This is supported by SVE only, so opr_sz is always a multiple of 16.
+ * Process the entire segment all at once, writing back the results
+ * only after we've consumed all of the inputs.
+ */
+ for (i = 0; i < opr_sz_8; i += 2) {
+ uint64_t d0, d1;
+
+ d0 = n[i * 4 + 0] * (uint64_t)m_indexed[i * 4 + 0];
+ d0 += n[i * 4 + 1] * (uint64_t)m_indexed[i * 4 + 1];
+ d0 += n[i * 4 + 2] * (uint64_t)m_indexed[i * 4 + 2];
+ d0 += n[i * 4 + 3] * (uint64_t)m_indexed[i * 4 + 3];
+ d1 = n[i * 4 + 4] * (uint64_t)m_indexed[i * 4 + 0];
+ d1 += n[i * 4 + 5] * (uint64_t)m_indexed[i * 4 + 1];
+ d1 += n[i * 4 + 6] * (uint64_t)m_indexed[i * 4 + 2];
+ d1 += n[i * 4 + 7] * (uint64_t)m_indexed[i * 4 + 3];
+
+ d[i + 0] += d0;
+ d[i + 1] += d1;
+ }
+
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
void *vfpst, uint32_t desc)
{
@@ -317,23 +508,29 @@ void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm,
float_status *fpst = vfpst;
intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+ intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
uint32_t neg_real = flip ^ neg_imag;
- uintptr_t i;
- float16 e1 = m[H2(flip)];
- float16 e3 = m[H2(1 - flip)];
+ intptr_t elements = opr_sz / sizeof(float16);
+ intptr_t eltspersegment = 16 / sizeof(float16);
+ intptr_t i, j;
/* Shift boolean to the sign bit so we can xor to negate. */
neg_real <<= 15;
neg_imag <<= 15;
- e1 ^= neg_real;
- e3 ^= neg_imag;
- for (i = 0; i < opr_sz / 2; i += 2) {
- float16 e2 = n[H2(i + flip)];
- float16 e4 = e2;
+ for (i = 0; i < elements; i += eltspersegment) {
+ float16 mr = m[H2(i + 2 * index + 0)];
+ float16 mi = m[H2(i + 2 * index + 1)];
+ float16 e1 = neg_real ^ (flip ? mi : mr);
+ float16 e3 = neg_imag ^ (flip ? mr : mi);
- d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst);
- d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst);
+ for (j = i; j < i + eltspersegment; j += 2) {
+ float16 e2 = n[H2(j + flip)];
+ float16 e4 = e2;
+
+ d[H2(j)] = float16_muladd(e2, e1, d[H2(j)], 0, fpst);
+ d[H2(j + 1)] = float16_muladd(e4, e3, d[H2(j + 1)], 0, fpst);
+ }
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
@@ -377,23 +574,29 @@ void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm,
float_status *fpst = vfpst;
intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+ intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
uint32_t neg_real = flip ^ neg_imag;
- uintptr_t i;
- float32 e1 = m[H4(flip)];
- float32 e3 = m[H4(1 - flip)];
+ intptr_t elements = opr_sz / sizeof(float32);
+ intptr_t eltspersegment = 16 / sizeof(float32);
+ intptr_t i, j;
/* Shift boolean to the sign bit so we can xor to negate. */
neg_real <<= 31;
neg_imag <<= 31;
- e1 ^= neg_real;
- e3 ^= neg_imag;
- for (i = 0; i < opr_sz / 4; i += 2) {
- float32 e2 = n[H4(i + flip)];
- float32 e4 = e2;
+ for (i = 0; i < elements; i += eltspersegment) {
+ float32 mr = m[H4(i + 2 * index + 0)];
+ float32 mi = m[H4(i + 2 * index + 1)];
+ float32 e1 = neg_real ^ (flip ? mi : mr);
+ float32 e3 = neg_imag ^ (flip ? mr : mi);
- d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst);
- d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst);
+ for (j = i; j < i + eltspersegment; j += 2) {
+ float32 e2 = n[H4(j + flip)];
+ float32 e4 = e2;
+
+ d[H4(j)] = float32_muladd(e2, e1, d[H4(j)], 0, fpst);
+ d[H4(j + 1)] = float32_muladd(e4, e3, d[H4(j + 1)], 0, fpst);
+ }
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
@@ -427,6 +630,26 @@ void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm,
clear_tail(d, opr_sz, simd_maxsz(desc));
}
+#define DO_2OP(NAME, FUNC, TYPE) \
+void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ TYPE *d = vd, *n = vn; \
+ for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
+ d[i] = FUNC(n[i], stat); \
+ } \
+}
+
+DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
+DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
+DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
+
+DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
+DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
+DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
+
+#undef DO_2OP
+
/* Floating-point trigonometric starting value.
* See the ARM ARM pseudocode function FPTrigSMul.
*/
@@ -495,3 +718,51 @@ DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
#endif
#undef DO_3OP
+
+/* For the indexed ops, SVE applies the index per 128-bit vector segment.
+ * For AdvSIMD, there is of course only one such vector segment.
+ */
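+/* For example, with float32 a segment is four elements, so the scalar
+ * is re-fetched as m[H(i + idx)] at the start of every 16-byte chunk
+ * in the expansions below.
+ */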
+
+#define DO_MUL_IDX(NAME, TYPE, H) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
+{ \
+ intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
+ intptr_t idx = simd_data(desc); \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
+ TYPE mm = m[H(i + idx)]; \
+ for (j = 0; j < segment; j++) { \
+ d[i + j] = TYPE##_mul(n[i + j], mm, stat); \
+ } \
+ } \
+}
+
+DO_MUL_IDX(gvec_fmul_idx_h, float16, H2)
+DO_MUL_IDX(gvec_fmul_idx_s, float32, H4)
+DO_MUL_IDX(gvec_fmul_idx_d, float64, )
+
+#undef DO_MUL_IDX
+
+#define DO_FMLA_IDX(NAME, TYPE, H) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
+ void *stat, uint32_t desc) \
+{ \
+ intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
+ TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \
+ intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \
+ TYPE *d = vd, *n = vn, *m = vm, *a = va; \
+ op1_neg <<= (8 * sizeof(TYPE) - 1); \
+ for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
+ TYPE mm = m[H(i + idx)]; \
+ for (j = 0; j < segment; j++) { \
+ d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \
+ mm, a[i + j], 0, stat); \
+ } \
+ } \
+}
+
+DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
+DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
+DO_FMLA_IDX(gvec_fmla_idx_d, float64, )
+
+#undef DO_FMLA_IDX