7 files changed, 92 insertions, 9 deletions
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 3ed03391a8..87235dae63 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -3441,7 +3441,7 @@ typedef ARMCPU ArchCPU;
  * | TBFLAG_AM32 |          +-----+----------+
  * |             |                |TBFLAG_M32|
  * +-------------+----------------+----------+
- *  31         23                5 4        0
+ *  31         23                6 5        0
  *
  * Unless otherwise noted, these bits are cached in env->hflags.
  */
@@ -3499,6 +3499,8 @@ FIELD(TBFLAG_M32, LSPACT, 2, 1)                 /* Not cached. */
 FIELD(TBFLAG_M32, NEW_FP_CTXT_NEEDED, 3, 1)     /* Not cached. */
 /* Set if FPCCR.S does not match current security state */
 FIELD(TBFLAG_M32, FPCCR_S_WRONG, 4, 1)          /* Not cached. */
+/* Set if MVE insns are definitely not predicated by VPR or LTPSIZE */
+FIELD(TBFLAG_M32, MVE_NO_PRED, 5, 1)            /* Not cached. */
 
 /*
  * Bit usage when in AArch64 state
diff --git a/target/arm/helper.c b/target/arm/helper.c
index 21ee94ec2e..6274221447 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -13637,6 +13637,35 @@ static inline void assert_hflags_rebuild_correctly(CPUARMState *env)
 #endif
 }
 
+static bool mve_no_pred(CPUARMState *env)
+{
+    /*
+     * Return true if there is definitely no predication of MVE
+     * instructions by VPR or LTPSIZE. (Returning false even if there
+     * isn't any predication is OK; generated code will just be
+     * a little worse.)
+     * If the CPU does not implement MVE then this TB flag is always 0.
+     *
+     * NOTE: if you change this logic, the "recalculate s->mve_no_pred"
+     * logic in gen_update_fp_context() needs to be updated to match.
+     *
+     * We do not include the effect of the ECI bits here -- they are
+     * tracked in other TB flags. This simplifies the logic for
+     * "when did we emit code that changes the MVE_NO_PRED TB flag
+     * and thus need to end the TB?".
+     */
+    if (cpu_isar_feature(aa32_mve, env_archcpu(env))) {
+        return false;
+    }
+    if (env->v7m.vpr) {
+        return false;
+    }
+    if (env->v7m.ltpsize < 4) {
+        return false;
+    }
+    return true;
+}
+
 void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
                           target_ulong *cs_base, uint32_t *pflags)
 {
@@ -13676,6 +13705,10 @@ void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
             if (env->v7m.fpccr[is_secure] & R_V7M_FPCCR_LSPACT_MASK) {
                 DP_TBFLAG_M32(flags, LSPACT, 1);
             }
+
+            if (mve_no_pred(env)) {
+                DP_TBFLAG_M32(flags, MVE_NO_PRED, 1);
+            }
         } else {
             /*
              * Note that XSCALE_CPAR shares bits with VECSTRIDE.
diff --git a/target/arm/translate-m-nocp.c b/target/arm/translate-m-nocp.c
index 5eab04832c..d9e144e8eb 100644
--- a/target/arm/translate-m-nocp.c
+++ b/target/arm/translate-m-nocp.c
@@ -95,7 +95,10 @@ static bool trans_VLLDM_VLSTM(DisasContext *s, arg_VLLDM_VLSTM *a)
 
     clear_eci_state(s);
 
-    /* End the TB, because we have updated FP control bits */
+    /*
+     * End the TB, because we have updated FP control bits,
+     * and possibly VPR or LTPSIZE.
+     */
     s->base.is_jmp = DISAS_UPDATE_EXIT;
     return true;
 }
@@ -397,6 +400,7 @@ static bool gen_M_fp_sysreg_write(DisasContext *s, int regno,
         store_cpu_field(control, v7m.control[M_REG_S]);
         tcg_gen_andi_i32(tmp, tmp, ~FPCR_NZCV_MASK);
         gen_helper_vfp_set_fpscr(cpu_env, tmp);
+        s->base.is_jmp = DISAS_UPDATE_NOCHAIN;
         tcg_temp_free_i32(tmp);
         tcg_temp_free_i32(sfpa);
         break;
@@ -409,6 +413,7 @@ static bool gen_M_fp_sysreg_write(DisasContext *s, int regno,
         }
         tmp = loadfn(s, opaque, true);
         store_cpu_field(tmp, v7m.vpr);
+        s->base.is_jmp = DISAS_UPDATE_NOCHAIN;
         break;
     case ARM_VFP_P0:
     {
@@ -418,6 +423,7 @@ static bool gen_M_fp_sysreg_write(DisasContext *s, int regno,
         tcg_gen_deposit_i32(vpr, vpr, tmp,
                             R_V7M_VPR_P0_SHIFT, R_V7M_VPR_P0_LENGTH);
         store_cpu_field(vpr, v7m.vpr);
+        s->base.is_jmp = DISAS_UPDATE_NOCHAIN;
         tcg_temp_free_i32(tmp);
         break;
     }
diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c
index 2ed91577ec..0eca96e29c 100644
--- a/target/arm/translate-mve.c
+++ b/target/arm/translate-mve.c
@@ -810,7 +810,12 @@ DO_LOGIC(VORR, gen_helper_mve_vorr)
 DO_LOGIC(VORN, gen_helper_mve_vorn)
 DO_LOGIC(VEOR, gen_helper_mve_veor)
 
-DO_LOGIC(VPSEL, gen_helper_mve_vpsel)
+static bool trans_VPSEL(DisasContext *s, arg_2op *a)
+{
+    /* This insn updates predication bits */
+    s->base.is_jmp = DISAS_UPDATE_NOCHAIN;
+    return do_2op(s, a, gen_helper_mve_vpsel);
+}
 
 #define DO_2OP(INSN, FN) \
     static bool trans_##INSN(DisasContext *s, arg_2op *a)       \
@@ -1366,6 +1371,8 @@ static bool trans_VPNOT(DisasContext *s, arg_VPNOT *a)
     }
 
     gen_helper_mve_vpnot(cpu_env);
+    /* This insn updates predication bits */
+    s->base.is_jmp = DISAS_UPDATE_NOCHAIN;
     mve_update_eci(s);
     return true;
 }
@@ -1852,6 +1859,8 @@ static bool do_vcmp(DisasContext *s, arg_vcmp *a, MVEGenCmpFn *fn)
         /* VPT */
         gen_vpst(s, a->mask);
     }
+    /* This insn updates predication bits */
+    s->base.is_jmp = DISAS_UPDATE_NOCHAIN;
     mve_update_eci(s);
     return true;
 }
@@ -1883,6 +1892,8 @@ static bool do_vcmp_scalar(DisasContext *s, arg_vcmp_scalar *a,
         /* VPT */
         gen_vpst(s, a->mask);
     }
+    /* This insn updates predication bits */
+    s->base.is_jmp = DISAS_UPDATE_NOCHAIN;
     mve_update_eci(s);
     return true;
 }
diff --git a/target/arm/translate-vfp.c b/target/arm/translate-vfp.c
index e2eb797c82..59bcaec5be 100644
--- a/target/arm/translate-vfp.c
+++ b/target/arm/translate-vfp.c
@@ -109,7 +109,7 @@ static inline long vfp_f16_offset(unsigned reg, bool top)
  * Generate code for M-profile lazy FP state preservation if needed;
  * this corresponds to the pseudocode PreserveFPState() function.
  */
-static void gen_preserve_fp_state(DisasContext *s)
+static void gen_preserve_fp_state(DisasContext *s, bool skip_context_update)
 {
     if (s->v7m_lspact) {
         /*
@@ -128,6 +128,20 @@ static void gen_preserve_fp_state(DisasContext *s)
          * any further FP insns in this TB.
          */
         s->v7m_lspact = false;
+        /*
+         * The helper might have zeroed VPR, so we do not know the
+         * correct value for the MVE_NO_PRED TB flag any more.
+         * If we're about to create a new fp context then that
+         * will precisely determine the MVE_NO_PRED value (see
+         * gen_update_fp_context()). Otherwise, we must:
+         *  - set s->mve_no_pred to false, so this instruction
+         *    is generated to use helper functions
+         *  - end the TB now, without chaining to the next TB
+         */
+        if (skip_context_update || !s->v7m_new_fp_ctxt_needed) {
+            s->mve_no_pred = false;
+            s->base.is_jmp = DISAS_UPDATE_NOCHAIN;
+        }
     }
 }
 
@@ -169,12 +183,19 @@ static void gen_update_fp_context(DisasContext *s)
             TCGv_i32 z32 = tcg_const_i32(0);
             store_cpu_field(z32, v7m.vpr);
         }
-
         /*
-         * We don't need to arrange to end the TB, because the only
-         * parts of FPSCR which we cache in the TB flags are the VECLEN
-         * and VECSTRIDE, and those don't exist for M-profile.
+         * We just updated the FPSCR and VPR. Some of this state is cached
+         * in the MVE_NO_PRED TB flag. We want to avoid having to end the
+         * TB here, which means we need the new value of the MVE_NO_PRED
+         * flag to be exactly known here and the same for all executions.
+         * Luckily FPDSCR.LTPSIZE is always constant 4 and the VPR is
+         * always set to 0, so the new MVE_NO_PRED flag is always 1
+         * if and only if we have MVE.
+         *
+         * (The other FPSCR state cached in TB flags is VECLEN and VECSTRIDE,
+         * but those do not exist for M-profile, so are not relevant here.)
          */
+        s->mve_no_pred = dc_isar_feature(aa32_mve, s);
 
         if (s->v8m_secure) {
             bits |= R_V7M_CONTROL_SFPA_MASK;
@@ -238,7 +259,7 @@ bool vfp_access_check_m(DisasContext *s, bool skip_context_update)
     /* Handle M-profile lazy FP state mechanics */
 
     /* Trigger lazy-state preservation if necessary */
-    gen_preserve_fp_state(s);
+    gen_preserve_fp_state(s, skip_context_update);
 
     if (!skip_context_update) {
         /* Update ownership of FP context and create new FP context if needed */
diff --git a/target/arm/translate.c b/target/arm/translate.c
index cf31e0237d..f7086c66a5 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -8496,6 +8496,7 @@ static bool trans_DLS(DisasContext *s, arg_DLS *a)
         /* DLSTP: set FPSCR.LTPSIZE */
         tmp = tcg_const_i32(a->size);
         store_cpu_field(tmp, v7m.ltpsize);
+        s->base.is_jmp = DISAS_UPDATE_NOCHAIN;
     }
     return true;
 }
@@ -8561,6 +8562,10 @@ static bool trans_WLS(DisasContext *s, arg_WLS *a)
         assert(ok);
         tmp = tcg_const_i32(a->size);
         store_cpu_field(tmp, v7m.ltpsize);
+        /*
+         * LTPSIZE updated, but MVE_NO_PRED will always be the same thing (0)
+         * when we take this upcoming exit from this TB, so gen_jmp_tb() is OK.
+         */
     }
     gen_jmp_tb(s, s->base.pc_next, 1);
 
@@ -8743,6 +8748,8 @@ static bool trans_VCTP(DisasContext *s, arg_VCTP *a)
     gen_helper_mve_vctp(cpu_env, masklen);
     tcg_temp_free_i32(masklen);
     tcg_temp_free_i32(rn_shifted);
+    /* This insn updates predication bits */
+    s->base.is_jmp = DISAS_UPDATE_NOCHAIN;
     mve_update_eci(s);
     return true;
 }
@@ -9413,6 +9420,7 @@ static void arm_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs)
         dc->v7m_new_fp_ctxt_needed =
             EX_TBFLAG_M32(tb_flags, NEW_FP_CTXT_NEEDED);
         dc->v7m_lspact = EX_TBFLAG_M32(tb_flags, LSPACT);
+        dc->mve_no_pred = EX_TBFLAG_M32(tb_flags, MVE_NO_PRED);
     } else {
         dc->debug_target_el = EX_TBFLAG_ANY(tb_flags, DEBUG_TARGET_EL);
         dc->sctlr_b = EX_TBFLAG_A32(tb_flags, SCTLR__B);
diff --git a/target/arm/translate.h b/target/arm/translate.h
index 605d1f2e33..3a0db801d3 100644
--- a/target/arm/translate.h
+++ b/target/arm/translate.h
@@ -100,6 +100,8 @@ typedef struct DisasContext {
     bool align_mem;
     /* True if PSTATE.IL is set */
     bool pstate_il;
+    /* True if MVE insns are definitely not predicated by VPR or LTPSIZE */
+    bool mve_no_pred;
     /*
      * >= 0, a copy of PSTATE.BTYPE, which will be 0 without v8.5-BTI.
      *  < 0, set by the current instruction.