diff options
author | Robert Hoo <robert.hu@linux.intel.com> | 2020-03-25 14:50:21 +0800 |
---|---|---|
committer | Paolo Bonzini <pbonzini@redhat.com> | 2020-04-01 14:24:03 -0400 |
commit | 8f13a39dc02ea8a3e923102a8444185630c635ea (patch) | |
tree | 21825595ebc0e81e8e102033b0c12031bfb84983 | |
parent | b87c99d0731fa30f1f455b211cbcf385b0fe427c (diff) | |
download | qemu-8f13a39dc02ea8a3e923102a8444185630c635ea.zip |
util/bufferiszero: improve avx2 accelerator
By increasing avx2 length_to_accel to 128, we can simplify its logic and reduce a
branch.
The authorship of this patch actually belongs to Richard Henderson
<richard.henderson@linaro.org>, I just fixed a boundary case on his
original patch.
Suggested-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Robert Hoo <robert.hu@linux.intel.com>
Message-Id: <1585119021-46593-2-git-send-email-robert.hu@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-rw-r--r-- | util/bufferiszero.c | 26 |
1 files changed, 9 insertions, 17 deletions
diff --git a/util/bufferiszero.c b/util/bufferiszero.c index b8012532e4..695bb4ce28 100644 --- a/util/bufferiszero.c +++ b/util/bufferiszero.c @@ -158,27 +158,19 @@ buffer_zero_avx2(const void *buf, size_t len) __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32); __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32); - if (likely(p <= e)) { - /* Loop over 32-byte aligned blocks of 128. */ - do { - __builtin_prefetch(p); - if (unlikely(!_mm256_testz_si256(t, t))) { - return false; - } - t = p[-4] | p[-3] | p[-2] | p[-1]; - p += 4; - } while (p <= e); - } else { - t |= _mm256_loadu_si256(buf + 32); - if (len <= 128) { - goto last2; + /* Loop over 32-byte aligned blocks of 128. */ + while (p <= e) { + __builtin_prefetch(p); + if (unlikely(!_mm256_testz_si256(t, t))) { + return false; } - } + t = p[-4] | p[-3] | p[-2] | p[-1]; + p += 4; + } ; /* Finish the last block of 128 unaligned. */ t |= _mm256_loadu_si256(buf + len - 4 * 32); t |= _mm256_loadu_si256(buf + len - 3 * 32); - last2: t |= _mm256_loadu_si256(buf + len - 2 * 32); t |= _mm256_loadu_si256(buf + len - 1 * 32); @@ -263,7 +255,7 @@ static void init_accel(unsigned cache) } if (cache & CACHE_AVX2) { fn = buffer_zero_avx2; - length_to_accel = 64; + length_to_accel = 128; } #endif #ifdef CONFIG_AVX512F_OPT |