Turns out the simde implementation of movemask is not slow enough to cancel out the speedup from using 256-bit vectors

This commit is contained in:
Kovid Goyal
2024-01-27 12:09:16 +05:30
parent 0bd47d8457
commit 8aa1b112b8

View File

@@ -83,12 +83,6 @@ typedef int32_t find_mask_t;
// Report whether the vector is entirely zero: simde_mm_testz_si128(a, a)
// yields 1 exactly when (a & a) has no bits set, and 0 otherwise.
static inline int FUNC(is_zero)(const integer_t a) {
    const int all_bits_clear = simde_mm_testz_si128(a, a);
    return all_bits_clear;
}
// Reduce a vector to a scalar search mask by handing it to movemask_epi8.
// NOTE(review): movemask_epi8 is defined outside this view; presumably it
// packs the top bit of each byte lane into one bit of the result - confirm.
static inline find_mask_t mask_for_find(const integer_t a) {
    const find_mask_t byte_mask = movemask_epi8(a);
    return byte_mask;
}
// Position of the lowest set bit in the mask, obtained from the compiler's
// count-trailing-zeros builtin. The mask must be non-zero: the result of
// __builtin_ctz(0) is undefined.
static inline unsigned bytes_to_first_match(const find_mask_t m) {
    const int trailing_zeros = __builtin_ctz(m);
    return trailing_zeros;
}
#else
// Broadcast x into every byte of a 256-bit vector; x is explicitly cast to
// char before being handed to the SIMDe intrinsic.
#define set1_epi8(x) simde_mm256_set1_epi8((char)(x))
@@ -116,12 +110,6 @@ bytes_to_first_match(const find_mask_t m) { return __builtin_ctz(m); }
// Report whether the vector is entirely zero: simde_mm256_testz_si256(a, a)
// yields 1 exactly when (a & a) has no bits set, and 0 otherwise.
static inline int FUNC(is_zero)(const integer_t a) {
    const int all_bits_clear = simde_mm256_testz_si256(a, a);
    return all_bits_clear;
}
// Reduce a vector to a scalar search mask by handing it to movemask_epi8.
// NOTE(review): movemask_epi8 is defined outside this view; presumably it
// packs the top bit of each byte lane into one bit of the result - confirm.
static inline find_mask_t mask_for_find(const integer_t a) {
    const find_mask_t byte_mask = movemask_epi8(a);
    return byte_mask;
}
// Position of the lowest set bit in the mask, obtained from the compiler's
// count-trailing-zeros builtin. The mask must be non-zero: the result of
// __builtin_ctz(0) is undefined.
static inline unsigned bytes_to_first_match(const find_mask_t m) {
    const int trailing_zeros = __builtin_ctz(m);
    return trailing_zeros;
}
static inline integer_t
shift_right_by_one_byte(const integer_t A) {
return simde_mm256_alignr_epi8(A, simde_mm256_permute2x128_si256(A, A, _MM_SHUFFLE(0, 0, 2, 0)), 16 - 1);
@@ -214,6 +202,13 @@ static inline integer_t shuffle_impl256(const integer_t value, const integer_t s
#define debug(...)
#endif
// Reduce a vector to a scalar search mask by handing it to movemask_epi8.
// NOTE(review): movemask_epi8 is defined outside this view; presumably it
// packs the top bit of each byte lane into one bit of the result - confirm.
static inline find_mask_t mask_for_find(const integer_t a) {
    const find_mask_t byte_mask = movemask_epi8(a);
    return byte_mask;
}
// Position of the lowest set bit in the mask, obtained from the compiler's
// count-trailing-zeros builtin. The mask must be non-zero: the result of
// __builtin_ctz(0) is undefined.
static inline unsigned bytes_to_first_match(const find_mask_t m) {
    const int trailing_zeros = __builtin_ctz(m);
    return trailing_zeros;
}
// }}}
static inline integer_t