AK/SIMDExtras: Fix masking logic in shuffle_or_0

This commit is contained in:
Diego Frias 2024-07-26 15:19:36 -07:00 committed by Ali Mohammad Pur
parent 48f1861ce9
commit a168bec7ef
Notes: github-actions[bot] 2024-07-27 13:03:24 +00:00

View file

@ -218,11 +218,10 @@ ALWAYS_INLINE static T shuffle_or_0_impl(T a, Control control, IndexSequence<Idx
using E = ElementOf<T>;
if constexpr (__has_builtin(__builtin_shuffle)) {
// GCC does a very bad job at optimizing the masking, while not recognizing the shuffle idiom
// So we jinx its __builtin_shuffle to work with out of bounds indices
// TODO: verify that this masking logic is correct (for machines with __builtin_shuffle)
auto mask = (control >= 0) | (control < N);
return __builtin_shuffle(a, control & mask) & ~mask;
auto vector = __builtin_shuffle(a, control);
for (size_t i = 0; i < N; ++i)
vector[i] = control[i] < 0 || control[i] >= N ? 0 : vector[i];
return vector;
}
// 1. Set all out of bounds values to ~0
// Note: This is done so that the optimization mentioned down below works