1 سال پیش · 9cc3e7d32d
--- a/AK/SIMDExtras.h
+++ b/AK/SIMDExtras.h
@@ -220,13 +220,14 @@ ALWAYS_INLINE static T shuffle_or_0_impl(T a, Control control, IndexSequence<Idx
 
				     if constexpr (__has_builtin(__builtin_shuffle)) {
			
 
				         // GCC does a very bad job at optimizing the masking, while not recognizing the shuffle idiom
			
 
				         // So we jinx its __builtin_shuffle to work with out of bounds indices
			
 
				+        // TODO: verify that this masking logic is correct (for machines with __builtin_shuffle)
			
 
				         auto mask = (control >= 0) | (control < N);
			
 
				         return __builtin_shuffle(a, control & mask) & ~mask;
			
 
				     }
			
 
				     // 1. Set all out of bounds values to ~0
			
 
				     // Note: This is done so that  the optimization mentioned down below works
			
 
				     // Note: Vector compares result in bitmasks, aka all 1s or all 0s per element
			
 
				-    control |= ~((control > 0) | (control < N));
			
 
				+    control |= ~((control >= 0) & (control < N));
			
 
				     // 2. Selectively set out of bounds values to 0
			
 
				     // Note: Clang successfully optimizes this to a few instructions on x86-ssse3, GCC does not
			
 
				     //       Vector Optimizations/Instruction-Selection on ArmV8 seem to not be as powerful as of Clang18
			
--- a/Userland/Libraries/LibWasm/AbstractMachine/BytecodeInterpreter.cpp
+++ b/Userland/Libraries/LibWasm/AbstractMachine/BytecodeInterpreter.cpp
@@ -1289,12 +1289,17 @@ void BytecodeInterpreter::interpret(Configuration& configuration, InstructionPoi
 
				     case Instructions::f64x2_splat.value():
			
 
				         return pop_and_push_m_splat<64, NativeFloatingType>(configuration, instruction);
			
 
				     case Instructions::i8x16_shuffle.value(): {
			
 
				-        auto indices = pop_vector<u8, MakeSigned>(configuration);
			
 
				-        TRAP_IF_NOT(indices.has_value());
			
 
				-        auto vector = peek_vector<u8, MakeSigned>(configuration);
			
 
				-        TRAP_IF_NOT(vector.has_value());
			
 
				-        auto result = shuffle_vector(vector.value(), indices.value());
			
 
				-        configuration.stack().peek() = Value(result);
			
 
				+        auto& arg = instruction.arguments().get<Instruction::ShuffleArgument>();
			
 
				+        auto b = *pop_vector<u8, MakeUnsigned>(configuration);
			
 
				+        auto a = *pop_vector<u8, MakeUnsigned>(configuration);
			
 
				+        using VectorType = Native128ByteVectorOf<u8, MakeUnsigned>;
			
 
				+        VectorType result;
			
 
				+        for (size_t i = 0; i < 16; ++i)
			
 
				+            if (arg.lanes[i] < 16)
			
 
				+                result[i] = a[arg.lanes[i]];
			
 
				+            else
			
 
				+                result[i] = b[arg.lanes[i] - 16];
			
 
				+        configuration.stack().push(Value(bit_cast<u128>(result)));
			
 
				         return;
			
 
				     }
			
 
				     case Instructions::v128_store.value():
			
--- a/Userland/Libraries/LibWasm/AbstractMachine/Operators.h
+++ b/Userland/Libraries/LibWasm/AbstractMachine/Operators.h
@@ -239,8 +239,8 @@ struct VectorSwizzle {
 
				     auto operator()(u128 c1, u128 c2) const
			
 
				     {
			
 
				         // https://webassembly.github.io/spec/core/bikeshed/#-mathsfi8x16hrefsyntax-instr-vecmathsfswizzle%E2%91%A0
			
 
				-        auto i = bit_cast<Native128ByteVectorOf<i8, MakeSigned>>(c2);
			
 
				-        auto j = bit_cast<Native128ByteVectorOf<i8, MakeSigned>>(c1);
			
 
				+        auto i = bit_cast<Native128ByteVectorOf<i8, MakeSigned>>(c1);
			
 
				+        auto j = bit_cast<Native128ByteVectorOf<i8, MakeSigned>>(c2);
			
 
				         auto result = shuffle_or_0(i, j);
			
 
				         return bit_cast<u128>(result);
			
 
				     }