Common: Fix a bunch of errors in ARM vector wrapper

ARM64 passes now at least.
Stenzek
2025-12-14 19:20:19 +10:00
parent 3fc563e5a3
commit 351e787681
2 changed files with 59 additions and 40 deletions


@@ -194,24 +194,12 @@ TEST(GSVector2iTest, UnpackOperations)
EXPECT_EQ(upl8_result.U8[5], 0);
EXPECT_EQ(upl8_result.U8[6], 0);
EXPECT_EQ(upl8_result.U8[7], 0);
-EXPECT_EQ(upl8_result.U8[8], 0x56);
-EXPECT_EQ(upl8_result.U8[9], 0);
-EXPECT_EQ(upl8_result.U8[10], 0);
-EXPECT_EQ(upl8_result.U8[11], 0);
-EXPECT_EQ(upl8_result.U8[12], 0x78);
-EXPECT_EQ(upl8_result.U8[13], 0);
-EXPECT_EQ(upl8_result.U8[14], 0);
-EXPECT_EQ(upl8_result.U8[15], 0);
auto upl16_result = v1.upl16();
EXPECT_EQ(upl16_result.U16[0], 0x12);
EXPECT_EQ(upl16_result.U16[1], 0);
EXPECT_EQ(upl16_result.U16[2], 0x34);
EXPECT_EQ(upl16_result.U16[3], 0);
-EXPECT_EQ(upl16_result.U16[4], 0x56);
-EXPECT_EQ(upl16_result.U16[5], 0);
-EXPECT_EQ(upl16_result.U16[6], 0x78);
-EXPECT_EQ(upl16_result.U16[7], 0);
}
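The assertions removed above were indexing lanes that do not exist: GSVector2i wraps a 64-bit NEON register, so upl8() produces only eight result bytes (U8[0..7]) and upl16() only four halfwords (U16[0..3]). A minimal sketch of the interleave-with-zero behaviour on a 64-bit vector, assuming an AArch64 toolchain (upl8_sketch and its input are illustrative, not the wrapper's code):

#include <arm_neon.h>
#include <cstdio>

// Interleave the low four bytes of v with zeros: {b0, 0, b1, 0, b2, 0, b3, 0}.
static int8x8_t upl8_sketch(int8x8_t v)
{
  return vzip1_s8(v, vdup_n_s8(0)); // vzip1 is A64-only
}

int main()
{
  uint8_t out[8];
  vst1_u8(out, vreinterpret_u8_s8(upl8_sketch(vcreate_s8(0x12345678ULL))));
  for (int i = 0; i < 8; i++)
    std::printf("U8[%d] = 0x%02x\n", i, out[i]); // 0x78, 0, 0x56, 0, 0x34, 0, 0x12, 0
  return 0;
}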
TEST(GSVector2iTest, TypeConversions)
@@ -806,20 +794,28 @@ TEST(GSVector4iTest, Shift64BitOperations)
#ifdef GSVECTOR_HAS_SRLV
TEST(GSVector4iTest, VariableShifts)
{
-GSVector4i v1(0x1000, 0x2000, 0x4000, 0x8000);
-GSVector4i shift_amounts(1, 2, 3, 4);
+GSVector4i v1(0x1000, 0x2000, 0x4000, 0x8000, 0x1000, 0x2000, 0x4000, 0x8000);
+GSVector4i shift_amounts(1, 2, 3, 4, 1, 2, 3, 4);
auto sllv16_result = v1.sllv16(shift_amounts);
EXPECT_EQ(sllv16_result.U16[0], 0x2000); // 0x1000 << 1
EXPECT_EQ(sllv16_result.U16[1], 0x8000); // 0x2000 << 2
EXPECT_EQ(sllv16_result.U16[2], 0x0000); // 0x4000 << 3 (overflow)
EXPECT_EQ(sllv16_result.U16[3], 0x0000); // 0x8000 << 4 (overflow)
+EXPECT_EQ(sllv16_result.U16[4], 0x2000); // 0x1000 << 1
+EXPECT_EQ(sllv16_result.U16[5], 0x8000); // 0x2000 << 2
+EXPECT_EQ(sllv16_result.U16[6], 0x0000); // 0x4000 << 3 (overflow)
+EXPECT_EQ(sllv16_result.U16[7], 0x0000); // 0x8000 << 4 (overflow)
auto srlv16_result = v1.srlv16(shift_amounts);
EXPECT_EQ(srlv16_result.U16[0], 0x0800); // 0x1000 >> 1
EXPECT_EQ(srlv16_result.U16[1], 0x0800); // 0x2000 >> 2
EXPECT_EQ(srlv16_result.U16[2], 0x0800); // 0x4000 >> 3
EXPECT_EQ(srlv16_result.U16[3], 0x0800); // 0x8000 >> 4
+EXPECT_EQ(srlv16_result.U16[4], 0x0800); // 0x1000 >> 1
+EXPECT_EQ(srlv16_result.U16[5], 0x0800); // 0x2000 >> 2
+EXPECT_EQ(srlv16_result.U16[6], 0x0800); // 0x4000 >> 3
+EXPECT_EQ(srlv16_result.U16[7], 0x0800); // 0x8000 >> 4
}
#endif
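The "(overflow)" comments are plain 16-bit truncation: each lane keeps only the low 16 bits of the shifted value, so 0x4000 << 3 = 0x20000 wraps to 0x0000, while 0x2000 << 2 = 0x8000 still fits. Modeled per lane in scalar C++ (a sketch, not test code):

#include <cstdint>

// What one sllv16 lane computes:
static inline uint16_t sllv16_lane(uint16_t v, int n)
{
  return static_cast<uint16_t>(v << n); // bits above bit 15 are discarded
}
// sllv16_lane(0x4000, 3) == 0x0000, sllv16_lane(0x2000, 2) == 0x8000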
@@ -1512,4 +1508,4 @@ TEST(GSVectorTest, Runion_IsCommutative)
GSVector4 result2 = rect2.runion(rect1);
EXPECT_TRUE(result1.eq(result2));
-}
+}


@@ -401,23 +401,23 @@ public:
template<int i>
ALWAYS_INLINE GSVector2i sll() const
{
-return GSVector2i(vreinterpret_s32_s8(vext_s8(vdup_n_s8(0), vreinterpret_s8_s32(v2s), 16 - i)));
+return GSVector2i(vreinterpret_s32_s8(vext_s8(vdup_n_s8(0), vreinterpret_s8_s32(v2s), 8 - i)));
}
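The sll() change fixes an index-range bug: on 64-bit vectors, vext_s8 concatenates two 8-byte inputs and extracts 8 consecutive bytes starting at the given offset, so only offsets 0..8 are meaningful; 16 - i belongs to the 128-bit vextq_s8 form. With zeros as the first operand, an offset of 8 - i leaves i zero bytes at the bottom of the result, i.e. a whole-register left shift by i bytes. Restated standalone (a sketch, not the wrapper's code):

#include <arm_neon.h>

template<int i>
static int8x8_t byte_shift_left(int8x8_t v)
{
  static_assert(i >= 1 && i <= 8, "vext offset must stay in 0..7");
  // Result bytes: i zeros, then the low 8 - i bytes of v; on the little-endian
  // lane layout this is a 64-bit left shift by i * 8 bits.
  return vext_s8(vdup_n_s8(0), v, 8 - i);
}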
template<int i>
ALWAYS_INLINE GSVector2i sll16() const
{
-return GSVector2i(vreinterpret_s32_s16(vshl_n_s16(vreinterpret_s16_s32(v2s), i)));
+return GSVector2i(vreinterpret_s32_u16(vshl_n_u16(vreinterpret_u16_s32(v2s), i)));
}
ALWAYS_INLINE GSVector2i sll16(s32 i) const
{
-return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(i))));
+return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vdup_n_s16(i))));
}
ALWAYS_INLINE GSVector2i sllv16(const GSVector2i& v) const
{
-return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
+return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
}
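The switch from the s16 to the u16 shift in sll16/sllv16 only matters for unusual counts: a left shift produces the same bits either way, but vshl shifts right when a lane's count is negative, arithmetically for signed element types and logically for unsigned ones. Since these wrappers model logical shifts, the unsigned flavour is the safe one. A small demonstration of the divergence (illustrative values, NEON assumed):

#include <arm_neon.h>
#include <cstdio>

int main()
{
  const int16x4_t counts = vdup_n_s16(-4); // negative count: vshl shifts right
  const int16x4_t s = vshl_s16(vreinterpret_s16_u16(vdup_n_u16(0x8000)), counts);
  const uint16x4_t u = vshl_u16(vdup_n_u16(0x8000), counts);
  std::printf("signed: 0x%04x unsigned: 0x%04x\n",
              (unsigned)(uint16_t)vget_lane_s16(s, 0), (unsigned)vget_lane_u16(u, 0));
  return 0; // prints "signed: 0xf800 unsigned: 0x0800"
}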
template<int i>
@@ -459,9 +459,15 @@ public:
return GSVector2i(vshl_n_s32(v2s, i));
}
-ALWAYS_INLINE GSVector2i sll32(s32 i) const { return GSVector2i(vshl_s32(v2s, vdup_n_s32(i))); }
+ALWAYS_INLINE GSVector2i sll32(s32 i) const
+{
+return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), vdup_n_s32(i))));
+}
-ALWAYS_INLINE GSVector2i sllv32(const GSVector2i& v) const { return GSVector2i(vshl_s32(v2s, v.v2s)); }
+ALWAYS_INLINE GSVector2i sllv32(const GSVector2i& v) const
+{
+return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), v.v2s)));
+}
template<int i>
ALWAYS_INLINE GSVector2i srl32() const
@@ -553,6 +559,16 @@ public:
return GSVector2i(vreinterpret_s32_u16(vqsub_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
}
+ALWAYS_INLINE GSVector2i avg8(const GSVector2i& v) const
+{
+return GSVector2i(vreinterpret_s32_u8(vrhadd_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
+}
+ALWAYS_INLINE GSVector2i avg16(const GSVector2i& v) const
+{
+return GSVector2i(vreinterpret_s32_u16(vrhadd_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
+}
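The new avg8/avg16 use NEON's rounding halving add, which matches the averaging semantics of the SSE counterparts this wrapper mirrors: each lane becomes (a + b + 1) >> 1, computed without intermediate overflow. A scalar cross-check (sketch):

#include <arm_neon.h>
#include <cassert>

int main()
{
  const uint8x8_t r = vrhadd_u8(vdup_n_u8(250), vdup_n_u8(255));
  assert(vget_lane_u8(r, 0) == ((250 + 255 + 1) >> 1)); // 253, no wrap at 8 bits
  return 0;
}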
ALWAYS_INLINE GSVector2i mul16l(const GSVector2i& v) const
{
return GSVector2i(vreinterpret_s32_s16(vmul_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
@@ -707,7 +723,7 @@ public:
return GSVector2i(vset_lane_s32(val, vdup_n_s32(0), 0));
}
-ALWAYS_INLINE static GSVector2i zext32(s32 v) { return GSVector2i(vset_lane_s32(v, vdup_n_s32(0), 0)); }
+ALWAYS_INLINE static GSVector2i set32(s32 v) { return GSVector2i(vset_lane_s32(v, vdup_n_s32(0), 0)); }
template<bool aligned>
ALWAYS_INLINE static GSVector2i load(const void* p)
@@ -886,7 +902,7 @@ public:
template<int mask>
ALWAYS_INLINE GSVector2 blend32(const GSVector2& a) const
{
-return GSVector2(__builtin_shufflevector(v2s, a.v2s, (mask & 1) ? 4 : 0, (mask & 2) ? 5 : 1));
+return GSVector2(__builtin_shufflevector(v2s, a.v2s, (mask & 1) ? 2 : 0, (mask & 2) ? 3 : 1));
}
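The blend32 fix concerns __builtin_shufflevector's index space: indices run across both arguments in order, so with two 2-lane vectors the valid range is 0..3, lanes 0..1 coming from this vector and 2..3 from a. The old 4/5 indices look carried over from the 128-bit, 4-lane variant, where the second vector occupies indices 4..7. The corrected selection in isolation (clang/GCC vector-extension sketch, hard-coded for mask == 0b01):

#include <arm_neon.h>

// Lane n comes from a (index 2 + n) when mask bit n is set, else from v (index n).
static float32x2_t blend32_mask01(float32x2_t v, float32x2_t a)
{
  return __builtin_shufflevector(v, a, 2, 1);
}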
ALWAYS_INLINE GSVector2 blend32(const GSVector2& a, const GSVector2& mask) const
@@ -1324,15 +1340,16 @@ public:
ALWAYS_INLINE GSVector4i madd_s16(const GSVector4i& v) const
{
#ifdef CPU_ARCH_ARM64
-const int32x4_t acc =
-vmlal_s16(vdupq_n_s32(0), vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
-return GSVector4i(vmlal_high_s16(acc, vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)));
+const int32x4_t low =
+vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
+const int32x4_t high = vmull_high_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s));
+return GSVector4i(vpaddq_s32(low, high));
#else
// borrowed from sse2neon
const int32x4_t low =
-vmlal_s16(vdupq_n_s32(0), vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
+vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
const int32x4_t high =
-vmlal_s16(vdupq_n_s32(0), vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
+vmull_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
return GSVector4i(vcombine_s32(vpadd_s32(vget_low_s32(low), vget_high_s32(low)),
vpadd_s32(vget_low_s32(high), vget_high_s32(high))));
#endif
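For context, madd_s16 mirrors SSE's _mm_madd_epi16: multiply matching s16 lanes into 32-bit products, then sum adjacent pairs. The old ARM64 path accumulated the upper half onto the lower with vmlal/vmlal_high, pairing lane i with lane i + 4 instead of its neighbour; vmull/vmull_high followed by vpaddq_s32 performs the required adjacent-pair reduction. What one result lane holds, as a scalar model (sketch):

#include <cstdint>

// Result lane i of madd_s16 for 8-lane s16 inputs a and b:
static int32_t madd_s16_lane(const int16_t* a, const int16_t* b, int i)
{
  return static_cast<int32_t>(a[2 * i]) * b[2 * i] +
         static_cast<int32_t>(a[2 * i + 1]) * b[2 * i + 1];
}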
@@ -1756,17 +1773,17 @@ public:
template<int i>
ALWAYS_INLINE GSVector4i sll16() const
{
-return GSVector4i(vreinterpretq_s32_s16(vshlq_n_s16(vreinterpretq_s16_s32(v4s), i)));
+return GSVector4i(vreinterpretq_s32_u16(vshlq_n_u16(vreinterpretq_u16_s32(v4s), i)));
}
ALWAYS_INLINE GSVector4i sll16(s32 i) const
{
-return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(i))));
+return GSVector4i(vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vdupq_n_s16(i))));
}
ALWAYS_INLINE GSVector4i sllv16(const GSVector4i& v) const
{
-return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
+return GSVector4i(vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
}
template<int i>
@@ -1783,7 +1800,7 @@ public:
ALWAYS_INLINE GSVector4i srlv16(const GSVector4i& v) const
{
return GSVector4i(
-vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s)))));
+vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s)))));
}
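NEON has no variable right-shift instruction, so srlv16 becomes vshl with per-lane negated counts, and once a count is negative the element type selects the shift flavour: signed is arithmetic, unsigned is the logical shift srlv needs, hence the u16 reinterpret in the fix. The idiom in isolation (sketch):

#include <arm_neon.h>

static uint16x8_t srlv16_sketch(uint16x8_t v, int16x8_t counts)
{
  // v[i] >> counts[i], logical: implemented as a left shift by negated counts.
  return vshlq_u16(v, vnegq_s16(counts));
}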
template<int i>
@@ -1810,9 +1827,15 @@ public:
return GSVector4i(vshlq_n_s32(v4s, i));
}
-ALWAYS_INLINE GSVector4i sll32(s32 i) const { return GSVector4i(vshlq_s32(v4s, vdupq_n_s32(i))); }
+ALWAYS_INLINE GSVector4i sll32(s32 i) const
+{
+return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vdupq_n_s32(i))));
+}
-ALWAYS_INLINE GSVector4i sllv32(const GSVector4i& v) const { return GSVector4i(vshlq_s32(v4s, v.v4s)); }
+ALWAYS_INLINE GSVector4i sllv32(const GSVector4i& v) const
+{
+return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), v.v4s)));
+}
template<int i>
ALWAYS_INLINE GSVector4i srl32() const
@@ -1843,17 +1866,17 @@ public:
template<int i>
ALWAYS_INLINE GSVector4i sll64() const
{
-return GSVector4i(vreinterpretq_s32_s64(vshlq_n_s64(vreinterpretq_s64_s32(v4s), i)));
+return GSVector4i(vreinterpretq_s32_u64(vshlq_n_u64(vreinterpretq_u64_s32(v4s), i)));
}
ALWAYS_INLINE GSVector4i sll64(s32 i) const
{
-return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vdupq_n_s64(i))));
+return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vdupq_n_s64(i))));
}
ALWAYS_INLINE GSVector4i sllv64(const GSVector4i& v) const
{
-return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s))));
+return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vreinterpretq_s64_s32(v.v4s))));
}
template<int i>
@@ -2771,7 +2794,7 @@ public:
ALWAYS_INLINE GSVector4 h2l(const GSVector4& a) const
{
-return GSVector4(vcombine_f32(vget_high_f32(v4s), vget_high_f32(a.v4s)));
+return GSVector4(vcombine_f32(vget_high_f32(a.v4s), vget_high_f32(v4s)));
}
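The h2l fix is an operand-order bug: vcombine_f32(low, high) places its first argument in the low half of the result. h2l(a) evidently wants a's upper two floats in the result's low half and this vector's upper floats above them (the shape of SSE's _mm_movehl_ps), so a.v4s has to come first. Standalone restatement (a sketch under that reading):

#include <arm_neon.h>

static float32x4_t h2l_sketch(float32x4_t v, float32x4_t a)
{
  // result = { a[2], a[3], v[2], v[3] }
  return vcombine_f32(vget_high_f32(a), vget_high_f32(v));
}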
ALWAYS_INLINE GSVector4 andnot(const GSVector4& v) const
@@ -3163,7 +3186,7 @@ public:
ALWAYS_INLINE GSVector4 lt64(const GSVector4& v) const
{
#ifdef CPU_ARCH_ARM64
-return GSVector4(vreinterpretq_f32_u64(vcgtq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
+return GSVector4(vreinterpretq_f32_u64(vcltq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
#else
GSVector4 ret;
ret.U64[0] = (F64[0] < v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
@@ -3230,7 +3253,7 @@ public:
ALWAYS_INLINE GSVector4 sqr64() const
{
#ifdef CPU_ARCH_ARM64
-return GSVector4(vreinterpretq_f32_f64(vsqrtq_f64(vreinterpretq_f64_f32(v4s))));
+return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v4s))));
#else
return GSVector4::f64(F64[0] * F64[0], F64[1] * F64[1]);
#endif
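The old sqr64 computed a square root rather than a square. Squaring is a lane-wise self-multiply, which also keeps the ARM64 path consistent with the scalar fallback's F64[i] * F64[i]. Standalone form (AArch64 sketch):

#include <arm_neon.h>

static float64x2_t sqr64_sketch(float64x2_t v)
{
  return vmulq_f64(v, v); // v[i] * v[i] per lane; vsqrtq_f64 computes sqrt instead
}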