Common: Fix a bunch of errors in ARM vector wrapper
ARM64 passes now at least.
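Side note (not part of the diff below): the recurring signed-to-unsigned reinterprets exist because NEON's vshl shifts right whenever the per-lane shift count is negative, and the element type then decides between an arithmetic and a logical right shift. srlv16() is built from vshl on negated counts, so it has to operate on u16 lanes to zero-fill instead of sign-extending. A minimal standalone sketch of that behaviour, assuming an AArch64 toolchain (illustration only, not code from this patch):

// Illustration only: signed vs. unsigned vshl with a negative shift count.
// Build on an AArch64 host, e.g. g++ -O2 vshl_demo.cpp.
#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

int main()
{
  const int16x8_t value = vdupq_n_s16(INT16_MIN); // 0x8000, high bit set
  const int16x8_t shift = vdupq_n_s16(-4);        // negative => shift right by 4

  // Signed lanes: vshl performs an arithmetic right shift, replicating the sign bit -> 0xF800.
  const int16x8_t arith = vshlq_s16(value, shift);

  // Unsigned lanes: vshl performs a logical right shift, zero-filling -> 0x0800,
  // which is what a srlv-style helper is expected to produce.
  const uint16x8_t logical = vshlq_u16(vreinterpretq_u16_s16(value), shift);

  std::printf("arithmetic: 0x%04X\n", static_cast<unsigned>(static_cast<uint16_t>(vgetq_lane_s16(arith, 0))));
  std::printf("logical:    0x%04X\n", static_cast<unsigned>(vgetq_lane_u16(logical, 0)));
  return 0;
}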
@@ -194,24 +194,12 @@ TEST(GSVector2iTest, UnpackOperations)
   EXPECT_EQ(upl8_result.U8[5], 0);
   EXPECT_EQ(upl8_result.U8[6], 0);
   EXPECT_EQ(upl8_result.U8[7], 0);
-  EXPECT_EQ(upl8_result.U8[8], 0x56);
-  EXPECT_EQ(upl8_result.U8[9], 0);
-  EXPECT_EQ(upl8_result.U8[10], 0);
-  EXPECT_EQ(upl8_result.U8[11], 0);
-  EXPECT_EQ(upl8_result.U8[12], 0x78);
-  EXPECT_EQ(upl8_result.U8[13], 0);
-  EXPECT_EQ(upl8_result.U8[14], 0);
-  EXPECT_EQ(upl8_result.U8[15], 0);
 
   auto upl16_result = v1.upl16();
   EXPECT_EQ(upl16_result.U16[0], 0x12);
   EXPECT_EQ(upl16_result.U16[1], 0);
   EXPECT_EQ(upl16_result.U16[2], 0x34);
   EXPECT_EQ(upl16_result.U16[3], 0);
-  EXPECT_EQ(upl16_result.U16[4], 0x56);
-  EXPECT_EQ(upl16_result.U16[5], 0);
-  EXPECT_EQ(upl16_result.U16[6], 0x78);
-  EXPECT_EQ(upl16_result.U16[7], 0);
 }
 
 TEST(GSVector2iTest, TypeConversions)
@@ -806,20 +794,28 @@ TEST(GSVector4iTest, Shift64BitOperations)
 #ifdef GSVECTOR_HAS_SRLV
 TEST(GSVector4iTest, VariableShifts)
 {
-  GSVector4i v1(0x1000, 0x2000, 0x4000, 0x8000);
-  GSVector4i shift_amounts(1, 2, 3, 4);
+  GSVector4i v1(0x1000, 0x2000, 0x4000, 0x8000, 0x1000, 0x2000, 0x4000, 0x8000);
+  GSVector4i shift_amounts(1, 2, 3, 4, 1, 2, 3, 4);
 
   auto sllv16_result = v1.sllv16(shift_amounts);
   EXPECT_EQ(sllv16_result.U16[0], 0x2000); // 0x1000 << 1
   EXPECT_EQ(sllv16_result.U16[1], 0x8000); // 0x2000 << 2
   EXPECT_EQ(sllv16_result.U16[2], 0x0000); // 0x4000 << 3 (overflow)
   EXPECT_EQ(sllv16_result.U16[3], 0x0000); // 0x8000 << 4 (overflow)
+  EXPECT_EQ(sllv16_result.U16[4], 0x2000); // 0x1000 << 1
+  EXPECT_EQ(sllv16_result.U16[5], 0x8000); // 0x2000 << 2
+  EXPECT_EQ(sllv16_result.U16[6], 0x0000); // 0x4000 << 3 (overflow)
+  EXPECT_EQ(sllv16_result.U16[7], 0x0000); // 0x8000 << 4 (overflow)
 
   auto srlv16_result = v1.srlv16(shift_amounts);
   EXPECT_EQ(srlv16_result.U16[0], 0x0800); // 0x1000 >> 1
   EXPECT_EQ(srlv16_result.U16[1], 0x0800); // 0x2000 >> 2
   EXPECT_EQ(srlv16_result.U16[2], 0x0800); // 0x4000 >> 3
   EXPECT_EQ(srlv16_result.U16[3], 0x0800); // 0x8000 >> 4
+  EXPECT_EQ(srlv16_result.U16[4], 0x0800); // 0x1000 >> 1
+  EXPECT_EQ(srlv16_result.U16[5], 0x0800); // 0x2000 >> 2
+  EXPECT_EQ(srlv16_result.U16[6], 0x0800); // 0x4000 >> 3
+  EXPECT_EQ(srlv16_result.U16[7], 0x0800); // 0x8000 >> 4
 }
 #endif
 
@@ -1512,4 +1508,4 @@ TEST(GSVectorTest, Runion_IsCommutative)
   GSVector4 result2 = rect2.runion(rect1);
 
   EXPECT_TRUE(result1.eq(result2));
-}
+}

@@ -401,23 +401,23 @@ public:
   template<int i>
   ALWAYS_INLINE GSVector2i sll() const
   {
-    return GSVector2i(vreinterpret_s32_s8(vext_s8(vdup_n_s8(0), vreinterpret_s8_s32(v2s), 16 - i)));
+    return GSVector2i(vreinterpret_s32_s8(vext_s8(vdup_n_s8(0), vreinterpret_s8_s32(v2s), 8 - i)));
   }
 
   template<int i>
   ALWAYS_INLINE GSVector2i sll16() const
   {
-    return GSVector2i(vreinterpret_s32_s16(vshl_n_s16(vreinterpret_s16_s32(v2s), i)));
+    return GSVector2i(vreinterpret_s32_u16(vshl_n_u16(vreinterpret_u16_s32(v2s), i)));
   }
 
   ALWAYS_INLINE GSVector2i sll16(s32 i) const
   {
-    return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(i))));
+    return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vdup_n_s16(i))));
   }
 
   ALWAYS_INLINE GSVector2i sllv16(const GSVector2i& v) const
   {
-    return GSVector2i(vreinterpret_s32_s16(vshl_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
+    return GSVector2i(vreinterpret_s32_u16(vshl_u16(vreinterpret_u16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
   }
 
   template<int i>
@@ -459,9 +459,15 @@ public:
     return GSVector2i(vshl_n_s32(v2s, i));
   }
 
-  ALWAYS_INLINE GSVector2i sll32(s32 i) const { return GSVector2i(vshl_s32(v2s, vdup_n_s32(i))); }
+  ALWAYS_INLINE GSVector2i sll32(s32 i) const
+  {
+    return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), vdup_n_s32(i))));
+  }
 
-  ALWAYS_INLINE GSVector2i sllv32(const GSVector2i& v) const { return GSVector2i(vshl_s32(v2s, v.v2s)); }
+  ALWAYS_INLINE GSVector2i sllv32(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u32(vshl_u32(vreinterpret_u32_s32(v2s), v.v2s)));
+  }
 
   template<int i>
   ALWAYS_INLINE GSVector2i srl32() const
@@ -553,6 +559,16 @@ public:
     return GSVector2i(vreinterpret_s32_u16(vqsub_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
   }
 
+  ALWAYS_INLINE GSVector2i avg8(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u8(vrhadd_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(v.v2s))));
+  }
+
+  ALWAYS_INLINE GSVector2i avg16(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_u16(vrhadd_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(v.v2s))));
+  }
+
   ALWAYS_INLINE GSVector2i mul16l(const GSVector2i& v) const
   {
     return GSVector2i(vreinterpret_s32_s16(vmul_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s))));
@@ -707,7 +723,7 @@ public:
     return GSVector2i(vset_lane_s32(val, vdup_n_s32(0), 0));
   }
 
-  ALWAYS_INLINE static GSVector2i zext32(s32 v) { return GSVector2i(vset_lane_s32(v, vdup_n_s32(0), 0)); }
+  ALWAYS_INLINE static GSVector2i set32(s32 v) { return GSVector2i(vset_lane_s32(v, vdup_n_s32(0), 0)); }
 
   template<bool aligned>
   ALWAYS_INLINE static GSVector2i load(const void* p)
@@ -886,7 +902,7 @@ public:
   template<int mask>
   ALWAYS_INLINE GSVector2 blend32(const GSVector2& a) const
   {
-    return GSVector2(__builtin_shufflevector(v2s, a.v2s, (mask & 1) ? 4 : 0, (mask & 2) ? 5 : 1));
+    return GSVector2(__builtin_shufflevector(v2s, a.v2s, (mask & 1) ? 2 : 0, (mask & 2) ? 3 : 1));
   }
 
   ALWAYS_INLINE GSVector2 blend32(const GSVector2& a, const GSVector2& mask) const
@@ -1324,15 +1340,16 @@ public:
   ALWAYS_INLINE GSVector4i madd_s16(const GSVector4i& v) const
   {
 #ifdef CPU_ARCH_ARM64
-    const int32x4_t acc =
-      vmlal_s16(vdupq_n_s32(0), vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
-    return GSVector4i(vmlal_high_s16(acc, vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)));
+    const int32x4_t low =
+      vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
+    const int32x4_t high = vmull_high_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s));
+    return GSVector4i(vpaddq_s32(low, high));
 #else
     // borrowed from sse2neon
     const int32x4_t low =
-      vmlal_s16(vdupq_n_s32(0), vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
+      vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
     const int32x4_t high =
-      vmlal_s16(vdupq_n_s32(0), vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
+      vmull_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
     return GSVector4i(vcombine_s32(vpadd_s32(vget_low_s32(low), vget_high_s32(low)),
                                    vpadd_s32(vget_low_s32(high), vget_high_s32(high))));
 #endif
@@ -1756,17 +1773,17 @@ public:
   template<int i>
   ALWAYS_INLINE GSVector4i sll16() const
   {
-    return GSVector4i(vreinterpretq_s32_s16(vshlq_n_s16(vreinterpretq_s16_s32(v4s), i)));
+    return GSVector4i(vreinterpretq_s32_u16(vshlq_n_u16(vreinterpretq_u16_s32(v4s), i)));
   }
 
   ALWAYS_INLINE GSVector4i sll16(s32 i) const
   {
-    return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(i))));
+    return GSVector4i(vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vdupq_n_s16(i))));
   }
 
   ALWAYS_INLINE GSVector4i sllv16(const GSVector4i& v) const
   {
-    return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
+    return GSVector4i(vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
   }
 
   template<int i>
@@ -1783,7 +1800,7 @@ public:
   ALWAYS_INLINE GSVector4i srlv16(const GSVector4i& v) const
   {
     return GSVector4i(
-      vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s)))));
+      vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s)))));
   }
 
   template<int i>
@@ -1810,9 +1827,15 @@ public:
     return GSVector4i(vshlq_n_s32(v4s, i));
   }
 
-  ALWAYS_INLINE GSVector4i sll32(s32 i) const { return GSVector4i(vshlq_s32(v4s, vdupq_n_s32(i))); }
+  ALWAYS_INLINE GSVector4i sll32(s32 i) const
+  {
+    return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vdupq_n_s32(i))));
+  }
 
-  ALWAYS_INLINE GSVector4i sllv32(const GSVector4i& v) const { return GSVector4i(vshlq_s32(v4s, v.v4s)); }
+  ALWAYS_INLINE GSVector4i sllv32(const GSVector4i& v) const
+  {
+    return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), v.v4s)));
+  }
 
   template<int i>
   ALWAYS_INLINE GSVector4i srl32() const
@@ -1843,17 +1866,17 @@ public:
   template<int i>
   ALWAYS_INLINE GSVector4i sll64() const
   {
-    return GSVector4i(vreinterpretq_s32_s64(vshlq_n_s64(vreinterpretq_s64_s32(v4s), i)));
+    return GSVector4i(vreinterpretq_s32_u64(vshlq_n_u64(vreinterpretq_u64_s32(v4s), i)));
   }
 
   ALWAYS_INLINE GSVector4i sll64(s32 i) const
   {
-    return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vdupq_n_s64(i))));
+    return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vdupq_n_s64(i))));
   }
 
   ALWAYS_INLINE GSVector4i sllv64(const GSVector4i& v) const
   {
-    return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s))));
+    return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vreinterpretq_s64_s32(v.v4s))));
   }
 
   template<int i>
@@ -2771,7 +2794,7 @@ public:
 
   ALWAYS_INLINE GSVector4 h2l(const GSVector4& a) const
   {
-    return GSVector4(vcombine_f32(vget_high_f32(v4s), vget_high_f32(a.v4s)));
+    return GSVector4(vcombine_f32(vget_high_f32(a.v4s), vget_high_f32(v4s)));
   }
 
   ALWAYS_INLINE GSVector4 andnot(const GSVector4& v) const
@@ -3163,7 +3186,7 @@ public:
   ALWAYS_INLINE GSVector4 lt64(const GSVector4& v) const
   {
 #ifdef CPU_ARCH_ARM64
-    return GSVector4(vreinterpretq_f32_u64(vcgtq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
+    return GSVector4(vreinterpretq_f32_u64(vcltq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
 #else
     GSVector4 ret;
     ret.U64[0] = (F64[0] < v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
@@ -3230,7 +3253,7 @@ public:
   ALWAYS_INLINE GSVector4 sqr64() const
   {
 #ifdef CPU_ARCH_ARM64
-    return GSVector4(vreinterpretq_f32_f64(vsqrtq_f64(vreinterpretq_f64_f32(v4s))));
+    return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v4s))));
 #else
     return GSVector4::f64(F64[0] * F64[0], F64[1] * F64[1]);
 #endif