diff --git a/src/common/gsvector_neon.h b/src/common/gsvector_neon.h index cb0112a4a..46354eb52 100644 --- a/src/common/gsvector_neon.h +++ b/src/common/gsvector_neon.h @@ -2754,9 +2754,15 @@ public: } template - ALWAYS_INLINE int extract32() const + ALWAYS_INLINE GSVector4 insert32(float v) const { - return vgetq_lane_s32(vreinterpretq_s32_f32(v4s), i); + return GSVector4(vsetq_lane_f32(v, v4s, i)); + } + + template + ALWAYS_INLINE float extract32() const + { + return vgetq_lane_f32(v4s, i); } template diff --git a/src/common/gsvector_nosimd.h b/src/common/gsvector_nosimd.h index ed03504b2..2269ef79e 100644 --- a/src/common/gsvector_nosimd.h +++ b/src/common/gsvector_nosimd.h @@ -1951,9 +1951,17 @@ public: } template - ALWAYS_INLINE int extract32() const + ALWAYS_INLINE GSVector4 insert32(float v) const { - return I32[i]; + GSVector4 ret(*this); + ret.F32[i] = v; + return ret; + } + + template + ALWAYS_INLINE float extract32() const + { + return F32[i]; } template diff --git a/src/common/gsvector_sse.h b/src/common/gsvector_sse.h index 615558727..7e9fb6761 100644 --- a/src/common/gsvector_sse.h +++ b/src/common/gsvector_sse.h @@ -325,7 +325,7 @@ public: #else constexpr s32 bit1 = ((mask & 2) * 3) << 1; constexpr s32 bit0 = (mask & 1) * 3; - return blend16(v); + return blend16 < bit1 | bit0 > (v); #endif } @@ -1334,7 +1334,7 @@ public: constexpr s32 bit2 = ((mask & 4) * 3) << 2; constexpr s32 bit1 = ((mask & 2) * 3) << 1; constexpr s32 bit0 = (mask & 1) * 3; - return blend16(v); + return blend16 < bit3 | bit2 | bit1 | bit0 > (v); #endif } @@ -2037,17 +2037,17 @@ public: ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const { return GSVector4(_mm_hsub_ps(m, v.m)); } - NEVER_INLINE float dot(const GSVector4& v) const - { #ifdef CPU_ARCH_SSE41 - return _mm_cvtss_f32(_mm_dp_ps(m, v.m, 0xf1)); + ALWAYS_INLINE float dot(const GSVector4& v) const { return _mm_cvtss_f32(_mm_dp_ps(m, v.m, 0xf1)); } #else + float dot(const GSVector4& v) const + { __m128 tmp = _mm_mul_ps(m, v.m); tmp = _mm_add_ps(tmp, _mm_movehl_ps(tmp, tmp)); // (x+z, y+w, ..., ...) tmp = _mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(3, 2, 1, 1))); return _mm_cvtss_f32(tmp); -#endif } +#endif ALWAYS_INLINE GSVector4 sat(const GSVector4& min, const GSVector4& max) const { @@ -2135,10 +2135,28 @@ public: } template - ALWAYS_INLINE int extract32() const + ALWAYS_INLINE GSVector4 insert32(float v) const { #ifdef CPU_ARCH_SSE41 - return _mm_extract_ps(m, i); + if constexpr (i == 0) + return GSVector4(_mm_move_ss(m, _mm_load_ss(&v))); + else + return GSVector4(_mm_insert_ps(m, _mm_load_ss(&v), _MM_MK_INSERTPS_NDX(0, i, 0))); +#else + GSVector4 ret(*this); + ret.F32[i] = v; + return ret; +#endif + } + + template + ALWAYS_INLINE float extract32() const + { +#ifdef CPU_ARCH_SSE41 + if constexpr (i == 0) + return _mm_cvtss_f32(m); + else + return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(i, i, i, i))); #else return F32[i]; #endif diff --git a/src/common/intrin.h b/src/common/intrin.h index ae1d13da5..d4ef62a7c 100644 --- a/src/common/intrin.h +++ b/src/common/intrin.h @@ -119,7 +119,7 @@ ALWAYS_INLINE static void MultiPause() _mm_pause(); _mm_pause(); _mm_pause(); -#elif defined(CPU_ARCH_ARM64) && defined(_MSC_VER) +#elif defined(CPU_ARCH_ARM64) && defined(_MSC_VER) && !defined(__clang__) __isb(_ARM64_BARRIER_SY); __isb(_ARM64_BARRIER_SY); __isb(_ARM64_BARRIER_SY);