diff --git a/src/common/intrin.h b/src/common/intrin.h
index b04236719..ae1d13da5 100644
--- a/src/common/intrin.h
+++ b/src/common/intrin.h
@@ -44,6 +44,17 @@
 #include <alloca.h> // alloca
 #endif
 
+/// Helper to disable loop vectorization.
+#if defined(__clang__)
+#define DONT_VECTORIZE_THIS_LOOP _Pragma("clang loop vectorize(disable)")
+#elif defined(_MSC_VER)
+#define DONT_VECTORIZE_THIS_LOOP __pragma(loop(no_vector))
+#elif defined(__GNUC__)
+#define DONT_VECTORIZE_THIS_LOOP _Pragma("GCC novector")
+#else
+#define DONT_VECTORIZE_THIS_LOOP
+#endif
+
 /// Only currently using 128-bit vectors at max.
 static constexpr u32 VECTOR_ALIGNMENT = 16;
 
diff --git a/src/util/image.cpp b/src/util/image.cpp
index 1ff83d603..d645a352c 100644
--- a/src/util/image.cpp
+++ b/src/util/image.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-FileCopyrightText: 2019-2025 Connor McLaughlin <stenzek@gmail.com>
 // SPDX-License-Identifier: CC-BY-NC-ND-4.0
 
 #include "image.h"
@@ -583,12 +583,46 @@ std::optional<Image> Image::ConvertToRGBA8(Error* error) const
     case ImageFormat::RGB565:
     {
       ret = Image(m_width, m_height, ImageFormat::RGBA8);
+
+      constexpr u32 pixels_per_vec = 8;
+      [[maybe_unused]] const u32 aligned_width = Common::AlignDownPow2(m_width, pixels_per_vec);
+
       for (u32 y = 0; y < m_height; y++)
       {
         const u8* pixels_in = GetRowPixels(y);
         u8* pixels_out = ret->GetRowPixels(y);
+        u32 x = 0;
 
-        for (u32 x = 0; x < m_width; x++)
+#ifdef CPU_ARCH_SIMD
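+        // Unpack eight RGB565 pixels per iteration: isolate each field with
+        // 16-bit shifts, widen the 5-bit channels as (c << 3) | (c & 7) and the
+        // 6-bit green as (c << 2) | (c & 3), then zero-extend each half to
+        // 32 bits and repack as RGBA8 with a solid alpha byte.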
+        for (; x < aligned_width; x += pixels_per_vec)
+        {
+          GSVector4i rgb565 = GSVector4i::load(pixels_in);
+          pixels_in += sizeof(u16) * pixels_per_vec;
+
+          GSVector4i r = rgb565.srl16<11>();
+          r = r.sll16<3>() | r.sll16<13>().srl16<13>();
+
+          GSVector4i g = rgb565.sll16<5>().srl16<10>();
+          g = g.sll16<2>() | g.sll16<14>().srl16<14>();
+
+          GSVector4i b = rgb565.sll16<11>().srl16<11>();
+          b = b.sll16<3>() | b.sll16<13>().srl16<13>();
+
+          const GSVector4i low =
+            r.u16to32() | g.u16to32().sll32<8>() | b.u16to32().sll32<16>() | GSVector4i::cxpr(0xFF000000);
+          const GSVector4i high = r.uph64().u16to32() | g.uph64().u16to32().sll32<8>() |
+                                  b.uph64().u16to32().sll32<16>() | GSVector4i::cxpr(0xFF000000);
+
+          GSVector4i::store(pixels_out, low);
+          pixels_out += sizeof(GSVector4i);
+
+          GSVector4i::store(pixels_out, high);
+          pixels_out += sizeof(GSVector4i);
+        }
+#endif
+
+        DONT_VECTORIZE_THIS_LOOP
+        for (; x < m_width; x++)
         {
           // RGB565 -> RGBA8
           u16 pixel_in;
@@ -609,12 +643,48 @@ std::optional<Image> Image::ConvertToRGBA8(Error* error) const
     case ImageFormat::RGB5A1:
     {
       ret = Image(m_width, m_height, ImageFormat::RGBA8);
+
+      constexpr u32 pixels_per_vec = 8;
+      [[maybe_unused]] const u32 aligned_width = Common::AlignDownPow2(m_width, pixels_per_vec);
+
       for (u32 y = 0; y < m_height; y++)
       {
         const u8* pixels_in = GetRowPixels(y);
         u8* pixels_out = ret->GetRowPixels(y);
+        u32 x = 0;
 
-        for (u32 x = 0; x < m_width; x++)
+#ifdef CPU_ARCH_SIMD
+        for (; x < aligned_width; x += pixels_per_vec)
+        {
+          GSVector4i rgb5a1 = GSVector4i::load(pixels_in);
+          pixels_in += sizeof(u16) * pixels_per_vec;
+
+          GSVector4i r = rgb5a1.sll16<1>().srl16<11>();
+          r = r.sll16<3>() | r.sll16<13>().srl16<13>();
+
+          GSVector4i g = rgb5a1.sll16<6>().srl16<11>();
+          g = g.sll16<3>() | g.sll16<13>().srl16<13>();
+
+          GSVector4i b = rgb5a1.sll16<11>().srl16<11>();
+          b = b.sll16<3>() | b.sll16<13>().srl16<13>();
+
+          GSVector4i a = rgb5a1.sra16<7>().srl16<8>();
+
+          const GSVector4i low =
+            r.u16to32() | g.u16to32().sll32<8>() | b.u16to32().sll32<16>() | a.u16to32().sll32<24>();
+          const GSVector4i high = r.uph64().u16to32() | g.uph64().u16to32().sll32<8>() |
+                                  b.uph64().u16to32().sll32<16>() | a.uph64().u16to32().sll32<24>();
+
+          GSVector4i::store(pixels_out, low);
+          pixels_out += sizeof(GSVector4i);
+
+          GSVector4i::store(pixels_out, high);
+          pixels_out += sizeof(GSVector4i);
+        }
+#endif
+
+        DONT_VECTORIZE_THIS_LOOP
+        for (; x < m_width; x++)
         {
           // RGB5A1 -> RGBA8
           u16 pixel_in;
@@ -633,6 +703,69 @@ std::optional<Image> Image::ConvertToRGBA8(Error* error) const
     }
     break;
 
+    case ImageFormat::A1BGR5:
+    {
+      ret = Image(m_width, m_height, ImageFormat::RGBA8);
+
+      constexpr u32 pixels_per_vec = 8;
+      [[maybe_unused]] const u32 aligned_width = Common::AlignDownPow2(m_width, pixels_per_vec);
+
+      for (u32 y = 0; y < m_height; y++)
+      {
+        const u8* pixels_in = GetRowPixels(y);
+        u8* pixels_out = ret->GetRowPixels(y);
+        u32 x = 0;
+
+#ifdef CPU_ARCH_SIMD
+        for (; x < aligned_width; x += pixels_per_vec)
+        {
+          GSVector4i a1bgr5 = GSVector4i::load(pixels_in);
+          pixels_in += sizeof(u16) * pixels_per_vec;
+
+          GSVector4i r = a1bgr5.srl16<11>();
+          r = r.sll16<3>() | r.sll16<13>().srl16<13>();
+
+          GSVector4i g = a1bgr5.sll16<5>().srl16<11>();
+          g = g.sll16<3>() | g.sll16<13>().srl16<13>();
+
+          GSVector4i b = a1bgr5.sll16<10>().srl16<11>();
+          b = b.sll16<3>() | b.sll16<13>().srl16<13>();
+
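+          // Broadcast the A1 bit to a full byte: move it into the sign bit,
+          // arithmetic-shift right to smear it across the high byte, then
+          // logical-shift down so each 16-bit lane holds 0x00 or 0xFF.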
+          GSVector4i a = a1bgr5.sll16<15>().sra16<7>().srl16<8>();
+
+          const GSVector4i low =
+            r.u16to32() | g.u16to32().sll32<8>() | b.u16to32().sll32<16>() | a.u16to32().sll32<24>();
+          const GSVector4i high = r.uph64().u16to32() | g.uph64().u16to32().sll32<8>() |
+                                  b.uph64().u16to32().sll32<16>() | a.uph64().u16to32().sll32<24>();
+
+          GSVector4i::store(pixels_out, low);
+          pixels_out += sizeof(GSVector4i);
+
+          GSVector4i::store(pixels_out, high);
+          pixels_out += sizeof(GSVector4i);
+        }
+#endif
+
+        DONT_VECTORIZE_THIS_LOOP
+        for (; x < m_width; x++)
+        {
+          // A1BGR5 -> RGBA8
+          u16 pixel_in;
+          std::memcpy(&pixel_in, pixels_in, sizeof(u16));
+          pixels_in += sizeof(u16);
+          const u8 a1 = Truncate8(pixel_in & 0x01);
+          const u8 r5 = Truncate8((pixel_in >> 11) & 0x1F);
+          const u8 g5 = Truncate8((pixel_in >> 6) & 0x1F);
+          const u8 b5 = Truncate8((pixel_in >> 1) & 0x1F);
+          const u32 rgba8 = ZeroExtend32((r5 << 3) | (r5 & 7)) | (ZeroExtend32((g5 << 3) | (g5 & 7)) << 8) |
+                            (ZeroExtend32((b5 << 3) | (b5 & 7)) << 16) | (a1 ? 0xFF000000u : 0u);
+          std::memcpy(pixels_out, &rgba8, sizeof(u32));
+          pixels_out += sizeof(u32);
+        }
+      }
+    }
+    break;
+
     case ImageFormat::BGR8:
     {
       ret = Image(m_width, m_height, ImageFormat::RGBA8);
diff --git a/src/util/image.h b/src/util/image.h
index bf02899d7..239cc5f8a 100644
--- a/src/util/image.h
+++ b/src/util/image.h
@@ -1,10 +1,11 @@
-// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-FileCopyrightText: 2019-2025 Connor McLaughlin <stenzek@gmail.com>
 // SPDX-License-Identifier: CC-BY-NC-ND-4.0
 
 #pragma once
 
 #include "common/align.h"
 #include "common/heap_array.h"
+#include "common/intrin.h"
 #include "common/types.h"
 
 #include <memory>
@@ -60,12 +61,13 @@ public:
   ALWAYS_INLINE u32 GetHeight() const { return m_height; }
   ALWAYS_INLINE u32 GetPitch() const { return m_pitch; }
   ALWAYS_INLINE ImageFormat GetFormat() const { return m_format; }
-  ALWAYS_INLINE const u8* GetPixels() const { return m_pixels.get(); }
-  ALWAYS_INLINE u8* GetPixels() { return m_pixels.get(); }
-  ALWAYS_INLINE const u8* GetRowPixels(u32 y) const { return &m_pixels[y * m_pitch]; }
-  ALWAYS_INLINE u8* GetRowPixels(u32 y) { return &m_pixels[y * m_pitch]; }
-  // ALWAYS_INLINE void SetPixel(u32 x, u32 y, PixelType pixel) { m_pixels[y * m_width + x] = pixel; }
-  // ALWAYS_INLINE PixelType GetPixel(u32 x, u32 y) const { return m_pixels[y * m_width + x]; }
+  ALWAYS_INLINE const u8* GetPixels() const { return std::assume_aligned<VECTOR_ALIGNMENT>(m_pixels.get()); }
+  ALWAYS_INLINE u8* GetPixels() { return std::assume_aligned<VECTOR_ALIGNMENT>(m_pixels.get()); }
+  ALWAYS_INLINE const u8* GetRowPixels(u32 y) const
+  {
+    return std::assume_aligned<VECTOR_ALIGNMENT>(&m_pixels[y * m_pitch]);
+  }
+  ALWAYS_INLINE u8* GetRowPixels(u32 y) { return std::assume_aligned<VECTOR_ALIGNMENT>(&m_pixels[y * m_pitch]); }
 
   u32 GetBlocksWide() const;
   u32 GetBlocksHigh() const;
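For reference, a minimal standalone scalar sketch of the channel widening the vectorized RGB565 path above performs; the helper name and test value are illustrative only and are not part of this change:

#include <cstdint>
#include <cstdio>

// Hypothetical helper mirroring the diff's RGB565 -> RGBA8 widening:
// 5-bit channels expand as (c << 3) | (c & 7), the 6-bit green as
// (c << 2) | (c & 3), so a full-scale field (0x1F / 0x3F) maps exactly to 0xFF.
static uint32_t RGB565ToRGBA8(uint16_t pixel)
{
  const uint32_t r5 = (pixel >> 11) & 0x1F;
  const uint32_t g6 = (pixel >> 5) & 0x3F;
  const uint32_t b5 = pixel & 0x1F;
  const uint32_t r8 = (r5 << 3) | (r5 & 7);
  const uint32_t g8 = (g6 << 2) | (g6 & 3);
  const uint32_t b8 = (b5 << 3) | (b5 & 7);
  return r8 | (g8 << 8) | (b8 << 16) | 0xFF000000u; // R,G,B,A in memory order
}

int main()
{
  std::printf("%08x\n", RGB565ToRGBA8(0xFFFF)); // prints ffffffff (opaque white)
  return 0;
}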