diff --git a/src/common/string_util.cpp b/src/common/string_util.cpp index 59cf34a95..ccc506c86 100644 --- a/src/common/string_util.cpp +++ b/src/common/string_util.cpp @@ -1,8 +1,9 @@ -// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin +// SPDX-FileCopyrightText: 2019-2025 Connor McLaughlin // SPDX-License-Identifier: CC-BY-NC-ND-4.0 #include "string_util.h" #include "assert.h" +#include "bitutils.h" #include #include @@ -20,6 +21,16 @@ #include "windows_headers.h" #endif +namespace StringUtil { /* Forward declarations of the file-local (static) templates that the public UTF-16 decode functions forward to. */ + +template +static size_t DecodeUTF16Impl(const void* bytes, size_t pos, size_t size, char32_t* ch); + +template +static std::string DecodeUTF16StringImpl(const void* bytes, size_t size); + +} // namespace StringUtil + bool StringUtil::WildcardMatch(const char* subject, const char* mask, bool case_sensitive /*= true*/) { if (case_sensitive) @@ -629,17 +640,20 @@ size_t StringUtil::EncodeAndAppendUTF16(void* utf16, size_t pos, size_t size, ch } } -size_t StringUtil::DecodeUTF16(const void* bytes, size_t pos, size_t length, char32_t* ch) +template +size_t StringUtil::DecodeUTF16Impl(const void* bytes, size_t pos, size_t size, char32_t* ch) /* pos is an index in 16-bit units (the byte offset is pos * sizeof(u16)); size is compared against pos, so it is in the same units. */ { const u8* const utf16_bytes = std::assume_aligned(static_cast(bytes)) + pos * sizeof(u16); u16 high; std::memcpy(&high, utf16_bytes, sizeof(high)); + if constexpr (swap) + high = ByteSwap(high); /* the byte swap is applied before the surrogate range checks below */ // High surrogate? if (high >= 0xD800 && high <= 0xDBFF) [[unlikely]] { - if (length < 2) [[unlikely]] + if ((size - pos) < 2) [[unlikely]] /* fewer than two 16-bit units remain at pos */ { // Missing low surrogate. 
*ch = UNICODE_REPLACEMENT_CHARACTER; @@ -648,6 +662,9 @@ size_t StringUtil::DecodeUTF16(const void* bytes, size_t pos, size_t length, cha u16 low; std::memcpy(&low, utf16_bytes + sizeof(u16), sizeof(low)); + if constexpr (swap) + low = ByteSwap(low); /* same swap as for the high unit, applied before the low-surrogate range check */ + if (low >= 0xDC00 && low <= 0xDFFF) [[likely]] { *ch = static_cast(((static_cast(high) - 0xD800u) << 10) + ((static_cast(low) - 0xDC00)) + @@ -669,6 +686,44 @@ size_t StringUtil::DecodeUTF16(const void* bytes, size_t pos, size_t length, cha } } +template +std::string StringUtil::DecodeUTF16StringImpl(const void* bytes, size_t size) /* Decodes the buffer one codepoint at a time and appends each codepoint to the returned string via EncodeAndAppendUTF8. */ +{ + std::string dest; + dest.reserve(size); /* size is the input length in bytes, used here as the initial capacity */ + + const size_t u16_size = size / 2; /* number of whole 16-bit units; an odd trailing byte is never decoded */ + for (size_t pos = 0; pos < u16_size;) + { + char32_t codepoint; + const size_t byte_len = DecodeUTF16Impl(bytes, pos, u16_size, &codepoint); /* despite the name, this value is added to pos, which indexes 16-bit units, not bytes */ + StringUtil::EncodeAndAppendUTF8(dest, codepoint); + pos += byte_len; + } + + return dest; +} + +size_t StringUtil::DecodeUTF16(const void* bytes, size_t pos, size_t size, char32_t* codepoint) /* NOTE(review): the template argument on the Impl calls in these four wrappers is not visible in this extract; presumably the BE variants select the byte-swapping instantiation - confirm. */ +{ + return DecodeUTF16Impl(bytes, pos, size, codepoint); +} + +size_t StringUtil::DecodeUTF16BE(const void* bytes, size_t pos, size_t size, char32_t* codepoint) +{ + return DecodeUTF16Impl(bytes, pos, size, codepoint); +} + +std::string StringUtil::DecodeUTF16String(const void* bytes, size_t size) +{ + return DecodeUTF16StringImpl(bytes, size); +} + +std::string StringUtil::DecodeUTF16BEString(const void* bytes, size_t size) +{ + return DecodeUTF16StringImpl(bytes, size); +} + std::string StringUtil::Ellipsise(const std::string_view str, u32 max_length, const char* ellipsis /*= "..."*/) { std::string ret; diff --git a/src/common/string_util.h b/src/common/string_util.h index 5cc005ed0..66322eb1d 100644 --- a/src/common/string_util.h +++ b/src/common/string_util.h @@ -461,8 +461,13 @@ size_t DecodeUTF8(const std::string& str, size_t offset, char32_t* ch); size_t EncodeAndAppendUTF16(void* utf16, size_t pos, size_t size, char32_t codepoint); /// Decodes UTF-16 to a single 
unicode codepoint. -/// Returns the number of bytes the codepoint took in the original string. +/// Returns the number of 16-bit units the codepoint took in the original string. size_t DecodeUTF16(const void* bytes, size_t pos, size_t size, char32_t* codepoint); +size_t DecodeUTF16BE(const void* bytes, size_t pos, size_t size, char32_t* codepoint); /* Same parameters and return convention as DecodeUTF16. NOTE(review): presumably reads big-endian units - confirm against the .cpp instantiation. */ + +/// Decodes a UTF-16 string to a UTF-8 string. +std::string DecodeUTF16String(const void* bytes, size_t size); /* size is the input length in bytes; the implementation halves it to get the 16-bit unit count */ +std::string DecodeUTF16BEString(const void* bytes, size_t size); /* size is in bytes here as well */ // Replaces the end of a string with ellipsis if it exceeds the specified length. std::string Ellipsise(const std::string_view str, u32 max_length, const char* ellipsis = "...");