StringUtil: Add DecodeUTF16String() and BE variants

This commit is contained in:
Stenzek 2025-05-17 20:25:45 +10:00
parent 17dfb95d6a
commit e3c0eed3df
No known key found for this signature in database
2 changed files with 64 additions and 4 deletions

View File

@ -1,8 +1,9 @@
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com> // SPDX-FileCopyrightText: 2019-2025 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: CC-BY-NC-ND-4.0 // SPDX-License-Identifier: CC-BY-NC-ND-4.0
#include "string_util.h" #include "string_util.h"
#include "assert.h" #include "assert.h"
#include "bitutils.h"
#include <cctype> #include <cctype>
#include <codecvt> #include <codecvt>
@ -20,6 +21,16 @@
#include "windows_headers.h" #include "windows_headers.h"
#endif #endif
namespace StringUtil {
template<bool swap>
static size_t DecodeUTF16Impl(const void* bytes, size_t pos, size_t size, char32_t* ch);
template<bool swap>
static std::string DecodeUTF16StringImpl(const void* bytes, size_t size);
} // namespace StringUtil
bool StringUtil::WildcardMatch(const char* subject, const char* mask, bool case_sensitive /*= true*/) bool StringUtil::WildcardMatch(const char* subject, const char* mask, bool case_sensitive /*= true*/)
{ {
if (case_sensitive) if (case_sensitive)
@ -629,17 +640,20 @@ size_t StringUtil::EncodeAndAppendUTF16(void* utf16, size_t pos, size_t size, ch
} }
} }
size_t StringUtil::DecodeUTF16(const void* bytes, size_t pos, size_t length, char32_t* ch) template<bool swap>
size_t StringUtil::DecodeUTF16Impl(const void* bytes, size_t pos, size_t size, char32_t* ch)
{ {
const u8* const utf16_bytes = std::assume_aligned<sizeof(u16)>(static_cast<const u8*>(bytes)) + pos * sizeof(u16); const u8* const utf16_bytes = std::assume_aligned<sizeof(u16)>(static_cast<const u8*>(bytes)) + pos * sizeof(u16);
u16 high; u16 high;
std::memcpy(&high, utf16_bytes, sizeof(high)); std::memcpy(&high, utf16_bytes, sizeof(high));
if constexpr (swap)
high = ByteSwap(high);
// High surrogate? // High surrogate?
if (high >= 0xD800 && high <= 0xDBFF) [[unlikely]] if (high >= 0xD800 && high <= 0xDBFF) [[unlikely]]
{ {
if (length < 2) [[unlikely]] if ((size - pos) < 2) [[unlikely]]
{ {
// Missing low surrogate. // Missing low surrogate.
*ch = UNICODE_REPLACEMENT_CHARACTER; *ch = UNICODE_REPLACEMENT_CHARACTER;
@ -648,6 +662,9 @@ size_t StringUtil::DecodeUTF16(const void* bytes, size_t pos, size_t length, cha
u16 low; u16 low;
std::memcpy(&low, utf16_bytes + sizeof(u16), sizeof(low)); std::memcpy(&low, utf16_bytes + sizeof(u16), sizeof(low));
if constexpr (swap)
low = ByteSwap(low);
if (low >= 0xDC00 && low <= 0xDFFF) [[likely]] if (low >= 0xDC00 && low <= 0xDFFF) [[likely]]
{ {
*ch = static_cast<char32_t>(((static_cast<u32>(high) - 0xD800u) << 10) + ((static_cast<u32>(low) - 0xDC00)) + *ch = static_cast<char32_t>(((static_cast<u32>(high) - 0xD800u) << 10) + ((static_cast<u32>(low) - 0xDC00)) +
@ -669,6 +686,44 @@ size_t StringUtil::DecodeUTF16(const void* bytes, size_t pos, size_t length, cha
} }
} }
template<bool swap>
std::string StringUtil::DecodeUTF16StringImpl(const void* bytes, size_t size)
{
std::string dest;
dest.reserve(size);
const size_t u16_size = size / 2;
for (size_t pos = 0; pos < u16_size;)
{
char32_t codepoint;
const size_t byte_len = DecodeUTF16Impl<swap>(bytes, pos, u16_size, &codepoint);
StringUtil::EncodeAndAppendUTF8(dest, codepoint);
pos += byte_len;
}
return dest;
}
size_t StringUtil::DecodeUTF16(const void* bytes, size_t pos, size_t size, char32_t* codepoint)
{
return DecodeUTF16Impl<false>(bytes, pos, size, codepoint);
}
size_t StringUtil::DecodeUTF16BE(const void* bytes, size_t pos, size_t size, char32_t* codepoint)
{
return DecodeUTF16Impl<true>(bytes, pos, size, codepoint);
}
std::string StringUtil::DecodeUTF16String(const void* bytes, size_t size)
{
return DecodeUTF16StringImpl<false>(bytes, size);
}
std::string StringUtil::DecodeUTF16BEString(const void* bytes, size_t size)
{
return DecodeUTF16StringImpl<true>(bytes, size);
}
std::string StringUtil::Ellipsise(const std::string_view str, u32 max_length, const char* ellipsis /*= "..."*/) std::string StringUtil::Ellipsise(const std::string_view str, u32 max_length, const char* ellipsis /*= "..."*/)
{ {
std::string ret; std::string ret;

View File

@ -461,8 +461,13 @@ size_t DecodeUTF8(const std::string& str, size_t offset, char32_t* ch);
size_t EncodeAndAppendUTF16(void* utf16, size_t pos, size_t size, char32_t codepoint); size_t EncodeAndAppendUTF16(void* utf16, size_t pos, size_t size, char32_t codepoint);
/// Decodes UTF-16 to a single unicode codepoint. /// Decodes UTF-16 to a single unicode codepoint.
/// Returns the number of bytes the codepoint took in the original string. /// Returns the number of 16-bit units the codepoint took in the original string.
size_t DecodeUTF16(const void* bytes, size_t pos, size_t size, char32_t* codepoint); size_t DecodeUTF16(const void* bytes, size_t pos, size_t size, char32_t* codepoint);
size_t DecodeUTF16BE(const void* bytes, size_t pos, size_t size, char32_t* codepoint);
/// Decodes a UTF-16 string to a UTF-8 string.
std::string DecodeUTF16String(const void* bytes, size_t size);
std::string DecodeUTF16BEString(const void* bytes, size_t size);
// Replaces the end of a string with ellipsis if it exceeds the specified length. // Replaces the end of a string with ellipsis if it exceeds the specified length.
std::string Ellipsise(const std::string_view str, u32 max_length, const char* ellipsis = "..."); std::string Ellipsise(const std::string_view str, u32 max_length, const char* ellipsis = "...");