mirror of
https://github.com/alexpasmantier/television.git
synced 2025-06-06 19:45:23 +00:00
feat(i18n): improve support for non-western scripts (#305)
This PR adds support for katakana and hiragana, hangul, Thai, Devanagari, and emojis. Some special characters, such as the typographic parentheses used in Japanese, do not seem to be covered by the Unicode ranges added in this pull request. To ensure complete support, native speakers should add further Unicode ranges where possible to avoid errors. Fixes #299
This commit is contained in:
parent
cb565d667e
commit
37b71b7a88
@ -225,6 +225,23 @@ impl Default for ReplaceNonPrintableConfig {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns `true` when `ch` falls inside one of the emoji-related Unicode
/// ranges listed in the body.
///
/// Only these five ranges are recognised; a character encoded anywhere else
/// (for example `U+1F650` or `U+1FA00`) yields `false`.
fn is_emoji(ch: char) -> bool {
    matches!(
        ch,
        // emoticons
        '\u{1F600}'..='\u{1F64F}'
        // misc. symbols and pictograms
        | '\u{1F300}'..='\u{1F5FF}'
        // transports / map
        | '\u{1F680}'..='\u{1F6FF}'
        // additional symbols and pictograms
        | '\u{1F900}'..='\u{1F9FF}'
        // flags
        | '\u{1F1E6}'..='\u{1F1FF}'
    )
}
|
||||||
|
|
||||||
#[allow(clippy::missing_panics_doc)]
|
#[allow(clippy::missing_panics_doc)]
|
||||||
/// Replaces non-printable characters in the given byte slice with default printable characters.
|
/// Replaces non-printable characters in the given byte slice with default printable characters.
|
||||||
///
|
///
|
||||||
@ -267,7 +284,6 @@ pub fn replace_non_printable(
|
|||||||
offsets.push(cumulative_offset);
|
offsets.push(cumulative_offset);
|
||||||
if let Some((chr, skip_ahead)) = try_parse_utf8_char(&input[idx..]) {
|
if let Some((chr, skip_ahead)) = try_parse_utf8_char(&input[idx..]) {
|
||||||
idx += skip_ahead;
|
idx += skip_ahead;
|
||||||
|
|
||||||
match chr {
|
match chr {
|
||||||
// tab
|
// tab
|
||||||
TAB_CHARACTER if config.replace_tab => {
|
TAB_CHARACTER if config.replace_tab => {
|
||||||
@ -291,9 +307,30 @@ pub fn replace_non_printable(
|
|||||||
output.push(*NULL_SYMBOL);
|
output.push(*NULL_SYMBOL);
|
||||||
}
|
}
|
||||||
// CJK Unified Ideographs
|
// CJK Unified Ideographs
|
||||||
|
// ex: 解
|
||||||
c if ('\u{4E00}'..='\u{9FFF}').contains(&c) => {
|
c if ('\u{4E00}'..='\u{9FFF}').contains(&c) => {
|
||||||
output.push(c);
|
output.push(c);
|
||||||
}
|
}
|
||||||
|
// Korean: Hangul syllables
|
||||||
|
// ex: 가 or 한
|
||||||
|
c if ('\u{AC00}'..='\u{D7AF}').contains(&c) => {
|
||||||
|
output.push(c);
|
||||||
|
}
|
||||||
|
// some emojis
|
||||||
|
// ex: 😀
|
||||||
|
c if is_emoji(c) => {
|
||||||
|
output.push(c);
|
||||||
|
}
|
||||||
|
// Japanese (contiguous ranges for katakana and hiragana)
|
||||||
|
// ex: katakana -> ア and hiragana -> あ
|
||||||
|
c if ('\u{3040}'..='\u{30FF}').contains(&c) => {
|
||||||
|
output.push(c);
|
||||||
|
}
|
||||||
|
// Thai
|
||||||
|
// ex: ส or ดี
|
||||||
|
c if ('\u{0E00}'..='\u{0E7F}').contains(&c) => output.push(c),
|
||||||
|
// Devanagari (most common Indic script)
|
||||||
|
c if ('\u{0900}'..='\u{097F}').contains(&c) => output.push(c),
|
||||||
// Nerd fonts
|
// Nerd fonts
|
||||||
c if ALL_NF_RANGES.iter().any(|r| r.contains(&c)) => {
|
c if ALL_NF_RANGES.iter().any(|r| r.contains(&c)) => {
|
||||||
output.push(c);
|
output.push(c);
|
||||||
@ -653,6 +690,65 @@ mod tests {
|
|||||||
assert_eq!(offsets, vec![0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1]);
|
assert_eq!(offsets, vec![0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// The CJK sample must come back unchanged, with a zero offset recorded for
/// each of its six characters.
#[test]
fn test_cjk_characters() {
    let text = "你好,世界!";
    let (rendered, shifts) =
        replace_non_printable(text.as_bytes(), &ReplaceNonPrintableConfig::default());
    assert_eq!(rendered, text);
    assert_eq!(shifts, vec![0; 6]);
}
|
||||||
|
|
||||||
|
/// The Thai sample must come back unchanged, with a zero offset recorded for
/// each of its seven characters.
#[test]
fn test_thai_characters() {
    // สวัสดี is 6 characters, followed by `!`
    let text = "สวัสดี!";
    let (rendered, shifts) =
        replace_non_printable(text.as_bytes(), &ReplaceNonPrintableConfig::default());
    assert_eq!(rendered, text);
    assert_eq!(shifts, vec![0; 7]);
}
|
||||||
|
|
||||||
|
/// A string containing an emoji must come back unchanged, with a zero offset
/// recorded for each of its eight characters.
#[test]
fn test_emoji_characters() {
    let text = "Hello 🌍!";
    let (rendered, shifts) =
        replace_non_printable(text.as_bytes(), &ReplaceNonPrintableConfig::default());
    assert_eq!(rendered, text);
    assert_eq!(shifts, vec![0; 8]);
}
|
||||||
|
/// The Devanagari sample must come back unchanged, with a zero offset recorded
/// for each of its six characters.
#[test]
fn test_devanagari_characters() {
    // नमस्ते is 6 characters
    let text = "नमस्ते";
    let (rendered, shifts) =
        replace_non_printable(text.as_bytes(), &ReplaceNonPrintableConfig::default());
    assert_eq!(rendered, text);
    assert_eq!(shifts, vec![0; 6]);
}
|
||||||
|
/// The hiragana sample must come back unchanged, with a zero offset recorded
/// for each of its five characters.
#[test]
fn test_hiragana_characters() {
    let text = "こんにちは";
    let (rendered, shifts) =
        replace_non_printable(text.as_bytes(), &ReplaceNonPrintableConfig::default());
    assert_eq!(rendered, text);
    assert_eq!(shifts, vec![0; 5]);
}
|
||||||
|
|
||||||
|
/// The katakana sample must come back unchanged, with a zero offset recorded
/// for each of its five characters.
#[test]
fn test_katakana_characters() {
    let text = "コンニチハ";
    let (rendered, shifts) =
        replace_non_printable(text.as_bytes(), &ReplaceNonPrintableConfig::default());
    assert_eq!(rendered, text);
    assert_eq!(shifts, vec![0; 5]);
}
|
||||||
|
/// The Hangul sample must come back unchanged, with a zero offset recorded for
/// each of its six characters.
#[test]
fn test_korean_characters() {
    let text = "안녕하세요!";
    let (rendered, shifts) =
        replace_non_printable(text.as_bytes(), &ReplaceNonPrintableConfig::default());
    assert_eq!(rendered, text);
    assert_eq!(shifts, vec![0; 6]);
}
|
||||||
#[test]
|
#[test]
|
||||||
fn test_replace_non_printable_no_range_changes() {
|
fn test_replace_non_printable_no_range_changes() {
|
||||||
let input = b"Hello,\x00World!";
|
let input = b"Hello,\x00World!";
|
||||||
|
Loading…
x
Reference in New Issue
Block a user