mirror of
https://github.com/Arrowar/StreamingCommunity.git
synced 2025-06-06 19:45:24 +00:00
152 lines
4.8 KiB
Python
152 lines
4.8 KiB
Python
# 04.04.24
|
|
|
|
import os
|
|
import logging
|
|
import importlib.util
|
|
|
|
|
|
# Variable
|
|
Cache = {}
|
|
|
|
|
|
class UnidecodeError(ValueError):
|
|
pass
|
|
|
|
|
|
def transliterate_nonascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
|
|
"""Transliterates non-ASCII characters in a string to their ASCII counterparts.
|
|
|
|
Args:
|
|
string (str): The input string containing non-ASCII characters.
|
|
errors (str): Specifies the treatment of errors. Can be 'ignore', 'strict', 'replace', or 'preserve'.
|
|
replace_str (str): The replacement string used when errors='replace'.
|
|
|
|
Returns:
|
|
str: The transliterated string with non-ASCII characters replaced.
|
|
"""
|
|
return _transliterate(string, errors, replace_str)
|
|
|
|
|
|
def _get_ascii_representation(char: str) -> str:
|
|
"""Obtains the ASCII representation of a Unicode character.
|
|
|
|
Args:
|
|
char (str): The Unicode character.
|
|
|
|
Returns:
|
|
str: The ASCII representation of the character.
|
|
"""
|
|
codepoint = ord(char)
|
|
|
|
# If the character is ASCII, return it as is
|
|
if codepoint < 0x80:
|
|
return str(char)
|
|
|
|
# Ignore characters outside the BMP (Basic Multilingual Plane)
|
|
if codepoint > 0xeffff:
|
|
return None
|
|
|
|
# Warn about surrogate characters
|
|
if 0xd800 <= codepoint <= 0xdfff:
|
|
logging.warning("Surrogate character %r will be ignored. "
|
|
"You might be using a narrow Python build.", char)
|
|
|
|
# Calculate section and position
|
|
section = codepoint >> 8
|
|
position = codepoint % 256
|
|
|
|
try:
|
|
# Look up the character in the cache
|
|
table = Cache[section]
|
|
|
|
except KeyError:
|
|
try:
|
|
# Import the module corresponding to the section
|
|
module_name = f"x{section:03x}.py"
|
|
main = os.path.abspath(os.path.dirname(__file__))
|
|
module_path = os.path.join(main, module_name)
|
|
spec = importlib.util.spec_from_file_location(module_name, module_path)
|
|
module = importlib.util.module_from_spec(spec)
|
|
spec.loader.exec_module(module)
|
|
|
|
except ImportError:
|
|
# If module import fails, set cache entry to None and return
|
|
Cache[section] = None
|
|
return None
|
|
|
|
# Update cache with module data
|
|
Cache[section] = table = module.data
|
|
|
|
# Return the ASCII representation if found, otherwise None
|
|
if table and len(table) > position:
|
|
return table[position]
|
|
else:
|
|
return None
|
|
|
|
|
|
def _transliterate(string: str, errors: str, replace_str: str) -> str:
|
|
"""Main transliteration function.
|
|
|
|
Args:
|
|
string (str): The input string.
|
|
errors (str): Specifies the treatment of errors. Can be 'ignore', 'strict', 'replace', or 'preserve'.
|
|
replace_str (str): The replacement string used when errors='replace'.
|
|
|
|
Returns:
|
|
str: The transliterated string.
|
|
"""
|
|
retval = []
|
|
|
|
for char in string:
|
|
# Get the ASCII representation of the character
|
|
ascii_char = _get_ascii_representation(char)
|
|
|
|
if ascii_char is None:
|
|
# Handle errors based on the specified policy
|
|
if errors == 'ignore':
|
|
ascii_char = ''
|
|
elif errors == 'strict':
|
|
logging.error(f'No replacement found for character {char!r}')
|
|
raise UnidecodeError(f'no replacement found for character {char!r}')
|
|
elif errors == 'replace':
|
|
ascii_char = replace_str
|
|
elif errors == 'preserve':
|
|
ascii_char = char
|
|
else:
|
|
logging.error(f'Invalid value for errors parameter {errors!r}')
|
|
raise UnidecodeError(f'invalid value for errors parameter {errors!r}')
|
|
|
|
# Append the ASCII representation to the result
|
|
retval.append(ascii_char)
|
|
|
|
return ''.join(retval)
|
|
|
|
|
|
def transliterate_expect_ascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
|
|
"""Transliterates non-ASCII characters in a string, expecting ASCII input.
|
|
|
|
Args:
|
|
string (str): The input string containing non-ASCII characters.
|
|
errors (str): Specifies the treatment of errors. Can be 'ignore', 'strict', 'replace', or 'preserve'.
|
|
replace_str (str): The replacement string used when errors='replace'.
|
|
|
|
Returns:
|
|
str: The transliterated string with non-ASCII characters replaced.
|
|
"""
|
|
try:
|
|
# Check if the string can be encoded as ASCII
|
|
string.encode('ASCII')
|
|
except UnicodeEncodeError:
|
|
# If encoding fails, fall back to transliteration
|
|
pass
|
|
else:
|
|
# If the string is already ASCII, return it as is
|
|
return string
|
|
|
|
# Otherwise, transliterate non-ASCII characters
|
|
return _transliterate(string, errors, replace_str)
|
|
|
|
|
|
# Out
|
|
transliterate = transliterate_expect_ascii
|