Fix unidecode

This commit is contained in:
Ghost 2024-04-04 15:21:40 +02:00
parent 9d1b7536b7
commit 3a99ca2afe
3 changed files with 154 additions and 149 deletions

View File

@ -5,7 +5,7 @@ from Src.Util.console import console, msg
from Src.Util.config import config_manager from Src.Util.config import config_manager
from Src.Util.table import TVShowManager from Src.Util.table import TVShowManager
from Src.Util.message import start_message from Src.Util.message import start_message
from Src.Lib.Unidecode import unidecode from Src.Lib.Unidecode import transliterate
from Src.Lib.FFmpeg.my_m3u8 import Downloader from Src.Lib.FFmpeg.my_m3u8 import Downloader
from .Class import VideoSource from .Class import VideoSource
@ -110,7 +110,7 @@ def donwload_video(tv_name: str, index_season_selected: int, index_episode_selec
episode_id = video_source.obj_episode_manager.episodes[index_episode_selected - 1].id episode_id = video_source.obj_episode_manager.episodes[index_episode_selected - 1].id
# Define filename and path for the downloaded video # Define filename and path for the downloaded video
mp4_name = f"{index_episode_selected}_{unidecode(video_source.obj_episode_manager.episodes[index_episode_selected - 1].name)}.mp4" mp4_name = f"{index_episode_selected}_{transliterate(video_source.obj_episode_manager.episodes[index_episode_selected - 1].name)}.mp4"
mp4_path = os.path.join(ROOT_PATH, SERIES_FOLDER, tv_name, f"S{index_season_selected}") mp4_path = os.path.join(ROOT_PATH, SERIES_FOLDER, tv_name, f"S{index_season_selected}")
os.makedirs(mp4_path, exist_ok=True) os.makedirs(mp4_path, exist_ok=True)

View File

@ -1,138 +1,143 @@
# -*- coding: utf-8 -*- # 04.04.24
# vi:tabstop=4:expandtab:sw=4
"""Transliterate Unicode text into plain 7-bit ASCII.
Example usage:
>>> from unidecode import unidecode # Import
>>> unidecode("\u5317\u4EB0") import os
"Bei Jing " import logging
import importlib.util
The transliteration uses a straightforward map, and doesn't have alternatives # Variable
for the same character based on language, position, or anything else. Cache = {}
A standard string object will be returned. If you need bytes, use:
>>> unidecode("Κνωσός").encode("ascii")
b'Knosos'
"""
import warnings
from typing import Dict, Optional, Sequence
Cache = {} # type: Dict[int, Optional[Sequence[Optional[str]]]]
class UnidecodeError(ValueError): class UnidecodeError(ValueError):
def __init__(self, message: str, index: Optional[int] = None) -> None: pass
"""Raised for Unidecode-related errors.
The index attribute contains the index of the character that caused def transliterate_nonascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
the error. """Transliterates non-ASCII characters in a string to their ASCII counterparts.
"""
super(UnidecodeError, self).__init__(message)
self.index = index
Args:
string (str): The input string containing non-ASCII characters.
errors (str): Specifies the treatment of errors. Can be 'ignore', 'strict', 'replace', or 'preserve'.
replace_str (str): The replacement string used when errors='replace'.
def unidecode_expect_ascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str: Returns:
"""Transliterate an Unicode object into an ASCII string str: The transliterated string with non-ASCII characters replaced.
>>> unidecode("\u5317\u4EB0")
"Bei Jing "
This function first tries to convert the string using ASCII codec.
If it fails (because of non-ASCII characters), it falls back to
transliteration using the character tables.
This is approx. five times faster if the string only contains ASCII
characters, but slightly slower than unicode_expect_nonascii if
non-ASCII characters are present.
errors specifies what to do with characters that have not been
found in replacement tables. The default is 'ignore' which ignores
the character. 'strict' raises an UnidecodeError. 'replace'
substitutes the character with replace_str (default is '?').
'preserve' keeps the original character.
Note that if 'preserve' is used the returned string might not be
ASCII!
""" """
return _transliterate(string, errors, replace_str)
try: def _get_ascii_representation(char: str) -> str:
bytestring = string.encode('ASCII') """Obtains the ASCII representation of a Unicode character.
except UnicodeEncodeError:
pass
else:
return string
return _unidecode(string, errors, replace_str) Args:
char (str): The Unicode character.
def unidecode_expect_nonascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str: Returns:
"""Transliterate an Unicode object into an ASCII string str: The ASCII representation of the character.
>>> unidecode("\u5317\u4EB0")
"Bei Jing "
See unidecode_expect_ascii.
""" """
return _unidecode(string, errors, replace_str)
unidecode = unidecode_expect_ascii
def _get_repl_str(char: str) -> Optional[str]:
codepoint = ord(char) codepoint = ord(char)
# If the character is ASCII, return it as is
if codepoint < 0x80: if codepoint < 0x80:
# Already ASCII
return str(char) return str(char)
# No data for code points above U+EFFFF (the supplementary private use planes and beyond)
if codepoint > 0xeffff: if codepoint > 0xeffff:
# No data on characters in Private Use Area and above.
return None return None
# Warn about surrogate characters
if 0xd800 <= codepoint <= 0xdfff: if 0xd800 <= codepoint <= 0xdfff:
warnings.warn( "Surrogate character %r will be ignored. " logging.warning("Surrogate character %r will be ignored. "
"You might be using a narrow Python build." % (char,), "You might be using a narrow Python build.", char)
RuntimeWarning, 2)
section = codepoint >> 8 # Chop off the last two hex digits # Calculate section and position
position = codepoint % 256 # Last two hex digits section = codepoint >> 8
position = codepoint % 256
try: try:
# Look up the character in the cache
table = Cache[section] table = Cache[section]
except KeyError: except KeyError:
try: try:
mod = __import__('unidecode.x%03x'%(section), globals(), locals(), ['data']) # Import the module corresponding to the section
module_name = f"x{section:03x}.py"
main = os.path.abspath(os.path.dirname(__file__))
module_path = os.path.join(main, module_name)
spec = importlib.util.spec_from_file_location(module_name, module_path)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
except ImportError: except ImportError:
# No data on this character # If module import fails, set cache entry to None and return
Cache[section] = None Cache[section] = None
return None return None
Cache[section] = table = mod.data # Update cache with module data
Cache[section] = table = module.data
# Return the ASCII representation if found, otherwise None
if table and len(table) > position: if table and len(table) > position:
return table[position] return table[position]
else: else:
return None return None
def _unidecode(string: str, errors: str, replace_str:str) -> str: def _transliterate(string: str, errors: str, replace_str: str) -> str:
"""Main transliteration function.
Args:
string (str): The input string.
errors (str): Specifies the treatment of errors. Can be 'ignore', 'strict', 'replace', or 'preserve'.
replace_str (str): The replacement string used when errors='replace'.
Returns:
str: The transliterated string.
"""
retval = [] retval = []
for index, char in enumerate(string): for char in string:
repl = _get_repl_str(char) # Get the ASCII representation of the character
ascii_char = _get_ascii_representation(char)
if repl is None: if ascii_char is None:
# Handle errors based on the specified policy
if errors == 'ignore': if errors == 'ignore':
repl = '' ascii_char = ''
elif errors == 'strict': elif errors == 'strict':
raise UnidecodeError('no replacement found for character %r ' logging.error(f'No replacement found for character {char!r}')
'in position %d' % (char, index), index) raise UnidecodeError(f'no replacement found for character {char!r}')
elif errors == 'replace': elif errors == 'replace':
repl = replace_str ascii_char = replace_str
elif errors == 'preserve': elif errors == 'preserve':
repl = char ascii_char = char
else: else:
raise UnidecodeError('invalid value for errors parameter %r' % (errors,)) logging.error(f'Invalid value for errors parameter {errors!r}')
raise UnidecodeError(f'invalid value for errors parameter {errors!r}')
retval.append(repl) # Append the ASCII representation to the result
retval.append(ascii_char)
return ''.join(retval) return ''.join(retval)
def transliterate_expect_ascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
"""Transliterates non-ASCII characters in a string, expecting ASCII input.
Args:
string (str): The input string containing non-ASCII characters.
errors (str): Specifies the treatment of errors. Can be 'ignore', 'strict', 'replace', or 'preserve'.
replace_str (str): The replacement string used when errors='replace'.
Returns:
str: The transliterated string with non-ASCII characters replaced.
"""
try:
# Check if the string can be encoded as ASCII
string.encode('ASCII')
except UnicodeEncodeError:
# If encoding fails, fall back to transliteration
pass
else:
# If the string is already ASCII, return it as is
return string
# Otherwise, transliterate non-ASCII characters
return _transliterate(string, errors, replace_str)
transliterate = transliterate_expect_ascii

View File

@ -42,95 +42,95 @@ data = (
'', # 0x9d '', # 0x9d
'', # 0x9e '', # 0x9e
'', # 0x9f '', # 0x9f
' ', # 0xa0 ' ', # 0xa0
'!', # 0xa1 '!', # 0xa1
'C/', # 0xa2 'C/', # 0xa2
# Not "GBP" - Pound Sign is used for more than just British Pounds. # Not "GBP" - Pound Sign is used for more than just British Pounds.
'PS', # 0xa3 'PS', # 0xa3
'$?', # 0xa4 '$?', # 0xa4
'Y=', # 0xa5 'Y=', # 0xa5
'|', # 0xa6 '|', # 0xa6
'SS', # 0xa7 'SS', # 0xa7
'"', # 0xa8 '"', # 0xa8
'(c)', # 0xa9 '(c)', # 0xa9
'a', # 0xaa 'a', # 0xaa
'<<', # 0xab '<<', # 0xab
'!', # 0xac '!', # 0xac
'', # 0xad '', # 0xad
'(r)', # 0xae '(r)', # 0xae
'-', # 0xaf '-', # 0xaf
'deg', # 0xb0 'deg', # 0xb0
'+-', # 0xb1 '+-', # 0xb1
# These might be combined with other superscript digits (u+2070 - u+2079) # These might be combined with other superscript digits (u+2070 - u+2079)
'2', # 0xb2 '2', # 0xb2
'3', # 0xb3 '3', # 0xb3
'\'', # 0xb4 '\'', # 0xb4
'u', # 0xb5 'u', # 0xb5
'P', # 0xb6 'P', # 0xb6
'*', # 0xb7 '*', # 0xb7
',', # 0xb8 ',', # 0xb8
'1', # 0xb9 '1', # 0xb9
'o', # 0xba 'o', # 0xba
'>>', # 0xbb '>>', # 0xbb
' 1/4', # 0xbc ' 1/4', # 0xbc
' 1/2', # 0xbd ' 1/2', # 0xbd
' 3/4', # 0xbe ' 3/4', # 0xbe
'?', # 0xbf '?', # 0xbf
'A', # 0xc0 'A', # 0xc0
'A', # 0xc1 'A', # 0xc1
'A', # 0xc2 'A', # 0xc2
'A', # 0xc3 'A', # 0xc3
# Not "AE" - used in languages other than German # Not "AE" - used in languages other than German
'A', # 0xc4 'A', # 0xc4
'A', # 0xc5 'A', # 0xc5
'AE', # 0xc6 'AE', # 0xc6
'C', # 0xc7 'C', # 0xc7
'E', # 0xc8 'E', # 0xc8
'E', # 0xc9 'E', # 0xc9
'E', # 0xca 'E', # 0xca
'E', # 0xcb 'E', # 0xcb
'I', # 0xcc 'I', # 0xcc
'I', # 0xcd 'I', # 0xcd
'I', # 0xce 'I', # 0xce
'I', # 0xcf 'I', # 0xcf
'D', # 0xd0 'D', # 0xd0
'N', # 0xd1 'N', # 0xd1
'O', # 0xd2 'O', # 0xd2
'O', # 0xd3 'O', # 0xd3
'O', # 0xd4 'O', # 0xd4
'O', # 0xd5 'O', # 0xd5
# Not "OE" - used in languages other than German # Not "OE" - used in languages other than German
'O', # 0xd6 'O', # 0xd6
'x', # 0xd7 'x', # 0xd7
'O', # 0xd8 'O', # 0xd8
'U', # 0xd9 'U', # 0xd9
'U', # 0xda 'U', # 0xda
'U', # 0xdb 'U', # 0xdb
# Not "UE" - used in languages other than German # Not "UE" - used in languages other than German
'U', # 0xdc 'U', # 0xdc
'Y', # 0xdd 'Y', # 0xdd
'Th', # 0xde 'Th', # 0xde
'ss', # 0xdf 'ss', # 0xdf
'a', # 0xe0 'a', # 0xe0
'a', # 0xe1 'a', # 0xe1
'a', # 0xe2 'a', # 0xe2
'a', # 0xe3 'a', # 0xe3
# Not "ae" - used in languages other than German # Not "ae" - used in languages other than German
'a', # 0xe4 'a', # 0xe4
'a', # 0xe5 'a', # 0xe5
'ae', # 0xe6 'ae', # 0xe6
'c', # 0xe7 'c', # 0xe7
'e', # 0xe8 'e', # 0xe8
'e', # 0xe9 'e', # 0xe9
@ -160,6 +160,6 @@ data = (
'u', # 0xfc 'u', # 0xfc
'y', # 0xfd 'y', # 0xfd
'th', # 0xfe 'th', # 0xfe
'y', # 0xff 'y', # 0xff
) )